[kernel] r11867 - in dists/trunk/linux-2.6/debian: . config patches/features/all/openvz patches/series

Maximilian Attems maks at alioth.debian.org
Mon Jul 21 14:07:54 UTC 2008


Author: maks
Date: Mon Jul 21 14:07:51 2008
New Revision: 11867

Log:
add openvz patch

enabled for amd64.

TODO:
- i386, ia64
- userspace depends
- proper desc

Added:
   dists/trunk/linux-2.6/debian/patches/features/all/openvz/
   dists/trunk/linux-2.6/debian/patches/features/all/openvz/openvz.patch
Modified:
   dists/trunk/linux-2.6/debian/changelog
   dists/trunk/linux-2.6/debian/config/defines
   dists/trunk/linux-2.6/debian/copyright
   dists/trunk/linux-2.6/debian/patches/series/1~experimental.1-extra

Modified: dists/trunk/linux-2.6/debian/changelog
==============================================================================
--- dists/trunk/linux-2.6/debian/changelog	(original)
+++ dists/trunk/linux-2.6/debian/changelog	Mon Jul 21 14:07:51 2008
@@ -70,6 +70,7 @@
   * Enable BLK_DEV_BSG for SG v4 support.
   * [amd64] Enable default disabled memtest boot param.
   * topconfig: Enable PATA_SIS instead of SATA_SIS. (closes: #485609)
+  * Add OpenVZ container flavour for amd64. (closes: #392015)
 
   [ Martin Michlmayr ]
   * [arm/orion5x] Update the config to reflect upstream renaming this

Modified: dists/trunk/linux-2.6/debian/config/defines
==============================================================================
--- dists/trunk/linux-2.6/debian/config/defines	(original)
+++ dists/trunk/linux-2.6/debian/config/defines	Mon Jul 21 14:07:51 2008
@@ -25,7 +25,7 @@
  xen-vserver
 
 [featureset-openvz_base]
-enabled: false
+enabled: true
 
 [featureset-vserver_base]
 enabled: false

Modified: dists/trunk/linux-2.6/debian/copyright
==============================================================================
--- dists/trunk/linux-2.6/debian/copyright	(original)
+++ dists/trunk/linux-2.6/debian/copyright	Mon Jul 21 14:07:51 2008
@@ -62,3 +62,15 @@
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
    IN THE SOFTWARE.
+
+The openvz patch was obtained from:
+ git://git.openvz.org/pub/linux-2.6.26-openvz
+
+OpenVZ is distributed under the GPL, version 2 or later, with the following notice:
+
+Nothing in this license should be construed as a grant by SWsoft of any rights
+beyond the rights specified in the GNU General Public License, and nothing in
+this license should be construed as a waiver by SWsoft of its patent, copyright
+and/or trademark rights, beyond the waiver required by the GNU General Public
+License. This license is expressly inapplicable to any product that is not
+within the scope of the GNU General Public License

Added: dists/trunk/linux-2.6/debian/patches/features/all/openvz/openvz.patch
==============================================================================
--- (empty file)
+++ dists/trunk/linux-2.6/debian/patches/features/all/openvz/openvz.patch	Mon Jul 21 14:07:51 2008
@@ -0,0 +1,82828 @@
+commit dbe7093bdda52ce46fb80f013e6937dddb03980c
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 18:49:45 2008 +0400
+
+    conntracks: register nf_conntrack_expect in each namespace
+    
+    Otherwise - warning:
+    
+    proc_dir_entry 'nf_conntrack_expect' already registered
+    Pid: 3955, comm: vzctl Not tainted 2.6.26 #130
+    
+    Call Trace:
+     [<ffffffff803637ac>] ? idr_get_new+0x13/0x33
+     [<ffffffff802d09ca>] proc_register+0x11b/0x151
+     [<ffffffff802d0a86>] proc_create_data+0x86/0x9f
+     [<ffffffff802d48fd>] proc_net_fops_create+0x18/0x1a
+     [<ffffffffa003284d>] :nf_conntrack:nf_conntrack_expect_init+0x11e/0x17c
+     [<ffffffffa00301c5>] :nf_conntrack:nf_conntrack_init+0x26d/0x31c
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
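For context, 2.6.26 already provides a per-namespace proc API, and the fix
follows that pattern. A minimal sketch, not taken from the patch
(exp_file_ops is an assumed, pre-existing file_operations; the needed
includes are <linux/proc_fs.h> and <net/net_namespace.h>):

	static int __net_init exp_proc_init(struct net *net)
	{
		/* create the entry in this namespace's own /proc/net */
		if (!proc_net_fops_create(net, "nf_conntrack_expect",
					  0440, &exp_file_ops))
			return -ENOMEM;
		return 0;
	}

	static void __net_exit exp_proc_exit(struct net *net)
	{
		proc_net_remove(net, "nf_conntrack_expect");
	}

	static struct pernet_operations exp_net_ops = {
		.init = exp_proc_init,
		.exit = exp_proc_exit,
	};

	/* registered once at init: register_pernet_subsys(&exp_net_ops); */
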
+commit 9429b536e6ed239b7c1752ba611482c7ed49205f
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:24:29 2008 +0400
+
+    Adjust Kconfig options to fix arbitrary compilation errors
+    
+    xemul: Some options are incompatible or are implemented in another way in the
+    OpenVZ model, so we disable them.
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
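The masking itself is plain Kconfig; a hypothetical sketch of the pattern
(SOME_FEATURE is an invented name; VE is the container option this patch
introduces via kernel/Kconfig.openvz):

	config SOME_FEATURE
		bool "Some feature the OpenVZ model implements differently"
		depends on !VE
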
+commit 4b58555e7bf62369b6cc065343cba1df8b085c2d
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:23:05 2008 +0400
+
+    conntracks: make nf_conntrack_proto_icmp compile and work
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 3fc2b5e403891efbb88887372e6764a4ee21c2c3
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:22:13 2008 +0400
+
+    conntracks: make nf_conntrack_l3proto_ipv4_compat compile and work
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 3c11bdc8d0005a659a72fc0805093a7c0a83632b
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:21:28 2008 +0400
+
+    conntracks: make nf_conntrack_l3proto_ipv4 part compile and work
+    
+    Main problem - poor sysctl table registration.
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
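The registration issue is the same namespace problem as with proc above:
sysctl tables must be attached to a struct net instead of registered once
globally. A rough sketch with the in-tree per-net API (ipv4_ct_path and
ipv4_ct_table are assumed to be already defined; the per-net slot holding
the header is a hypothetical field):

	static int __net_init ct_sysctl_init(struct net *net)
	{
		struct ctl_table_header *hdr;

		/* every namespace gets its own table instance */
		hdr = register_net_sysctl_table(net, ipv4_ct_path,
						ipv4_ct_table);
		if (hdr == NULL)
			return -ENOMEM;
		net->ct_sysctl_header = hdr;	/* hypothetical slot */
		return 0;
	}
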
+commit 2655d79febc106ac47c9bf925e23d99dadb52072
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:20:31 2008 +0400
+
+    netfilter: nf_conntrack_proto_fini() must be called before generic_sysctl_cleanup
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 2ce6bb08051038cb5e11d9523b5e1b67df03022f
+Author: Vitaliy Gusev <vgusev at openvz.org>
+Date:   Thu Jul 17 16:19:58 2008 +0400
+
+    netfilter: fix forgotten call nf_conntrack_proto_init().
+    
+    Signed-off-by: Vitaliy Gusev <vgusev at openvz.org>
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit 1f7c5382a62f5518c7f26bd83fbf68131dd30aee
+Author: Pavel Emelyanov <xemul at openvz.org>
+Date:   Wed Jul 16 17:31:55 2008 +0400
+
+    Fix __d_path codeflow wrt vfsmnt_lock locking
+    
+    Signed-off-by: Pavel Emelyanov <xemul at openvz.org>
+
+commit aafb4cebb852b897a594675389328c829e7680ea
+Author: OpenVZ team <devel at openvz.org>
+Date:   Wed Jul 16 17:20:32 2008 +0400
+
+    Linux 2.6.26-ovz
+    
+    Netfilter in container now works! Checkpointing now compiles!
+    That was the good news. The bad news is that conntracks are still
+    broken, and checkpointing can *only* be compiled so far.
+    
+    Will fix it all later :)
+diff --git a/COPYING.SWsoft b/COPYING.SWsoft
+new file mode 100644
+index 0000000..059256d
+--- /dev/null
++++ b/COPYING.SWsoft
+@@ -0,0 +1,350 @@
++
++Nothing in this license should be construed as a grant by SWsoft of any rights
++beyond the rights specified in the GNU General Public License, and nothing in
++this license should be construed as a waiver by SWsoft of its patent, copyright
++and/or trademark rights, beyond the waiver required by the GNU General Public
++License. This license is expressly inapplicable to any product that is not
++within the scope of the GNU General Public License
++
++----------------------------------------
++
++		    GNU GENERAL PUBLIC LICENSE
++		       Version 2, June 1991
++
++ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
++                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ Everyone is permitted to copy and distribute verbatim copies
++ of this license document, but changing it is not allowed.
++
++			    Preamble
++
++  The licenses for most software are designed to take away your
++freedom to share and change it.  By contrast, the GNU General Public
++License is intended to guarantee your freedom to share and change free
++software--to make sure the software is free for all its users.  This
++General Public License applies to most of the Free Software
++Foundation's software and to any other program whose authors commit to
++using it.  (Some other Free Software Foundation software is covered by
++the GNU Library General Public License instead.)  You can apply it to
++your programs, too.
++
++  When we speak of free software, we are referring to freedom, not
++price.  Our General Public Licenses are designed to make sure that you
++have the freedom to distribute copies of free software (and charge for
++this service if you wish), that you receive source code or can get it
++if you want it, that you can change the software or use pieces of it
++in new free programs; and that you know you can do these things.
++
++  To protect your rights, we need to make restrictions that forbid
++anyone to deny you these rights or to ask you to surrender the rights.
++These restrictions translate to certain responsibilities for you if you
++distribute copies of the software, or if you modify it.
++
++  For example, if you distribute copies of such a program, whether
++gratis or for a fee, you must give the recipients all the rights that
++you have.  You must make sure that they, too, receive or can get the
++source code.  And you must show them these terms so they know their
++rights.
++
++  We protect your rights with two steps: (1) copyright the software, and
++(2) offer you this license which gives you legal permission to copy,
++distribute and/or modify the software.
++
++  Also, for each author's protection and ours, we want to make certain
++that everyone understands that there is no warranty for this free
++software.  If the software is modified by someone else and passed on, we
++want its recipients to know that what they have is not the original, so
++that any problems introduced by others will not reflect on the original
++authors' reputations.
++
++  Finally, any free program is threatened constantly by software
++patents.  We wish to avoid the danger that redistributors of a free
++program will individually obtain patent licenses, in effect making the
++program proprietary.  To prevent this, we have made it clear that any
++patent must be licensed for everyone's free use or not licensed at all.
++
++  The precise terms and conditions for copying, distribution and
++modification follow.
++
++		    GNU GENERAL PUBLIC LICENSE
++   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
++
++  0. This License applies to any program or other work which contains
++a notice placed by the copyright holder saying it may be distributed
++under the terms of this General Public License.  The "Program", below,
++refers to any such program or work, and a "work based on the Program"
++means either the Program or any derivative work under copyright law:
++that is to say, a work containing the Program or a portion of it,
++either verbatim or with modifications and/or translated into another
++language.  (Hereinafter, translation is included without limitation in
++the term "modification".)  Each licensee is addressed as "you".
++
++Activities other than copying, distribution and modification are not
++covered by this License; they are outside its scope.  The act of
++running the Program is not restricted, and the output from the Program
++is covered only if its contents constitute a work based on the
++Program (independent of having been made by running the Program).
++Whether that is true depends on what the Program does.
++
++  1. You may copy and distribute verbatim copies of the Program's
++source code as you receive it, in any medium, provided that you
++conspicuously and appropriately publish on each copy an appropriate
++copyright notice and disclaimer of warranty; keep intact all the
++notices that refer to this License and to the absence of any warranty;
++and give any other recipients of the Program a copy of this License
++along with the Program.
++
++You may charge a fee for the physical act of transferring a copy, and
++you may at your option offer warranty protection in exchange for a fee.
++
++  2. You may modify your copy or copies of the Program or any portion
++of it, thus forming a work based on the Program, and copy and
++distribute such modifications or work under the terms of Section 1
++above, provided that you also meet all of these conditions:
++
++    a) You must cause the modified files to carry prominent notices
++    stating that you changed the files and the date of any change.
++
++    b) You must cause any work that you distribute or publish, that in
++    whole or in part contains or is derived from the Program or any
++    part thereof, to be licensed as a whole at no charge to all third
++    parties under the terms of this License.
++
++    c) If the modified program normally reads commands interactively
++    when run, you must cause it, when started running for such
++    interactive use in the most ordinary way, to print or display an
++    announcement including an appropriate copyright notice and a
++    notice that there is no warranty (or else, saying that you provide
++    a warranty) and that users may redistribute the program under
++    these conditions, and telling the user how to view a copy of this
++    License.  (Exception: if the Program itself is interactive but
++    does not normally print such an announcement, your work based on
++    the Program is not required to print an announcement.)
++
++These requirements apply to the modified work as a whole.  If
++identifiable sections of that work are not derived from the Program,
++and can be reasonably considered independent and separate works in
++themselves, then this License, and its terms, do not apply to those
++sections when you distribute them as separate works.  But when you
++distribute the same sections as part of a whole which is a work based
++on the Program, the distribution of the whole must be on the terms of
++this License, whose permissions for other licensees extend to the
++entire whole, and thus to each and every part regardless of who wrote it.
++
++Thus, it is not the intent of this section to claim rights or contest
++your rights to work written entirely by you; rather, the intent is to
++exercise the right to control the distribution of derivative or
++collective works based on the Program.
++
++In addition, mere aggregation of another work not based on the Program
++with the Program (or with a work based on the Program) on a volume of
++a storage or distribution medium does not bring the other work under
++the scope of this License.
++
++  3. You may copy and distribute the Program (or a work based on it,
++under Section 2) in object code or executable form under the terms of
++Sections 1 and 2 above provided that you also do one of the following:
++
++    a) Accompany it with the complete corresponding machine-readable
++    source code, which must be distributed under the terms of Sections
++    1 and 2 above on a medium customarily used for software interchange; or,
++
++    b) Accompany it with a written offer, valid for at least three
++    years, to give any third party, for a charge no more than your
++    cost of physically performing source distribution, a complete
++    machine-readable copy of the corresponding source code, to be
++    distributed under the terms of Sections 1 and 2 above on a medium
++    customarily used for software interchange; or,
++
++    c) Accompany it with the information you received as to the offer
++    to distribute corresponding source code.  (This alternative is
++    allowed only for noncommercial distribution and only if you
++    received the program in object code or executable form with such
++    an offer, in accord with Subsection b above.)
++
++The source code for a work means the preferred form of the work for
++making modifications to it.  For an executable work, complete source
++code means all the source code for all modules it contains, plus any
++associated interface definition files, plus the scripts used to
++control compilation and installation of the executable.  However, as a
++special exception, the source code distributed need not include
++anything that is normally distributed (in either source or binary
++form) with the major components (compiler, kernel, and so on) of the
++operating system on which the executable runs, unless that component
++itself accompanies the executable.
++
++If distribution of executable or object code is made by offering
++access to copy from a designated place, then offering equivalent
++access to copy the source code from the same place counts as
++distribution of the source code, even though third parties are not
++compelled to copy the source along with the object code.
++
++  4. You may not copy, modify, sublicense, or distribute the Program
++except as expressly provided under this License.  Any attempt
++otherwise to copy, modify, sublicense or distribute the Program is
++void, and will automatically terminate your rights under this License.
++However, parties who have received copies, or rights, from you under
++this License will not have their licenses terminated so long as such
++parties remain in full compliance.
++
++  5. You are not required to accept this License, since you have not
++signed it.  However, nothing else grants you permission to modify or
++distribute the Program or its derivative works.  These actions are
++prohibited by law if you do not accept this License.  Therefore, by
++modifying or distributing the Program (or any work based on the
++Program), you indicate your acceptance of this License to do so, and
++all its terms and conditions for copying, distributing or modifying
++the Program or works based on it.
++
++  6. Each time you redistribute the Program (or any work based on the
++Program), the recipient automatically receives a license from the
++original licensor to copy, distribute or modify the Program subject to
++these terms and conditions.  You may not impose any further
++restrictions on the recipients' exercise of the rights granted herein.
++You are not responsible for enforcing compliance by third parties to
++this License.
++
++  7. If, as a consequence of a court judgment or allegation of patent
++infringement or for any other reason (not limited to patent issues),
++conditions are imposed on you (whether by court order, agreement or
++otherwise) that contradict the conditions of this License, they do not
++excuse you from the conditions of this License.  If you cannot
++distribute so as to satisfy simultaneously your obligations under this
++License and any other pertinent obligations, then as a consequence you
++may not distribute the Program at all.  For example, if a patent
++license would not permit royalty-free redistribution of the Program by
++all those who receive copies directly or indirectly through you, then
++the only way you could satisfy both it and this License would be to
++refrain entirely from distribution of the Program.
++
++If any portion of this section is held invalid or unenforceable under
++any particular circumstance, the balance of the section is intended to
++apply and the section as a whole is intended to apply in other
++circumstances.
++
++It is not the purpose of this section to induce you to infringe any
++patents or other property right claims or to contest validity of any
++such claims; this section has the sole purpose of protecting the
++integrity of the free software distribution system, which is
++implemented by public license practices.  Many people have made
++generous contributions to the wide range of software distributed
++through that system in reliance on consistent application of that
++system; it is up to the author/donor to decide if he or she is willing
++to distribute software through any other system and a licensee cannot
++impose that choice.
++
++This section is intended to make thoroughly clear what is believed to
++be a consequence of the rest of this License.
++
++  8. If the distribution and/or use of the Program is restricted in
++certain countries either by patents or by copyrighted interfaces, the
++original copyright holder who places the Program under this License
++may add an explicit geographical distribution limitation excluding
++those countries, so that distribution is permitted only in or among
++countries not thus excluded.  In such case, this License incorporates
++the limitation as if written in the body of this License.
++
++  9. The Free Software Foundation may publish revised and/or new versions
++of the General Public License from time to time.  Such new versions will
++be similar in spirit to the present version, but may differ in detail to
++address new problems or concerns.
++
++Each version is given a distinguishing version number.  If the Program
++specifies a version number of this License which applies to it and "any
++later version", you have the option of following the terms and conditions
++either of that version or of any later version published by the Free
++Software Foundation.  If the Program does not specify a version number of
++this License, you may choose any version ever published by the Free Software
++Foundation.
++
++  10. If you wish to incorporate parts of the Program into other free
++programs whose distribution conditions are different, write to the author
++to ask for permission.  For software which is copyrighted by the Free
++Software Foundation, write to the Free Software Foundation; we sometimes
++make exceptions for this.  Our decision will be guided by the two goals
++of preserving the free status of all derivatives of our free software and
++of promoting the sharing and reuse of software generally.
++
++			    NO WARRANTY
++
++  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
++FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
++OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
++PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
++OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
++TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
++PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
++REPAIR OR CORRECTION.
++
++  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
++WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
++REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
++INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
++OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
++TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
++YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
++PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
++POSSIBILITY OF SUCH DAMAGES.
++
++		     END OF TERMS AND CONDITIONS
++
++	    How to Apply These Terms to Your New Programs
++
++  If you develop a new program, and you want it to be of the greatest
++possible use to the public, the best way to achieve this is to make it
++free software which everyone can redistribute and change under these terms.
++
++  To do so, attach the following notices to the program.  It is safest
++to attach them to the start of each source file to most effectively
++convey the exclusion of warranty; and each file should have at least
++the "copyright" line and a pointer to where the full notice is found.
++
++    <one line to give the program's name and a brief idea of what it does.>
++    Copyright (C) <year>  <name of author>
++
++    This program is free software; you can redistribute it and/or modify
++    it under the terms of the GNU General Public License as published by
++    the Free Software Foundation; either version 2 of the License, or
++    (at your option) any later version.
++
++    This program is distributed in the hope that it will be useful,
++    but WITHOUT ANY WARRANTY; without even the implied warranty of
++    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++    GNU General Public License for more details.
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++
++
++Also add information on how to contact you by electronic and paper mail.
++
++If the program is interactive, make it output a short notice like this
++when it starts in an interactive mode:
++
++    Gnomovision version 69, Copyright (C) year name of author
++    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
++    This is free software, and you are welcome to redistribute it
++    under certain conditions; type `show c' for details.
++
++The hypothetical commands `show w' and `show c' should show the appropriate
++parts of the General Public License.  Of course, the commands you use may
++be called something other than `show w' and `show c'; they could even be
++mouse-clicks or menu items--whatever suits your program.
++
++You should also get your employer (if you work as a programmer) or your
++school, if any, to sign a "copyright disclaimer" for the program, if
++necessary.  Here is a sample; alter the names:
++
++  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
++  `Gnomovision' (which makes passes at compilers) written by James Hacker.
++
++  <signature of Ty Coon>, 1 April 1989
++  Ty Coon, President of Vice
++
++This General Public License does not permit incorporating your program into
++proprietary programs.  If your program is a subroutine library, you may
++consider it more useful to permit linking proprietary applications with the
++library.  If this is what you want to do, use the GNU Library General
++Public License instead of this License.
+diff --git a/Makefile b/Makefile
+index e3c5eb6..62dc374 100644
+--- a/Makefile
++++ b/Makefile
+@@ -2,6 +2,7 @@ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 26
+ EXTRAVERSION =
++VZVERSION = 036test001
+ NAME = Rotary Wombat
+ 
+ # *DOCUMENTATION*
+@@ -339,7 +340,7 @@ KBUILD_AFLAGS   := -D__ASSEMBLY__
+ KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
+ KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
+ 
+-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
++export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION
+ export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
+ export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE
+ export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
+@@ -973,7 +974,8 @@ define filechk_utsrelease.h
+ 	  echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2;    \
+ 	  exit 1;                                                         \
+ 	fi;                                                               \
+-	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
++	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; 		  \
++		echo \#define VZVERSION \"$(VZVERSION)\";)
+ endef
+ 
+ define filechk_version.h
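
With the defaults above, the generated include/linux/utsrelease.h then
carries both defines, roughly:

	#define UTS_RELEASE "2.6.26"
	#define VZVERSION "036test001"
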
+diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
+index eefae1d..0c3a894 100644
+--- a/arch/arm/kernel/smp.c
++++ b/arch/arm/kernel/smp.c
+@@ -201,7 +201,7 @@ int __cpuexit __cpu_disable(void)
+ 	local_flush_tlb_all();
+ 
+ 	read_lock(&tasklist_lock);
+-	for_each_process(p) {
++	for_each_process_all(p) {
+ 		if (p->mm)
+ 			cpu_clear(cpu, p->mm->cpu_vm_mask);
+ 	}
+diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
+index 16be414..826b220 100644
+--- a/arch/ia64/Kconfig
++++ b/arch/ia64/Kconfig
+@@ -611,6 +611,7 @@ source "arch/ia64/kvm/Kconfig"
+ 
+ source "lib/Kconfig"
+ 
++source "kernel/bc/Kconfig"
+ #
+ # Use the generic interrupt handling code in kernel/irq/:
+ #
+@@ -638,6 +639,8 @@ source "arch/ia64/hp/sim/Kconfig"
+ 
+ source "arch/ia64/Kconfig.debug"
+ 
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+ 
+ source "crypto/Kconfig"
+diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
+index 4f0c30c..067cb28 100644
+--- a/arch/ia64/ia32/binfmt_elf32.c
++++ b/arch/ia64/ia32/binfmt_elf32.c
+@@ -17,6 +17,8 @@
+ #include <asm/param.h>
+ #include <asm/signal.h>
+ 
++#include <bc/vmpages.h>
++
+ #include "ia32priv.h"
+ #include "elfcore32.h"
+ 
+@@ -132,6 +134,12 @@ ia64_elf32_init (struct pt_regs *regs)
+ 		up_write(&current->mm->mmap_sem);
+ 	}
+ 
++	if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES *
++					IA32_LDT_ENTRY_SIZE),
++				VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE,
++				NULL, UB_SOFT))
++		goto skip;
++
+ 	/*
+ 	 * Install LDT as anonymous memory.  This gives us all-zero segment descriptors
+ 	 * until a task modifies them via modify_ldt().
+@@ -152,7 +160,12 @@ ia64_elf32_init (struct pt_regs *regs)
+ 			}
+ 		}
+ 		up_write(&current->mm->mmap_sem);
+-	}
++	} else
++		ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES *
++					IA32_LDT_ENTRY_SIZE),
++				VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL);
++
++skip:
+ 
+ 	ia64_psr(regs)->ac = 0;		/* turn off alignment checking */
+ 	regs->loadrs = 0;
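
This hunk shows the beancounter discipline the patch applies throughout
(it recurs in arch/ia64/mm/init.c below): charge before creating a mapping,
skip the optional mapping when over limit, and uncharge on the failure
path. Schematically (do_the_mapping() is a hypothetical stand-in for the
real work, not patch code):

	extern int do_the_mapping(void);	/* hypothetical helper */

	static void map_optional_area(unsigned long size, unsigned long vm_flags)
	{
		if (ub_memory_charge(current->mm, size, vm_flags, NULL, UB_SOFT))
			return;			/* over the soft limit: skip */
		if (do_the_mapping() != 0)	/* failed: give the charge back */
			ub_memory_uncharge(current->mm, size, vm_flags, NULL);
	}
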
+diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
+index ca2bb95..e2fa9a0 100644
+--- a/arch/ia64/kernel/entry.S
++++ b/arch/ia64/kernel/entry.S
+@@ -504,6 +504,74 @@ GLOBAL_ENTRY(clone)
+ 	br.ret.sptk.many rp
+ END(clone)
+ 
++GLOBAL_ENTRY(ia64_ret_from_resume)
++	PT_REGS_UNWIND_INFO(0)
++{	/*
++	 * Some versions of gas generate bad unwind info if the first instruction of a
++	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
++	 */
++	nop.m 0
++	nop.i 0
++	/*
++	 * We need to call schedule_tail() to complete the scheduling process.
++	 * Called by ia64_switch_to() after do_fork()->copy_thread().  r8 contains the
++	 * address of the previously executing task.
++	 */
++	br.call.sptk.many rp=ia64_invoke_schedule_tail
++}
++	br.call.sptk.many rp=ia64_invoke_resume
++	;;
++	adds sp=256,sp
++	;;
++	/* Return from interrupt, we are all right. */
++(pNonSys) br ia64_leave_kernel
++	;;
++	/* Tricky part follows. We must restore correct syscall
++	 * register frame before doing normal syscall exit job.
++	 * It would the most natural to keep sw->ar_pfs correct,
++	 * It would be the most natural to keep sw->ar_pfs correct,
++	 * Unfortunately, IA64 has a feature. Registers were in backstore
++	 * after context switch, and the first br.ret does _NOT_ fetch
++	 * output registers.
++	 * It is quite natural:	look, if caller has output regs in his
++	 * frame, they should be consumed. If callee does not have (enough of)
++	 * input/local registers (1 in this case), the situation is unusual.
++	 * Practical evidence: they are filled with something random crap.
++	 * The only case, when this is essential in mainstream kernel
++	 * is sys_clone(). The result is that new process gets some kernel
++	 * information in its register frame. Which is a security problem, btw.
++	 *
++	 * So, we set sw->ar_pfs to pretend the whole frame is of local
++	 * regs. And we have to repartition the frame manually, using
++	 * information from pt->cr_ifs (the register is invalid in this
++	 * case, but it holds correct pfm).
++	 */
++	adds r3=PT(CR_IFS)+16,sp
++	;;
++	ld8  r2=[r3],-(PT(CR_IFS)-PT(R8))
++	;;
++	extr.u  r2=r2,0,37
++	mov	r8=ar.ec
++	;;
++	extr.u  r8=r8,0,5
++	;;
++	shl	r8=r8,52
++	;;
++	or	r2=r2,r8
++	;;
++	mov  ar.pfs=r2
++	;;
++	movl r2=ia64_leave_syscall
++	;;
++	mov  rp=r2
++	/* Plus, we should fetch r8 and r10 from pt_regs. Something else? */
++	ld8  r8=[r3],PT(R10)-PT(R8)
++	;;
++	ld8  r10=[r3]
++	;;
++	br.ret.sptk.many rp
++END(ia64_ret_from_resume)
++
+ 	/*
+ 	 * Invoke a system call, but do some tracing before and after the call.
+ 	 * We MUST preserve the current register frame throughout this routine
+@@ -1236,6 +1304,34 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail)
+ 	br.ret.sptk.many rp
+ END(ia64_invoke_schedule_tail)
+ 
++GLOBAL_ENTRY(ia64_invoke_resume)
++	alloc loc1=ar.pfs,0,3,1,0
++	mov loc0=rp
++	adds out0=16,sp
++	;;
++	ld8  r8=[out0]
++	;;
++	cmp.eq p6,p0=r8,r0
++	;;
++(p6)	br.cond.sptk 1f
++	;;
++	mov  loc2=gp
++	;;
++	ld8  r10=[r8],8
++	;;
++	ld8  gp=[r8]
++	;;
++	mov  b7=r10
++	;;
++	br.call.sptk.many rp=b7
++	;;
++	mov  gp=loc2
++1:	
++	mov ar.pfs=loc1
++	mov rp=loc0
++	br.ret.sptk.many rp
++END(ia64_invoke_resume)
++
+ 	/*
+ 	 * Setup stack and call do_notify_resume_user(), keeping interrupts
+ 	 * disabled.
+@@ -1664,4 +1760,17 @@ sys_call_table:
+ 	data8 sys_timerfd_settime
+ 	data8 sys_timerfd_gettime
+ 
++.rept 1499-1313
++	data8 sys_ni_syscall
++.endr
++	data8 sys_fairsched_vcpus
++	data8 sys_fairsched_mknod		// 1500
++	data8 sys_fairsched_rmnod
++	data8 sys_fairsched_chwt
++	data8 sys_fairsched_mvpr
++	data8 sys_fairsched_rate
++	data8 sys_getluid			// 1505
++	data8 sys_setluid
++	data8 sys_setublimit
++	data8 sys_ubstat
+ 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
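
Userspace reaches these through syscall(2) with the numbers fixed above;
for instance on ia64 (a sketch only; the call's semantics live in the ub
code elsewhere in the patch):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		long luid = syscall(1505);	/* sys_getluid, per the table above */
		printf("luid: %ld\n", luid);
		return 0;
	}
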
+diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
+index c1625c7..634b102 100644
+--- a/arch/ia64/kernel/fsys.S
++++ b/arch/ia64/kernel/fsys.S
+@@ -90,53 +90,6 @@ ENTRY(fsys_getpid)
+ 	FSYS_RETURN
+ END(fsys_getpid)
+ 
+-ENTRY(fsys_getppid)
+-	.prologue
+-	.altrp b6
+-	.body
+-	add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16
+-	;;
+-	ld8 r17=[r17]				// r17 = current->group_leader
+-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
+-	;;
+-
+-	ld4 r9=[r9]
+-	add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = &current->group_leader->real_parent
+-	;;
+-	and r9=TIF_ALLWORK_MASK,r9
+-
+-1:	ld8 r18=[r17]				// r18 = current->group_leader->real_parent
+-	;;
+-	cmp.ne p8,p0=0,r9
+-	add r8=IA64_TASK_TGID_OFFSET,r18	// r8 = &current->group_leader->real_parent->tgid
+-	;;
+-
+-	/*
+-	 * The .acq is needed to ensure that the read of tgid has returned its data before
+-	 * we re-check "real_parent".
+-	 */
+-	ld4.acq r8=[r8]				// r8 = current->group_leader->real_parent->tgid
+-#ifdef CONFIG_SMP
+-	/*
+-	 * Re-read current->group_leader->real_parent.
+-	 */
+-	ld8 r19=[r17]				// r19 = current->group_leader->real_parent
+-(p8)	br.spnt.many fsys_fallback_syscall
+-	;;
+-	cmp.ne p6,p0=r18,r19			// did real_parent change?
+-	mov r19=0			// i must not leak kernel bits...
+-(p6)	br.cond.spnt.few 1b			// yes -> redo the read of tgid and the check
+-	;;
+-	mov r17=0			// i must not leak kernel bits...
+-	mov r18=0			// i must not leak kernel bits...
+-#else
+-	mov r17=0			// i must not leak kernel bits...
+-	mov r18=0			// i must not leak kernel bits...
+-	mov r19=0			// i must not leak kernel bits...
+-#endif
+-	FSYS_RETURN
+-END(fsys_getppid)
+-
+ ENTRY(fsys_set_tid_address)
+ 	.prologue
+ 	.altrp b6
+@@ -767,7 +720,7 @@ fsyscall_table:
+ 	data8 0				// chown
+ 	data8 0				// lseek		// 1040
+ 	data8 fsys_getpid		// getpid
+-	data8 fsys_getppid		// getppid
++	data8 0				// getppid
+ 	data8 0				// mount
+ 	data8 0				// umount
+ 	data8 0				// setuid		// 1045
+diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
+index ddeab4e..f9b6281 100644
+--- a/arch/ia64/kernel/head.S
++++ b/arch/ia64/kernel/head.S
+@@ -1031,7 +1031,7 @@ GLOBAL_ENTRY(start_kernel_thread)
+ 	mov out1 = r11;;
+ 	br.call.sptk.many rp = kernel_thread_helper;;
+ 	mov out0 = r8
+-	br.call.sptk.many rp = sys_exit;;
++	br.call.sptk.many rp = do_exit;;
+ 1:	br.sptk.few 1b				// not reached
+ END(start_kernel_thread)
+ 
+diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
+index 6da1f20..24950d6 100644
+--- a/arch/ia64/kernel/ia64_ksyms.c
++++ b/arch/ia64/kernel/ia64_ksyms.c
+@@ -75,6 +75,8 @@ EXPORT_SYMBOL(xor_ia64_4);
+ EXPORT_SYMBOL(xor_ia64_5);
+ #endif
+ 
++EXPORT_SYMBOL(empty_zero_page);
++
+ #include <asm/pal.h>
+ EXPORT_SYMBOL(ia64_pal_call_phys_stacked);
+ EXPORT_SYMBOL(ia64_pal_call_phys_static);
+diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
+index 705176b..b00c3af 100644
+--- a/arch/ia64/kernel/mca.c
++++ b/arch/ia64/kernel/mca.c
+@@ -1608,10 +1608,10 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi
+ 	}
+ 	printk("\n\n");
+ 	if (read_trylock(&tasklist_lock)) {
+-		do_each_thread (g, t) {
++		do_each_thread_all (g, t) {
+ 			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
+ 			show_stack(t, NULL);
+-		} while_each_thread (g, t);
++		} while_each_thread_all (g, t);
+ 		read_unlock(&tasklist_lock);
+ 	}
+ 	/* FIXME: This will not restore zapped printk locks. */
+diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
+index 7714a97..49fb204 100644
+--- a/arch/ia64/kernel/perfmon.c
++++ b/arch/ia64/kernel/perfmon.c
+@@ -4176,12 +4176,12 @@ pfm_check_task_exist(pfm_context_t *ctx)
+ 
+ 	read_lock(&tasklist_lock);
+ 
+-	do_each_thread (g, t) {
++	do_each_thread_ve (g, t) {
+ 		if (t->thread.pfm_context == ctx) {
+ 			ret = 0;
+ 			goto out;
+ 		}
+-	} while_each_thread (g, t);
++	} while_each_thread_ve (g, t);
+ out:
+ 	read_unlock(&tasklist_lock);
+ 
+diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
+index a3a34b4..54179a7 100644
+--- a/arch/ia64/kernel/process.c
++++ b/arch/ia64/kernel/process.c
+@@ -28,6 +28,7 @@
+ #include <linux/delay.h>
+ #include <linux/kdebug.h>
+ #include <linux/utsname.h>
++#include <linux/sysctl.h>
+ 
+ #include <asm/cpu.h>
+ #include <asm/delay.h>
+@@ -387,6 +388,9 @@ ia64_load_extra (struct task_struct *task)
+ #endif
+ }
+ 
++extern char ia64_ret_from_resume;
++EXPORT_SYMBOL(ia64_ret_from_resume);
++
+ /*
+  * Copy the state of an ia-64 thread.
+  *
+@@ -460,7 +464,6 @@ copy_thread (int nr, unsigned long clone_flags,
+ 			child_ptregs->r12 = user_stack_base + user_stack_size - 16;
+ 			child_ptregs->ar_bspstore = user_stack_base;
+ 			child_ptregs->ar_rnat = 0;
+-			child_ptregs->loadrs = 0;
+ 		}
+ 	} else {
+ 		/*
+@@ -672,16 +675,25 @@ out:
+ 	return error;
+ }
+ 
++extern void start_kernel_thread (void);
++EXPORT_SYMBOL(start_kernel_thread);
++
+ pid_t
+ kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
+ {
+-	extern void start_kernel_thread (void);
+ 	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
+ 	struct {
+ 		struct switch_stack sw;
+ 		struct pt_regs pt;
+ 	} regs;
+ 
++	/* Don't allow kernel_thread() inside VE */
++	if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) {
++		printk("kernel_thread call inside container\n");
++		dump_stack();
++		return -EPERM;
++	}
++
+ 	memset(&regs, 0, sizeof(regs));
+ 	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
+ 	regs.pt.r1 = helper_fptr[1];		/* set GP */
+diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
+index 2a9943b..e44debf 100644
+--- a/arch/ia64/kernel/ptrace.c
++++ b/arch/ia64/kernel/ptrace.c
+@@ -10,6 +10,7 @@
+  * Derived from the x86 and Alpha versions.
+  */
+ #include <linux/kernel.h>
++#include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/slab.h>
+ #include <linux/mm.h>
+@@ -105,6 +106,8 @@ ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat)
+ 
+ #	undef GET_BITS
+ }
++EXPORT_SYMBOL(ia64_get_scratch_nat_bits);
++EXPORT_SYMBOL(__ia64_save_fpu);
+ 
+ /*
+  * Set the NaT bits for the scratch registers according to NAT and
+@@ -461,6 +464,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
+ 	*val = ret;
+ 	return 0;
+ }
++EXPORT_SYMBOL(ia64_peek);
+ 
+ long
+ ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
+@@ -525,6 +529,7 @@ ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt,
+ 		*cfmp = cfm;
+ 	return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty);
+ }
++EXPORT_SYMBOL(ia64_get_user_rbs_end);
+ 
+ /*
+  * Synchronize (i.e, write) the RSE backing store living in kernel
+@@ -820,20 +825,20 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt,
+ 	if (write_access) {
+ 		nat_bits = *data;
+ 		scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits);
+-		if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) {
+-			dprintk("ptrace: failed to set ar.unat\n");
+-			return -1;
+-		}
++		if (info->pri_unat_loc)
++			*info->pri_unat_loc = scratch_unat;
++		else
++			info->sw->caller_unat = scratch_unat;
+ 		for (regnum = 4; regnum <= 7; ++regnum) {
+ 			unw_get_gr(info, regnum, &dummy, &nat);
+ 			unw_set_gr(info, regnum, dummy,
+ 				   (nat_bits >> regnum) & 1);
+ 		}
+ 	} else {
+-		if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) {
+-			dprintk("ptrace: failed to read ar.unat\n");
+-			return -1;
+-		}
++		if (info->pri_unat_loc)
++			scratch_unat = *info->pri_unat_loc;
++		else
++			scratch_unat = info->sw->caller_unat;
+ 		nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat);
+ 		for (regnum = 4; regnum <= 7; ++regnum) {
+ 			unw_get_gr(info, regnum, &dummy, &nat);
+diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
+index 19c5a78..cc6c4e6 100644
+--- a/arch/ia64/kernel/signal.c
++++ b/arch/ia64/kernel/signal.c
+@@ -14,6 +14,7 @@
+ #include <linux/sched.h>
+ #include <linux/signal.h>
+ #include <linux/smp.h>
++#include <linux/freezer.h>
+ #include <linux/stddef.h>
+ #include <linux/tty.h>
+ #include <linux/binfmts.h>
+@@ -464,6 +465,12 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
+ 	if (!user_mode(&scr->pt))
+ 		return;
+ 
++	if (try_to_freeze() && !signal_pending(current)) {
++		if ((long) scr->pt.r10 != -1)
++			restart = 0;
++ 		goto no_signal;
++	}
++
+ 	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+ 		oldset = &current->saved_sigmask;
+ 	else
+@@ -519,8 +526,10 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
+ 				if (IS_IA32_PROCESS(&scr->pt)) {
+ 					scr->pt.r8 = scr->pt.r1;
+ 					scr->pt.cr_iip -= 2;
+-				} else
++				} else {
+ 					ia64_decrement_ip(&scr->pt);
++					scr->pt.r10 = 0;
++				}
+ 				restart = 0; /* don't restart twice if handle_signal() fails... */
+ 			}
+ 		}
+@@ -542,6 +551,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
+ 	}
+ 
+ 	/* Did we come from a system call? */
++no_signal:
+ 	if (restart) {
+ 		/* Restart the system call - no handlers present */
+ 		if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR
+@@ -561,6 +571,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall)
+ 				ia64_decrement_ip(&scr->pt);
+ 				if (errno == ERESTART_RESTARTBLOCK)
+ 					scr->pt.r15 = __NR_restart_syscall;
++				scr->pt.r10 = 0;
+ 			}
+ 		}
+ 	}
+diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
+index 1eda194..e93e7d3 100644
+--- a/arch/ia64/kernel/sys_ia64.c
++++ b/arch/ia64/kernel/sys_ia64.c
+@@ -204,7 +204,7 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un
+ 
+ 	/* Careful about overflows.. */
+ 	len = PAGE_ALIGN(len);
+-	if (!len || len > TASK_SIZE) {
++	if (len > TASK_SIZE) {
+ 		addr = -EINVAL;
+ 		goto out;
+ 	}
+diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
+index aad1b7b..9194bf5 100644
+--- a/arch/ia64/kernel/time.c
++++ b/arch/ia64/kernel/time.c
+@@ -40,6 +40,8 @@ struct fsyscall_gtod_data_t fsyscall_gtod_data = {
+ struct itc_jitter_data_t itc_jitter_data;
+ 
+ volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
++unsigned int cpu_khz;                                   /* TSC clocks / usec, not used here */
++EXPORT_SYMBOL(cpu_khz);
+ 
+ #ifdef CONFIG_IA64_DEBUG_IRQ
+ 
+@@ -335,6 +337,8 @@ ia64_init_itm (void)
+ 		 */
+ 		clocksource_itc.rating = 50;
+ 
++	cpu_khz = local_cpu_data->proc_freq / 1000;
++
+ 	/* Setup the CPU local timer tick */
+ 	ia64_cpu_local_tick();
+ 
+diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
+index ff0e7c1..7288a9f 100644
+--- a/arch/ia64/kernel/unaligned.c
++++ b/arch/ia64/kernel/unaligned.c
+@@ -1291,7 +1291,7 @@ within_logging_rate_limit (void)
+ {
+ 	static unsigned long count, last_time;
+ 
+-	if (time_after(jiffies, last_time + 5 * HZ))
++	if (time_after(jiffies, last_time + 60 * HZ))
+ 		count = 0;
+ 	if (count < 5) {
+ 		last_time = jiffies;
+diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
+index 23088be..da13815 100644
+--- a/arch/ia64/mm/fault.c
++++ b/arch/ia64/mm/fault.c
+@@ -148,7 +148,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
+ 	if ((vma->vm_flags & mask) != mask)
+ 		goto bad_area;
+ 
+-  survive:
+ 	/*
+ 	 * If for any reason at all we couldn't handle the fault, make
+ 	 * sure we exit gracefully rather than endlessly redo the
+@@ -276,13 +275,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
+ 
+   out_of_memory:
+ 	up_read(&mm->mmap_sem);
+-	if (is_global_init(current)) {
+-		yield();
+-		down_read(&mm->mmap_sem);
+-		goto survive;
++	if (user_mode(regs)) {
++		/*
++		 * A 0-order allocation always succeeds unless something really
++		 * fatal happened: a beancounter overdraft or OOM.
++		 */
++		force_sig(SIGKILL, current);
++		return;
+ 	}
+-	printk(KERN_CRIT "VM: killing process %s\n", current->comm);
+-	if (user_mode(regs))
+-		do_group_exit(SIGKILL);
+ 	goto no_context;
+ }
+diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
+index 200100e..226b5cc 100644
+--- a/arch/ia64/mm/init.c
++++ b/arch/ia64/mm/init.c
+@@ -37,6 +37,8 @@
+ #include <asm/unistd.h>
+ #include <asm/mca.h>
+ 
++#include <bc/vmpages.h>
++
+ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+ 
+ extern void ia64_tlb_init (void);
+@@ -111,6 +113,10 @@ ia64_init_addr_space (void)
+ 
+ 	ia64_set_rbs_bot();
+ 
++	if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS,
++				NULL, UB_SOFT))
++		goto skip;
++
+ 	/*
+ 	 * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
+ 	 * the problem.  When the process attempts to write to the register backing store
+@@ -127,11 +133,16 @@ ia64_init_addr_space (void)
+ 		if (insert_vm_struct(current->mm, vma)) {
+ 			up_write(&current->mm->mmap_sem);
+ 			kmem_cache_free(vm_area_cachep, vma);
++			ub_memory_uncharge(current->mm, PAGE_SIZE,
++					VM_DATA_DEFAULT_FLAGS, NULL);
+ 			return;
+ 		}
+ 		up_write(&current->mm->mmap_sem);
+-	}
++	} else
++		ub_memory_uncharge(current->mm, PAGE_SIZE,
++				VM_DATA_DEFAULT_FLAGS, NULL);
+ 
++skip:
+ 	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
+ 	if (!(current->personality & MMAP_PAGE_ZERO)) {
+ 		vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
+index 3934e26..f0f8abb 100644
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -798,8 +798,12 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig"
+ 
+ source "lib/Kconfig"
+ 
++source "kernel/bc/Kconfig"
++
+ source "arch/powerpc/Kconfig.debug"
+ 
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+ 
+ config KEYS_COMPAT
+diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
+index 89aaaa6..05a57b3 100644
+--- a/arch/powerpc/kernel/misc_32.S
++++ b/arch/powerpc/kernel/misc_32.S
+@@ -835,7 +835,7 @@ _GLOBAL(abs)
+  * Create a kernel thread
+  *   kernel_thread(fn, arg, flags)
+  */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ 	stwu	r1,-16(r1)
+ 	stw	r30,8(r1)
+ 	stw	r31,12(r1)
+diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
+index 942951e..5d704fe 100644
+--- a/arch/powerpc/kernel/misc_64.S
++++ b/arch/powerpc/kernel/misc_64.S
+@@ -415,7 +415,7 @@ _GLOBAL(scom970_write)
+  * Create a kernel thread
+  *   kernel_thread(fn, arg, flags)
+  */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ 	std	r29,-24(r1)
+ 	std	r30,-16(r1)
+ 	stdu	r1,-STACK_FRAME_OVERHEAD(r1)
+diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
+index 7de41c3..4fec839 100644
+--- a/arch/powerpc/kernel/process.c
++++ b/arch/powerpc/kernel/process.c
+@@ -48,6 +48,8 @@
+ #include <asm/firmware.h>
+ #endif
+ 
++#include <linux/utsrelease.h>
++
+ extern unsigned long _get_SP(void);
+ 
+ #ifndef CONFIG_SMP
+@@ -452,8 +454,9 @@ void show_regs(struct pt_regs * regs)
+ 
+ 	printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
+ 	       regs->nip, regs->link, regs->ctr);
+-	printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
+-	       regs, regs->trap, print_tainted(), init_utsname()->release);
++	printk("REGS: %p TRAP: %04lx   %s  (%s %s)\n",
++	       regs, regs->trap, print_tainted(), init_utsname()->release,
++	       VZVERSION);
+ 	printk("MSR: "REG" ", regs->msr);
+ 	printbits(regs->msr, msr_bits);
+ 	printk("  CR: %08lx  XER: %08lx\n", regs->ccr, regs->xer);
+@@ -1004,6 +1007,20 @@ void dump_stack(void)
+ }
+ EXPORT_SYMBOL(dump_stack);
+ 
++long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
++{
++	extern long ppc_kernel_thread(int (*fn)(void *), void *arg,
++			unsigned long flags);
++
++	if (!ve_is_super(get_exec_env())) {
++		printk("kernel_thread call inside container\n");
++		dump_stack();
++		return -EPERM;
++	}
++
++	return ppc_kernel_thread(fn, arg, flags);
++}
++
+ #ifdef CONFIG_PPC64
+ void ppc64_runlatch_on(void)
+ {
+diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
+index 93219c3..a9e16bb 100644
+--- a/arch/powerpc/kernel/systbl.S
++++ b/arch/powerpc/kernel/systbl.S
+@@ -43,5 +43,9 @@
+ 	.p2align	3
+ #endif
+ 
++#define SYS_SKIP(from, to)	.rept to - from		\
++				SYSCALL(sys_ni_syscall)	\
++				.endr
++
+ _GLOBAL(sys_call_table)
+ #include <asm/systbl.h>
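
The new SYS_SKIP macro lets asm/systbl.h pad the gap up to the OpenVZ
syscall range in one line instead of listing sys_ni_syscall entries
individually; hypothetically (the numbers mirror the sparc64 table above,
not necessarily powerpc's):

	SYS_SKIP(317, 500)
	SYSCALL(sys_fairsched_mknod)	/* 500 */
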
+diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
+index 7b25107..bc28b20 100644
+--- a/arch/powerpc/mm/fault.c
++++ b/arch/powerpc/mm/fault.c
+@@ -333,7 +333,6 @@ good_area:
+ 	 * make sure we exit gracefully rather than endlessly redo
+ 	 * the fault.
+ 	 */
+- survive:
+ 	ret = handle_mm_fault(mm, vma, address, is_write);
+ 	if (unlikely(ret & VM_FAULT_ERROR)) {
+ 		if (ret & VM_FAULT_OOM)
+@@ -373,14 +372,12 @@ bad_area_nosemaphore:
+  */
+ out_of_memory:
+ 	up_read(&mm->mmap_sem);
+-	if (is_global_init(current)) {
+-		yield();
+-		down_read(&mm->mmap_sem);
+-		goto survive;
+-	}
+-	printk("VM: killing process %s\n", current->comm);
+ 	if (user_mode(regs))
+-		do_group_exit(SIGKILL);
++		/*
++		 * A 0-order allocation always succeeds unless something really
++		 * fatal happened: a beancounter overdraft or OOM. Den
++		 */
++		force_sig(SIGKILL, current);
+ 	return SIGKILL;
+ 
+ do_sigbus:
+diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
+index 6aa6537..139b841 100644
+--- a/arch/powerpc/mm/init_64.c
++++ b/arch/powerpc/mm/init_64.c
+@@ -173,7 +173,7 @@ void pgtable_cache_init(void)
+ 			"for size: %08x...\n", name, i, size);
+ 		pgtable_cache[i] = kmem_cache_create(name,
+ 						     size, size,
+-						     SLAB_PANIC,
++						     SLAB_PANIC|SLAB_UBC|SLAB_NO_CHARGE,
+ 						     zero_ctor);
+ 	}
+ }
+diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
+index e0ff59f..083ce8b 100644
+--- a/arch/powerpc/mm/pgtable_32.c
++++ b/arch/powerpc/mm/pgtable_32.c
+@@ -83,7 +83,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ 	pgd_t *ret;
+ 
+-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
++	ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC |
++			__GFP_ZERO, PGDIR_ORDER);
+ 	return ret;
+ }
+ 
+@@ -117,6 +118,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ #else
+ 	gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
+ #endif
++	flags |= (__GFP_UBC | __GFP_SOFT_UBC);
+ 
+ 	ptepage = alloc_pages(flags, 0);
+ 	if (!ptepage)
+diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
+index 19f6bfd..4f23f43 100644
+--- a/arch/powerpc/platforms/cell/spu_callbacks.c
++++ b/arch/powerpc/platforms/cell/spu_callbacks.c
+@@ -46,6 +46,8 @@ static void *spu_syscall_table[] = {
+ #define PPC_SYS_SPU(func)	ppc_##func,
+ #define SYSX_SPU(f, f3264, f32)	f,
+ 
++#define SYS_SKIP(from, to) [from ... to] = sys_ni_syscall,
++
+ #include <asm/systbl.h>
+ };
+ 
+diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig
+index 0f1863e..4849a57 100644
+--- a/arch/ppc/Kconfig
++++ b/arch/ppc/Kconfig
+@@ -1181,6 +1181,10 @@ source "lib/Kconfig"
+ 
+ source "arch/ppc/Kconfig.debug"
+ 
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+ 
++source "kernel/bc/Kconfig"
++
+ source "crypto/Kconfig"
+diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S
+index d5e0dfc..b217a25 100644
+--- a/arch/ppc/kernel/misc.S
++++ b/arch/ppc/kernel/misc.S
+@@ -826,7 +826,7 @@ _GLOBAL(_get_SP)
+  * Create a kernel thread
+  *   kernel_thread(fn, arg, flags)
+  */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ 	stwu	r1,-16(r1)
+ 	stw	r30,8(r1)
+ 	stw	r31,12(r1)
+diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c
+index 36c0e75..276d861 100644
+--- a/arch/ppc/mm/fault.c
++++ b/arch/ppc/mm/fault.c
+@@ -249,7 +249,6 @@ good_area:
+ 	 * make sure we exit gracefully rather than endlessly redo
+ 	 * the fault.
+ 	 */
+- survive:
+ 	fault = handle_mm_fault(mm, vma, address, is_write);
+ 	if (unlikely(fault & VM_FAULT_ERROR)) {
+ 		if (fault & VM_FAULT_OOM)
+@@ -290,14 +289,12 @@ bad_area:
+  */
+ out_of_memory:
+ 	up_read(&mm->mmap_sem);
+-	if (is_global_init(current)) {
+-		yield();
+-		down_read(&mm->mmap_sem);
+-		goto survive;
+-	}
+-	printk("VM: killing process %s\n", current->comm);
+ 	if (user_mode(regs))
+-		do_group_exit(SIGKILL);
++		/*
++		 * A 0-order allocation always succeeds unless something really
++		 * fatal happened: a beancounter overdraft or OOM. Den
++		 */
++		force_sig(SIGKILL, current);
+ 	return SIGKILL;
+ 
+ do_sigbus:
+diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c
+index 03a79bf..0cf0355 100644
+--- a/arch/ppc/mm/pgtable.c
++++ b/arch/ppc/mm/pgtable.c
+@@ -70,7 +70,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ 	pgd_t *ret;
+ 
+-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
++	ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC |
++			__GFP_ZERO, PGDIR_ORDER);
+ 	return ret;
+ }
+ 
+@@ -104,6 +105,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ #else
+ 	gfp_t flags = GFP_KERNEL | __GFP_REPEAT;
+ #endif
++	flags |= (__GFP_UBC | __GFP_SOFT_UBC);
+ 
+ 	ptepage = alloc_pages(flags, 0);
+ 	if (ptepage) {
+diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
+index 107e492..9477a5a 100644
+--- a/arch/s390/Kconfig
++++ b/arch/s390/Kconfig
+@@ -562,6 +562,8 @@ source "fs/Kconfig"
+ 
+ source "arch/s390/Kconfig.debug"
+ 
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+ 
+ source "crypto/Kconfig"
+diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
+index 5d4fa4b..4ac7ef3 100644
+--- a/arch/s390/kernel/smp.c
++++ b/arch/s390/kernel/smp.c
+@@ -577,8 +577,19 @@ out:
+  */
+ int __cpuinit start_secondary(void *cpuvoid)
+ {
+-	/* Setup the cpu */
+-	cpu_init();
++        /* Setup the cpu */
++        cpu_init();
++
++#ifdef CONFIG_VE
++	/* TSC reset. kill whatever might rely on old values */
++	VE_TASK_INFO(current)->wakeup_stamp = 0;
++	/*
++	 * Cosmetic: sleep_time won't be changed afterwards for the idle
++	 * thread;  keep it 0 rather than -cycles.
++	 */
++	VE_TASK_INFO(idle)->sleep_time = 0;
++#endif
++
+ 	preempt_disable();
+ 	/* Enable TOD clock interrupts on the secondary cpu. */
+ 	init_cpu_timer();
+@@ -836,6 +847,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
+ 	for_each_possible_cpu(cpu)
+ 		if (cpu != smp_processor_id())
+ 			smp_create_idle(cpu);
++
++#ifdef CONFIG_VE
++	/* TSC reset. kill whatever might rely on old values */
++	VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ }
+ 
+ void __init smp_prepare_boot_cpu(void)
+diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
+index 0283d81..e7815f6 100644
+--- a/arch/sh/kernel/process_64.c
++++ b/arch/sh/kernel/process_64.c
+@@ -680,7 +680,7 @@ asids_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, void
+ 	int len=0;
+ 	struct task_struct *p;
+ 	read_lock(&tasklist_lock);
+-	for_each_process(p) {
++	for_each_process_ve(p) {
+ 		int pid = p->pid;
+ 
+ 		if (!pid)
+diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
+index eb36f3b..5c8eb15 100644
+--- a/arch/sparc64/Kconfig
++++ b/arch/sparc64/Kconfig
+@@ -407,8 +407,12 @@ source "fs/Kconfig"
+ 
+ source "arch/sparc64/Kconfig.debug"
+ 
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+ 
+ source "crypto/Kconfig"
+ 
+ source "lib/Kconfig"
++
++source "kernel/bc/Kconfig"
+diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
+index 2084f81..552ed1b 100644
+--- a/arch/sparc64/kernel/process.c
++++ b/arch/sparc64/kernel/process.c
+@@ -747,6 +747,13 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ {
+ 	long retval;
+ 
++	/* Don't allow kernel_thread() inside VE */
++	if (!ve_is_super(get_exec_env())) {
++		printk("kernel_thread call inside container\n");
++		dump_stack();
++		return -EPERM;
++	}
++
+ 	/* If the parent runs before fn(arg) is called by the child,
+ 	 * the input registers of this function can be clobbered.
+ 	 * So we stash 'fn' and 'arg' into global registers which
+diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
+index 8b5282d..9a58e19 100644
+--- a/arch/sparc64/kernel/systbls.S
++++ b/arch/sparc64/kernel/systbls.S
+@@ -83,6 +83,24 @@ sys_call_table32:
+ /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
+ 	.word compat_sys_timerfd_settime, compat_sys_timerfd_gettime
+ 
++	.rept 500-317
++	.word sys_nis_syscall
++	.endr
++	.word sys_fairsched_mknod	/* 500 */
++	.word sys_fairsched_rmnod
++	.word sys_fairsched_chwt
++	.word sys_fairsched_mvpr
++	.word sys_fairsched_rate
++	.word sys_nis_syscall		/* 505 */
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_getluid		/* 510 */
++	.word sys_setluid
++	.word compat_sys_setublimit
++	.word compat_sys_ubstat
++
+ #endif /* CONFIG_COMPAT */
+ 
+ 	/* Now the 64-bit native Linux syscall table. */
+@@ -155,3 +173,20 @@ sys_call_table:
+ 	.word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
+ /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
+ 	.word sys_timerfd_settime, sys_timerfd_gettime
++	.rept 500-317
++	.word sys_nis_syscall
++	.endr
++	.word sys_fairsched_mknod	/* 500 */
++	.word sys_fairsched_rmnod
++	.word sys_fairsched_chwt
++	.word sys_fairsched_mvpr
++	.word sys_fairsched_rate
++	.word sys_nis_syscall		/* 505 */
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_nis_syscall
++	.word sys_getluid		/* 510 */
++	.word sys_setluid
++	.word sys_setublimit
++	.word sys_ubstat
+diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
+index 3697492..a60f6dd 100644
+--- a/arch/sparc64/kernel/traps.c
++++ b/arch/sparc64/kernel/traps.c
+@@ -2197,6 +2197,10 @@ void die_if_kernel(char *str, struct pt_regs *regs)
+ "                 \\__U_/\n");
+ 
+ 	printk("%s(%d): %s [#%d]\n", current->comm, task_pid_nr(current), str, ++die_counter);
++	printk("VE:EXCVE %d:%d, CPU %d, VCPU %d:%d\n",
++		VEID(VE_TASK_INFO(current)->owner_env), VEID(get_exec_env()),
++		smp_processor_id(),
++		task_vsched_id(current), task_cpu(current));
+ 	notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV);
+ 	__asm__ __volatile__("flushw");
+ 	__show_regs(regs);
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index bf07b6f..3574b92 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1711,6 +1711,7 @@ config SYSVIPC_COMPAT
+ 
+ endmenu
+ 
++source "kernel/Kconfig.openvz"
+ 
+ source "net/Kconfig"
+ 
+@@ -1729,3 +1730,5 @@ source "crypto/Kconfig"
+ source "arch/x86/kvm/Kconfig"
+ 
+ source "lib/Kconfig"
++
++source "kernel/bc/Kconfig"
+diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
+index b5e329d..434e6cf 100644
+--- a/arch/x86/ia32/ia32entry.S
++++ b/arch/x86/ia32/ia32entry.S
+@@ -517,7 +517,7 @@ ia32_sys_call_table:
+ 	.quad stub32_iopl		/* 110 */
+ 	.quad sys_vhangup
+ 	.quad quiet_ni_syscall	/* old "idle" system call */
+-	.quad sys32_vm86_warning	/* vm86old */ 
++	.quad quiet_ni_syscall	/* vm86old */ 
+ 	.quad compat_sys_wait4
+ 	.quad sys_swapoff		/* 115 */
+ 	.quad compat_sys_sysinfo
+@@ -570,7 +570,7 @@ ia32_sys_call_table:
+ 	.quad sys_mremap
+ 	.quad sys_setresuid16
+ 	.quad sys_getresuid16	/* 165 */
+-	.quad sys32_vm86_warning	/* vm86 */ 
++	.quad quiet_ni_syscall	/* vm86 */ 
+ 	.quad quiet_ni_syscall	/* query_module */
+ 	.quad sys_poll
+ 	.quad compat_sys_nfsservctl
+diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
+index f00afdf..fd045ce 100644
+--- a/arch/x86/ia32/sys_ia32.c
++++ b/arch/x86/ia32/sys_ia32.c
+@@ -817,20 +817,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
+ 				advice);
+ }
+ 
+-long sys32_vm86_warning(void)
+-{
+-	struct task_struct *me = current;
+-	static char lastcomm[sizeof(me->comm)];
+-
+-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+-		compat_printk(KERN_INFO
+-			      "%s: vm86 mode not supported on 64 bit kernel\n",
+-			      me->comm);
+-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
+-	}
+-	return -ENOSYS;
+-}
+-
+ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
+ 			  char __user *buf, size_t len)
+ {
+diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
+index c778e4f..732716c 100644
+--- a/arch/x86/kernel/entry_32.S
++++ b/arch/x86/kernel/entry_32.S
+@@ -214,6 +214,7 @@ ENTRY(ret_from_fork)
+ 	GET_THREAD_INFO(%ebp)
+ 	popl %eax
+ 	CFI_ADJUST_CFA_OFFSET -4
++ret_from_fork_tail:
+ 	pushl $0x0202			# Reset kernel eflags
+ 	CFI_ADJUST_CFA_OFFSET 4
+ 	popfl
+@@ -222,6 +223,25 @@ ENTRY(ret_from_fork)
+ 	CFI_ENDPROC
+ END(ret_from_fork)
+ 
++ENTRY(i386_ret_from_resume)
++	CFI_STARTPROC
++	pushl %eax
++	CFI_ADJUST_CFA_OFFSET 4
++	call schedule_tail
++	GET_THREAD_INFO(%ebp)
++	popl %eax
++	CFI_ADJUST_CFA_OFFSET -4
++	movl (%esp),%eax
++	testl %eax,%eax
++	jz    1f
++	pushl %esp
++	call  *%eax
++	addl  $4,%esp
++1:
++	addl  $256,%esp
++	jmp   ret_from_fork_tail
++	CFI_ENDPROC
++
+ /*
+  * Return to user mode is not as complex as all this looks,
+  * but we want the default path for a system call return to
+diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
+index 556a8df..87cd7f2 100644
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -168,7 +168,12 @@ ENTRY(ret_from_fork)
+ 	popf				# reset kernel eflags
+ 	CFI_ADJUST_CFA_OFFSET -4
+ 	call schedule_tail
++ret_from_fork_tail:
+ 	GET_THREAD_INFO(%rcx)
++	btr $TIF_RESUME,threadinfo_flags(%rcx)
++	jc  x86_64_ret_from_resume
++
++ret_from_fork_check:
+ 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ 	jnz rff_trace
+ rff_action:	
+@@ -184,6 +189,19 @@ rff_trace:
+ 	call syscall_trace_leave
+ 	GET_THREAD_INFO(%rcx)	
+ 	jmp rff_action
++
++x86_64_ret_from_resume:
++	movq (%rsp),%rax
++	testq %rax,%rax
++	jz 1f
++	movq  %rsp,%rdi
++	call  *%rax
++1:
++	addq $256,%rsp
++	cmpq $0,ORIG_RAX(%rsp)
++	jge  ret_from_fork_tail
++	RESTORE_REST
++	jmp  int_ret_from_sys_call
+ 	CFI_ENDPROC
+ END(ret_from_fork)
+ 
+@@ -992,7 +1010,7 @@ ENTRY(kernel_thread)
+ 	xorl %r9d,%r9d
+ 	
+ 	# clone now
+-	call do_fork
++	call do_fork_kthread
+ 	movq %rax,RAX(%rsp)
+ 	xorl %edi,%edi
+ 
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index 0224c36..e6385eb 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -12,6 +12,7 @@
+ #include <linux/mm.h>
+ #include <linux/smp.h>
+ #include <linux/vmalloc.h>
++#include <linux/module.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -19,6 +20,8 @@
+ #include <asm/desc.h>
+ #include <asm/mmu_context.h>
+ 
++#include <bc/kmem.h>
++
+ #ifdef CONFIG_SMP
+ static void flush_ldt(void *null)
+ {
+@@ -38,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+ 	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+ 			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+ 	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
++		newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE);
+ 	else
+-		newldt = (void *)__get_free_page(GFP_KERNEL);
++		newldt = (void *)__get_free_page(GFP_KERNEL_UBC);
+ 
+ 	if (!newldt)
+ 		return -ENOMEM;
+@@ -112,6 +115,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+ 	}
+ 	return retval;
+ }
++EXPORT_SYMBOL_GPL(init_new_context);
+ 
+ /*
+  * No need to lock the MM as we are the last user
+diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
+index 84160f7..3fe4e6d 100644
+--- a/arch/x86/kernel/nmi_32.c
++++ b/arch/x86/kernel/nmi_32.c
+@@ -316,6 +316,21 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
+ 
+ extern void die_nmi(struct pt_regs *, const char *msg);
+ 
++void smp_show_regs(struct pt_regs *regs, void *info)
++{
++	static DEFINE_SPINLOCK(show_regs_lock);
++
++	if (regs == NULL)
++		return;
++
++	spin_lock(&show_regs_lock);
++	bust_spinlocks(1);
++	printk("----------- IPI show regs -----------");
++	show_regs(regs);
++	bust_spinlocks(0);
++	spin_unlock(&show_regs_lock);
++}
++
+ notrace __kprobes int
+ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+ {
+diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
+index 5a29ded..e96721c 100644
+--- a/arch/x86/kernel/nmi_64.c
++++ b/arch/x86/kernel/nmi_64.c
+@@ -354,10 +354,10 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+ 	if (!touched && __get_cpu_var(last_irq_sum) == sum) {
+ 		/*
+ 		 * Ayiee, looks like this CPU is stuck ...
+-		 * wait a few IRQs (5 seconds) before doing the oops ...
++		 * wait a few IRQs (30 seconds) before doing the oops ...
+ 		 */
+ 		local_inc(&__get_cpu_var(alert_counter));
+-		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
++		if (local_read(&__get_cpu_var(alert_counter)) == 30*nmi_hz)
+ 			die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
+ 				panic_on_timeout);
+ 	} else {
+@@ -385,16 +385,35 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+ 
+ static unsigned ignore_nmis;
+ 
++static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
++{
++	return 0;
++}
++
++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback;
++
+ asmlinkage notrace __kprobes void
+ do_nmi(struct pt_regs *regs, long error_code)
+ {
+ 	nmi_enter();
+ 	add_pda(__nmi_count,1);
+-	if (!ignore_nmis)
+-		default_do_nmi(regs);
++	if (!ignore_nmis) {
++		if (!nmi_ipi_callback(regs, smp_processor_id()))
++			default_do_nmi(regs);
++	}
+ 	nmi_exit();
+ }
+ 
++void set_nmi_ipi_callback(nmi_callback_t callback)
++{
++	nmi_ipi_callback = callback;
++}
++
++void unset_nmi_ipi_callback(void)
++{
++	nmi_ipi_callback = dummy_nmi_callback;
++}
++
+ void stop_nmi(void)
+ {
+ 	acpi_nmi_disable();
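+
With these hooks, do_nmi() consults a single overridable callback before falling back to default_do_nmi(); a callback that returns non-zero claims the NMI for itself. The registration pair is consumed by smp_nmi_call_function() in the arch/x86/kernel/smp.c hunk further down. A hedged sketch of the protocol, using a hypothetical handler that is not part of the patch:

	/*
	 * Hypothetical consumer of the hooks above.  Returning non-zero
	 * tells do_nmi() that the NMI was ours, so default_do_nmi() and
	 * the watchdog logic are skipped for this NMI.
	 */
	static int my_nmi_ipi_handler(struct pt_regs *regs, int cpu)
	{
		/* per-cpu work; must be NMI-safe and non-blocking */
		return 1;
	}

	set_nmi_ipi_callback(my_nmi_ipi_handler);
	/* ... send NMI IPIs and wait for the handlers to run ... */
	unset_nmi_ipi_callback();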
+diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
+index e2db9ac..9260537 100644
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -37,6 +37,7 @@
+ #include <linux/tick.h>
+ #include <linux/percpu.h>
+ #include <linux/prctl.h>
++#include <linux/sysctl.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -51,12 +52,16 @@
+ #endif
+ 
+ #include <linux/err.h>
++#include <linux/utsrelease.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/cpu.h>
+ #include <asm/kdebug.h>
+ 
+ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++EXPORT_SYMBOL(ret_from_fork);
++asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume");
++EXPORT_SYMBOL_GPL(i386_ret_from_resume);
+ 
+ static int hlt_counter;
+ 
+@@ -212,16 +217,17 @@ void __show_registers(struct pt_regs *regs, int all)
+ 	}
+ 
+ 	printk("\n");
+-	printk("Pid: %d, comm: %s %s (%s %.*s)\n",
++	printk("Pid: %d, comm: %s %s (%s %.*s %s)\n",
+ 			task_pid_nr(current), current->comm,
+ 			print_tainted(), init_utsname()->release,
+ 			(int)strcspn(init_utsname()->version, " "),
+-			init_utsname()->version);
++			init_utsname()->version, VZVERSION);
+ 
+ 	printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+ 			(u16)regs->cs, regs->ip, regs->flags,
+ 			smp_processor_id());
+-	print_symbol("EIP is at %s\n", regs->ip);
++	if (decode_call_traces)
++		print_symbol("EIP is at %s\n", regs->ip);
+ 
+ 	printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ 		regs->ax, regs->bx, regs->cx, regs->dx);
+@@ -257,6 +263,8 @@ void show_regs(struct pt_regs *regs)
+ {
+ 	__show_registers(regs, 1);
+ 	show_trace(NULL, regs, &regs->sp, regs->bp);
++	if (!decode_call_traces)
++		printk(" EIP: [<%08lx>]\n", regs->ip);
+ }
+ 
+ /*
+@@ -265,6 +273,7 @@ void show_regs(struct pt_regs *regs)
+  * the "args".
+  */
+ extern void kernel_thread_helper(void);
++EXPORT_SYMBOL(kernel_thread_helper);
+ 
+ /*
+  * Create a kernel thread
+@@ -273,6 +282,13 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ {
+ 	struct pt_regs regs;
+ 
++	/* Don't allow kernel_thread() inside VE */
++	if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) {
++		printk("kernel_thread call inside container\n");
++		dump_stack();
++		return -EPERM;
++	}
++
+ 	memset(&regs, 0, sizeof(regs));
+ 
+ 	regs.bx = (unsigned long) fn;
+diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
+index c6eb5c9..c303e39 100644
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -26,8 +26,10 @@
+ #include <linux/smp.h>
+ #include <linux/slab.h>
+ #include <linux/user.h>
++#include <linux/sysctl.h>
+ #include <linux/interrupt.h>
+ #include <linux/utsname.h>
++#include <linux/utsrelease.h>
+ #include <linux/delay.h>
+ #include <linux/module.h>
+ #include <linux/ptrace.h>
+@@ -52,8 +54,6 @@
+ #include <asm/ia32.h>
+ #include <asm/idle.h>
+ 
+-asmlinkage extern void ret_from_fork(void);
+-
+ unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+ 
+ unsigned long boot_option_idle_override = 0;
+@@ -189,13 +189,14 @@ void __show_regs(struct pt_regs * regs)
+ 
+ 	printk("\n");
+ 	print_modules();
+-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
++	printk("Pid: %d, comm: %.20s %s %s %.*s %s\n",
+ 		current->pid, current->comm, print_tainted(),
+ 		init_utsname()->release,
+ 		(int)strcspn(init_utsname()->version, " "),
+-		init_utsname()->version);
++		init_utsname()->version, VZVERSION);
+ 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+-	printk_address(regs->ip, 1);
++	if (decode_call_traces)
++		printk_address(regs->ip, 1);
+ 	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
+ 		regs->flags);
+ 	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+@@ -243,9 +244,26 @@ void show_regs(struct pt_regs *regs)
+ {
+ 	printk("CPU %d:", smp_processor_id());
+ 	__show_regs(regs);
+-	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
++	show_trace(NULL, regs, &regs->sp, regs->bp);
++	if (!decode_call_traces)
++		printk(" EIP: [<%08lx>]\n", regs->ip);
+ }
+ 
++void smp_show_regs(struct pt_regs *regs, void *data)
++{
++	static DEFINE_SPINLOCK(show_regs_lock);
++
++	if (regs == NULL)
++		return;
++
++	spin_lock(&show_regs_lock);
++	bust_spinlocks(1);
++	printk("----------- IPI show regs -----------\n");
++	show_regs(regs);
++	bust_spinlocks(0);
++	spin_unlock(&show_regs_lock);
++}
++
+ /*
+  * Free current thread data structures etc..
+  */
+@@ -868,3 +886,20 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
+ 	unsigned long range_end = mm->brk + 0x02000000;
+ 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ }
++
++long do_fork_kthread(unsigned long clone_flags,
++	      unsigned long stack_start,
++	      struct pt_regs *regs,
++	      unsigned long stack_size,
++	      int __user *parent_tidptr,
++	      int __user *child_tidptr)
++{
++	if (ve_allow_kthreads || ve_is_super(get_exec_env()))
++		return do_fork(clone_flags, stack_start, regs, stack_size,
++				parent_tidptr, child_tidptr);
++
++	/* Don't allow kernel_thread() inside VE */
++	printk("kernel_thread call inside container\n");
++	dump_stack();
++	return -EPERM;
++}
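+
This wrapper is the x86-64 form of a guard that appears three times in the patch: in sparc64 kernel_thread(), in i386 kernel_thread(), and here, wired in by the entry_64.S hunk above, which replaces the "call do_fork" in kernel_thread with "call do_fork_kthread". Note that the sparc64 variant omits the ve_allow_kthreads escape hatch honoured by both x86 versions; ve_allow_kthreads is a sysctl added elsewhere in the patch. The guard itself, reproduced from the hunks rather than written anew:

	/* Refuse kernel threads spawned from inside a container unless
	 * the ve_allow_kthreads sysctl explicitly permits them. */
	if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) {
		printk("kernel_thread call inside container\n");
		dump_stack();
		return -EPERM;
	}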
+diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
+index a7835f2..595dae6 100644
+--- a/arch/x86/kernel/ptrace.c
++++ b/arch/x86/kernel/ptrace.c
+@@ -1422,8 +1422,11 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
+ 		return 0;
+ 
+ 	/* Fake a debug trap */
+-	if (is_singlestep)
++	if (is_singlestep) {
++		set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY);
+ 		send_sigtrap(current, regs, 0);
++		clear_pn_state(current);
++	}
+ 
+  	if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
+ 		goto out;
+diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
+index aee0e82..2dd69cc 100644
+--- a/arch/x86/kernel/setup64.c
++++ b/arch/x86/kernel/setup64.c
+@@ -285,3 +285,5 @@ void __cpuinit cpu_init (void)
+ 	if (is_uv_system())
+ 		uv_cpu_init();
+ }
++
++EXPORT_SYMBOL_GPL(cpu_gdt_descr);
+diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
+index d923736..193b8bf 100644
+--- a/arch/x86/kernel/signal_32.c
++++ b/arch/x86/kernel/signal_32.c
+@@ -20,6 +20,7 @@
+ #include <linux/elf.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/freezer.h>
+ 
+ #include <asm/processor.h>
+ #include <asm/ucontext.h>
+@@ -593,6 +594,9 @@ static void do_signal(struct pt_regs *regs)
+ 	if (!user_mode(regs))
+ 		return;
+ 
++	if (try_to_freeze() && !signal_pending(current))
++		goto no_signal;
++
+ 	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+ 		oldset = &current->saved_sigmask;
+ 	else
+@@ -622,6 +626,7 @@ static void do_signal(struct pt_regs *regs)
+ 		return;
+ 	}
+ 
++no_signal:
+ 	/* Did we come from a system call? */
+ 	if ((long)regs->orig_ax >= 0) {
+ 		/* Restart the system call - no handlers present */
+diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
+index e53b267..3319d4a 100644
+--- a/arch/x86/kernel/signal_64.c
++++ b/arch/x86/kernel/signal_64.c
+@@ -19,6 +19,7 @@
+ #include <linux/stddef.h>
+ #include <linux/personality.h>
+ #include <linux/compiler.h>
++#include <linux/freezer.h>
+ #include <asm/processor.h>
+ #include <asm/ucontext.h>
+ #include <asm/uaccess.h>
+@@ -427,6 +428,9 @@ static void do_signal(struct pt_regs *regs)
+ 	if (!user_mode(regs))
+ 		return;
+ 
++	if (try_to_freeze() && !signal_pending(current))
++		goto no_signal;
++
+ 	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+ 		oldset = &current->saved_sigmask;
+ 	else
+@@ -455,6 +459,7 @@ static void do_signal(struct pt_regs *regs)
+ 		return;
+ 	}
+ 
++no_signal:
+ 	/* Did we come from a system call? */
+ 	if (current_syscall(regs) >= 0) {
+ 		/* Restart the system call - no handlers present */
+diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
+index 0cb7aad..5657402 100644
+--- a/arch/x86/kernel/smp.c
++++ b/arch/x86/kernel/smp.c
+@@ -22,6 +22,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/cpu.h>
+ 
++#include <linux/nmi.h>
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
+ #include <asm/mmu_context.h>
+@@ -249,6 +250,89 @@ native_smp_call_function_mask(cpumask_t mask,
+ 	return 0;
+ }
+ 
++static DEFINE_SPINLOCK(nmi_call_lock);
++static struct nmi_call_data_struct {
++	smp_nmi_function func;
++	void *info;
++	atomic_t started;
++	atomic_t finished;
++	cpumask_t cpus_called;
++	int wait;
++} *nmi_call_data;
++
++static int smp_nmi_callback(struct pt_regs *regs, int cpu)
++{
++	smp_nmi_function func;
++	void *info;
++	int wait;
++
++	func = nmi_call_data->func;
++	info = nmi_call_data->info;
++	wait = nmi_call_data->wait;
++	ack_APIC_irq();
++	/* prevent func() from being called multiple times */
++	if (cpu_test_and_set(cpu, nmi_call_data->cpus_called))
++		return 0;
++	/*
++	 * notify initiating CPU that I've grabbed the data and am
++	 * about to execute the function
++	 */
++	mb();
++	atomic_inc(&nmi_call_data->started);
++	/* at this point the nmi_call_data structure is out of scope */
++	irq_enter();
++	func(regs, info);
++	irq_exit();
++	if (wait)
++		atomic_inc(&nmi_call_data->finished);
++
++	return 1;
++}
++
++/*
++ * This function tries to call func(regs, info) on each cpu.
++ * Func must be fast and non-blocking.
++ * May be called with disabled interrupts and from any context.
++ */
++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
++{
++	struct nmi_call_data_struct data;
++	int cpus;
++
++	cpus = num_online_cpus() - 1;
++	if (!cpus)
++		return 0;
++
++	data.func = func;
++	data.info = info;
++	data.wait = wait;
++	atomic_set(&data.started, 0);
++	atomic_set(&data.finished, 0);
++	cpus_clear(data.cpus_called);
++	/* prevent this cpu from calling func if NMI happens */
++	cpu_set(smp_processor_id(), data.cpus_called);
++
++	if (!spin_trylock(&nmi_call_lock))
++		return -1;
++
++	nmi_call_data = &data;
++	set_nmi_ipi_callback(smp_nmi_callback);
++	mb();
++
++	/* Send a message to all other CPUs and wait for them to respond */
++	send_IPI_allbutself(APIC_DM_NMI);
++	while (atomic_read(&data.started) != cpus)
++		barrier();
++
++	unset_nmi_ipi_callback();
++	if (wait)
++		while (atomic_read(&data.finished) != cpus)
++			barrier();
++	spin_unlock(&nmi_call_lock);
++
++	return 0;
++}
++
+ static void stop_this_cpu(void *dummy)
+ {
+ 	local_irq_disable();
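+
smp_nmi_call_function() mirrors smp_call_function(), but delivers the cross-call as an NMI, so it reaches CPUs that are spinning with ordinary interrupts disabled; it also trylocks nmi_call_lock instead of spinning, which makes it usable from the die path. Its consumer in this patch is die_nmi() in the traps_32.c hunk below:

	/* From the traps_32.c hunk below: ask all other CPUs to print
	 * their registers and, with wait=1, block until they have. */
	smp_nmi_call_function(smp_show_regs, NULL, 1);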
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
+index 3e1cece..ddc677e 100644
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -918,6 +918,13 @@ do_rest:
+ 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ #endif
+ 
++
++#ifdef CONFIG_VE
++	/* Cosmetic: sleep_time won't be changed afterwards for the idle
++	 * thread; keep it 0 rather than -cycles. */
++	VE_TASK_INFO(c_idle.idle)->sleep_time = 0;
++#endif
++
+ 	/* start_ip had better be page-aligned! */
+ 	start_ip = setup_trampoline();
+ 
+diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
+index adff556..7f59fad 100644
+--- a/arch/x86/kernel/syscall_table_32.S
++++ b/arch/x86/kernel/syscall_table_32.S
+@@ -326,3 +326,22 @@ ENTRY(sys_call_table)
+ 	.long sys_fallocate
+ 	.long sys_timerfd_settime	/* 325 */
+ 	.long sys_timerfd_gettime
++	.rept 500-(.-sys_call_table)/4
++		.long sys_ni_syscall
++	.endr
++	.long sys_fairsched_mknod	/* 500 */
++	.long sys_fairsched_rmnod
++	.long sys_fairsched_chwt
++	.long sys_fairsched_mvpr
++	.long sys_fairsched_rate
++	.long sys_fairsched_vcpus	/* 505 */
++	.long sys_ni_syscall
++	.long sys_ni_syscall
++	.long sys_ni_syscall
++	.long sys_ni_syscall
++	.long sys_getluid		/* 510 */
++	.long sys_setluid
++	.long sys_setublimit
++	.long sys_ubstat
++	.long sys_ni_syscall
++	.long sys_ni_syscall
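+
As on sparc64 above, the table is padded with no-op entries up to slot 500 so that the OpenVZ entry points sit at the same fixed numbers on every architecture: the fairsched calls from 500 up, and getluid/setluid/setublimit/ubstat at 510-513. A hedged userspace sketch of invoking one of them (the __NR_getluid constant is illustrative; the patch itself only fills in the kernel tables):

	#include <unistd.h>
	#include <sys/syscall.h>

	#define __NR_getluid	510	/* per the tables above */

	int main(void)
	{
		/* returns the caller's login uid (beancounter id),
		 * or -1 with errno set on failure */
		long luid = syscall(__NR_getluid);
		return luid < 0;
	}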
+diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
+index 9bb2363..318aa46 100644
+--- a/arch/x86/kernel/tlb_32.c
++++ b/arch/x86/kernel/tlb_32.c
+@@ -204,6 +204,8 @@ void flush_tlb_mm(struct mm_struct *mm)
+ 	preempt_enable();
+ }
+ 
++EXPORT_SYMBOL(flush_tlb_mm);
++
+ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+ {
+ 	struct mm_struct *mm = vma->vm_mm;
+diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
+index a1f07d7..8fdac14 100644
+--- a/arch/x86/kernel/tlb_64.c
++++ b/arch/x86/kernel/tlb_64.c
+@@ -237,6 +237,8 @@ void flush_tlb_mm(struct mm_struct *mm)
+ 	preempt_enable();
+ }
+ 
++EXPORT_SYMBOL(flush_tlb_mm);
++
+ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+ {
+ 	struct mm_struct *mm = vma->vm_mm;
+diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
+index 08d752d..c78417e 100644
+--- a/arch/x86/kernel/traps_32.c
++++ b/arch/x86/kernel/traps_32.c
+@@ -222,6 +222,8 @@ print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+ {
+ 	printk(data);
+ 	print_symbol(msg, symbol);
++	if (decode_call_traces)
++		print_symbol("%s\n", symbol);
+ 	printk("\n");
+ }
+ 
+@@ -259,7 +261,10 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ 		   unsigned long *stack, unsigned long bp, char *log_lvl)
+ {
+ 	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+-	printk("%s =======================\n", log_lvl);
++	if (decode_call_traces)
++		printk("%s =======================\n", log_lvl);
++	else
++		printk("%s =<ctx>=", log_lvl);
+ }
+ 
+ void show_trace(struct task_struct *task, struct pt_regs *regs,
+@@ -290,9 +295,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ 			printk("\n%s       ", log_lvl);
+ 		printk("%08lx ", *stack++);
+ 	}
+-	printk("\n%sCall Trace:\n", log_lvl);
++	if (decode_call_traces)
++		printk("\n%s Call Trace:\n", log_lvl);
++	else
++		printk("\n%s Call Trace: ", log_lvl);
+ 
+ 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
++	if (!decode_call_traces)
++		printk("\n");
+ }
+ 
+ void show_stack(struct task_struct *task, unsigned long *sp)
+@@ -321,6 +331,8 @@ void dump_stack(void)
+ 		init_utsname()->version);
+ 
+ 	show_trace(current, NULL, &stack, bp);
++	if (!decode_call_traces)
++		printk("\n");
+ }
+ 
+ EXPORT_SYMBOL(dump_stack);
+@@ -332,8 +344,9 @@ void show_registers(struct pt_regs *regs)
+ 	print_modules();
+ 	__show_registers(regs, 0);
+ 
+-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
++	printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)",
+ 		TASK_COMM_LEN, current->comm, task_pid_nr(current),
++		VEID(current->ve_task_info.owner_env),
+ 		current_thread_info(), current, task_thread_info(current));
+ 	/*
+ 	 * When in-kernel, we also print out the stack and code at the
+@@ -754,6 +767,21 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+ 	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ }
+ 
++/*
++ * Voyager doesn't implement these
++ */
++void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info)
++{
++}
++
++#ifdef CONFIG_SMP
++int __attribute__((weak))
++smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
++{
++	return 0;
++}
++#endif
++
+ static DEFINE_SPINLOCK(nmi_print_lock);
+ 
+ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+@@ -771,6 +799,10 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+ 	printk(" on CPU%d, ip %08lx, registers:\n",
+ 		smp_processor_id(), regs->ip);
+ 	show_registers(regs);
++	smp_nmi_call_function(smp_show_regs, NULL, 1);
++	bust_spinlocks(1);
++	if (!decode_call_traces)
++		show_registers(regs);
+ 	console_silent();
+ 	spin_unlock(&nmi_print_lock);
+ 	bust_spinlocks(0);
+@@ -787,6 +819,13 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+ 	do_exit(SIGSEGV);
+ }
+ 
++static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
++{
++	return 0;
++}
++
++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback;
++
+ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+ {
+ 	unsigned char reason = 0;
+@@ -839,12 +878,24 @@ notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
+ 
+ 	++nmi_count(cpu);
+ 
+-	if (!ignore_nmis)
+-		default_do_nmi(regs);
++	if (!ignore_nmis) {
++		if (!nmi_ipi_callback(regs, cpu))
++			default_do_nmi(regs);
++	}
+ 
+ 	nmi_exit();
+ }
+ 
++void set_nmi_ipi_callback(nmi_callback_t callback)
++{
++	nmi_ipi_callback = callback;
++}
++
++void unset_nmi_ipi_callback(void)
++{
++	nmi_ipi_callback = dummy_nmi_callback;
++}
++
+ void stop_nmi(void)
+ {
+ 	acpi_nmi_disable();
+diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
+index adff76e..0bcb70c 100644
+--- a/arch/x86/kernel/traps_64.c
++++ b/arch/x86/kernel/traps_64.c
+@@ -112,6 +112,11 @@ void printk_address(unsigned long address, int reliable)
+ 	char namebuf[KSYM_NAME_LEN];
+ 	char reliab[4] = "";
+ 
++	if (!decode_call_traces) {
++		printk("[<%016lx>]", address);
++		return;
++	}
++
+ 	symname = kallsyms_lookup(address, &symsize, &offset,
+ 					&modname, namebuf);
+ 	if (!symname) {
+@@ -421,7 +426,7 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
+ 		if (((long) stack & (THREAD_SIZE-1)) == 0)
+ 			break;
+ 		}
+-		if (i && ((i % 4) == 0))
++		if (i && ((i % 4) == 0) && decode_call_traces)
+ 			printk("\n");
+ 		printk(" %016lx", *stack++);
+ 		touch_nmi_watchdog();
+@@ -469,10 +474,12 @@ void show_registers(struct pt_regs *regs)
+ 
+ 	sp = regs->sp;
+ 	ip = (u8 *) regs->ip - code_prologue;
+-	printk("CPU %d ", cpu);
++	printk("CPU: %d ", cpu);
+ 	__show_regs(regs);
+-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+-		cur->comm, cur->pid, task_thread_info(cur), cur);
++	printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n",
++		cur->comm, cur->pid,
++		VEID(VE_TASK_INFO(current)->owner_env),
++		task_thread_info(cur), cur);
+ 
+ 	/*
+ 	 * When in-kernel, we also print out the stack and code at the
+diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
+index 0577825..b7ce3ba 100644
+--- a/arch/x86/kernel/tsc_sync.c
++++ b/arch/x86/kernel/tsc_sync.c
+@@ -142,6 +142,10 @@ void __cpuinit check_tsc_sync_source(int cpu)
+ 		printk(" passed.\n");
+ 	}
+ 
++#ifdef CONFIG_VE
++	/* TSC reset. kill whatever might rely on old values */
++	VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ 	/*
+ 	 * Reset it - just in case we boot another CPU later:
+ 	 */
+diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
+index f6c05d0..03fb381 100644
+--- a/arch/x86/kernel/x8664_ksyms_64.c
++++ b/arch/x86/kernel/x8664_ksyms_64.c
+@@ -4,12 +4,14 @@
+ #include <linux/module.h>
+ #include <net/checksum.h>
+ #include <linux/smp.h>
++#include <linux/syscalls.h>
+ 
+ #include <asm/processor.h>
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+ #include <asm/desc.h>
+ 
++EXPORT_SYMBOL(kernel_execve);
+ EXPORT_SYMBOL(kernel_thread);
+ 
+ EXPORT_SYMBOL(__get_user_1);
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index 8bcb6f4..3eb1991 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -402,7 +402,8 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ 	printk(KERN_CONT " at %016lx\n", address);
+ #endif
+ 	printk(KERN_ALERT "IP:");
+-	printk_address(regs->ip, 1);
++	if (decode_call_traces)
++		printk_address(regs->ip, 1);
+ 	dump_pagetable(address);
+ }
+ 
+@@ -568,7 +569,7 @@ static int vmalloc_fault(unsigned long address)
+ #endif
+ }
+ 
+-int show_unhandled_signals = 1;
++int show_unhandled_signals = 0;
+ 
+ /*
+  * This routine handles page faults.  It determines the address,
+@@ -673,7 +674,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+ 	 */
+ 	if (user_mode_vm(regs))
+ 		error_code |= PF_USER;
+-again:
+ #endif
+ 	/* When running in the kernel we expect faults to occur only to
+ 	 * addresses in user space.  All other faults represent errors in the
+@@ -739,7 +739,6 @@ good_area:
+ 	}
+ 
+ #ifdef CONFIG_X86_32
+-survive:
+ #endif
+ 	/*
+ 	 * If for any reason at all we couldn't handle the fault,
+@@ -799,7 +798,7 @@ bad_area_nosemaphore:
+ 
+ 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ 		    printk_ratelimit()) {
+-			printk(
++			ve_printk(VE_LOG,
+ #ifdef CONFIG_X86_32
+ 			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+ #else
+@@ -877,19 +876,14 @@ no_context:
+  */
+ out_of_memory:
+ 	up_read(&mm->mmap_sem);
+-	if (is_global_init(tsk)) {
+-		yield();
+-#ifdef CONFIG_X86_32
+-		down_read(&mm->mmap_sem);
+-		goto survive;
+-#else
+-		goto again;
+-#endif
++	if (error_code & PF_USER) {
++		/*
++		 * A 0-order allocation always succeeds unless something
++		 * really fatal has happened: beancounter overdraft or OOM.
++		 */
++		force_sig(SIGKILL, tsk);
++		return;
+ 	}
+-
+-	printk("VM: killing process %s\n", tsk->comm);
+-	if (error_code & PF_USER)
+-		do_group_exit(SIGKILL);
+ 	goto no_context;
+ 
+ do_sigbus:
+diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
+index 0b3d567..768e246 100644
+--- a/arch/x86/mm/hugetlbpage.c
++++ b/arch/x86/mm/hugetlbpage.c
+@@ -12,6 +12,7 @@
+ #include <linux/slab.h>
+ #include <linux/err.h>
+ #include <linux/sysctl.h>
++#include <linux/module.h>
+ #include <asm/mman.h>
+ #include <asm/tlb.h>
+ #include <asm/tlbflush.h>
+@@ -207,6 +208,7 @@ int pmd_huge(pmd_t pmd)
+ {
+ 	return !!(pmd_val(pmd) & _PAGE_PSE);
+ }
++EXPORT_SYMBOL(pmd_huge);
+ 
+ struct page *
+ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
+index 819dad9..b07653e 100644
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -113,6 +113,7 @@ void show_mem(void)
+ 	printk(KERN_INFO "%lu pages shared\n",		shared);
+ 	printk(KERN_INFO "%lu pages swap cached\n",	cached);
+ }
++EXPORT_SYMBOL_GPL(show_mem);
+ 
+ int after_bootmem;
+ 
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 5015976..97ff257 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -13,9 +13,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ 	struct page *pte;
+ 
+ #ifdef CONFIG_HIGHPTE
+-	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
++	pte = alloc_pages(GFP_KERNEL_UBC|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ #else
+-	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++	pte = alloc_pages(GFP_KERNEL_UBC|__GFP_REPEAT|__GFP_ZERO, 0);
+ #endif
+ 	if (pte)
+ 		pgtable_page_ctor(pte);
+@@ -210,7 +210,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+ 
+ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
++	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC | __GFP_ZERO);
+ 
+ 	/* so that alloc_pmd can use it */
+ 	mm->pgd = pgd;
+diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
+index 369cf06..aab674a 100644
+--- a/arch/x86/mm/pgtable_32.c
++++ b/arch/x86/mm/pgtable_32.c
+@@ -66,6 +66,7 @@ void show_mem(void)
+ 	printk(KERN_INFO "%lu pages pagetables\n",
+ 					global_page_state(NR_PAGETABLE));
+ }
++EXPORT_SYMBOL_GPL(show_mem);
+ 
+ /*
+  * Associate a virtual page frame with a given physical page frame 
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index cf058fe..4579afd 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -17,6 +17,8 @@
+ #include <linux/err.h>
+ #include <linux/module.h>
+ 
++#include <bc/vmpages.h>
++
+ #include <asm/cpufeature.h>
+ #include <asm/msr.h>
+ #include <asm/pgtable.h>
+@@ -37,6 +39,8 @@ enum {
+ #else
+ #define VDSO_DEFAULT	VDSO_ENABLED
+ #endif
++#undef VDSO_DEFAULT
++#define VDSO_DEFAULT VDSO_DISABLED
+ 
+ #ifdef CONFIG_X86_64
+ #define vdso_enabled			sysctl_vsyscall32
+@@ -199,7 +203,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+  */
+ extern const char vdso32_default_start, vdso32_default_end;
+ extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+-static struct page *vdso32_pages[1];
++struct page *vdso32_pages[1];
++EXPORT_SYMBOL_GPL(vdso32_pages);
+ 
+ #ifdef CONFIG_X86_64
+ 
+@@ -319,16 +324,30 @@ int __init sysenter_setup(void)
+ 	return 0;
+ }
+ 
++EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
++EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
++
+ /* Setup a VMA at program startup for the vsyscall page */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
++				unsigned long map_address)
+ {
+ 	struct mm_struct *mm = current->mm;
+-	unsigned long addr;
++	unsigned long addr = map_address;
+ 	int ret = 0;
+ 	bool compat;
++	unsigned long flags;
+ 
+-	if (vdso_enabled == VDSO_DISABLED)
++	if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
++		current->mm->context.vdso = NULL;
+ 		return 0;
++	}
++
++	flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
++		mm->def_flags;
++
++	ret = -ENOMEM;
++	if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
++		goto err_charge;
+ 
+ 	down_write(&mm->mmap_sem);
+ 
+@@ -338,17 +357,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+ 
+ 	map_compat_vdso(compat);
+ 
+-	if (compat)
+-		addr = VDSO_HIGH_BASE;
+-	else {
+-		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
++	if (!compat || map_address) {
++		addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
+ 		if (IS_ERR_VALUE(addr)) {
+ 			ret = addr;
+ 			goto up_fail;
+ 		}
+-	}
++	} else
++		addr = VDSO_HIGH_BASE;
+ 
+-	if (compat_uses_vma || !compat) {
++	if (compat_uses_vma || !compat || map_address) {
+ 		/*
+ 		 * MAYWRITE to allow gdb to COW and set breakpoints
+ 		 *
+@@ -374,9 +392,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+ 
+   up_fail:
+ 	up_write(&mm->mmap_sem);
++	if (ret < 0)
++		ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
++err_charge:
+ 
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+ 
+ #ifdef CONFIG_X86_64
+ 
+diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
+index 3fdd514..785d7fd 100644
+--- a/arch/x86/vdso/vma.c
++++ b/arch/x86/vdso/vma.c
+@@ -4,6 +4,7 @@
+  * Subject to the GPL, v.2
+  */
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/err.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+@@ -16,7 +17,7 @@
+ #include "vextern.h"		/* Just for VMAGIC.  */
+ #undef VEXTERN
+ 
+-int vdso_enabled = 1;
++unsigned int vdso_enabled = 1;
+ 
+ extern char vdso_start[], vdso_end[];
+ extern unsigned short vdso_sync_cpuid;
+@@ -96,18 +97,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
+ 
+ /* Setup a VMA at program startup for the vsyscall page.
+    Not called for compat tasks */
+-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
++int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
++				unsigned long map_address)
+ {
+ 	struct mm_struct *mm = current->mm;
+ 	unsigned long addr;
+ 	int ret;
+ 	unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
+ 
+-	if (!vdso_enabled)
++	if (!vdso_enabled && map_address == 0) {
++		current->mm->context.vdso = NULL;
+ 		return 0;
++	}
+ 
+ 	down_write(&mm->mmap_sem);
+-	addr = vdso_addr(mm->start_stack, len);
++	if (map_address)
++		addr = map_address;
++	else
++		addr = vdso_addr(mm->start_stack, len);
+ 	addr = get_unmapped_area(NULL, addr, len, 0, 0);
+ 	if (IS_ERR_VALUE(addr)) {
+ 		ret = addr;
+@@ -127,6 +134,7 @@ up_fail:
+ 	up_write(&mm->mmap_sem);
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
+ 
+ static __init int vdso_setup(char *s)
+ {
+diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
+index d01b411..497b791 100644
+--- a/block/cfq-iosched.c
++++ b/block/cfq-iosched.c
+@@ -11,6 +11,11 @@
+ #include <linux/elevator.h>
+ #include <linux/rbtree.h>
+ #include <linux/ioprio.h>
++#include <linux/cfq-iosched.h>
++#include <bc/beancounter.h>
++#include <bc/io_prio.h>
++#include <bc/io_acct.h>
++#include <bc/hash.h>
+ 
+ /*
+  * tunables
+@@ -26,6 +31,7 @@ static const int cfq_slice_sync = HZ / 10;
+ static int cfq_slice_async = HZ / 25;
+ static const int cfq_slice_async_rq = 2;
+ static int cfq_slice_idle = HZ / 125;
++static int cfq_ub_slice = HZ / 2;
+ 
+ /*
+  * offset from end of service tree
+@@ -43,13 +49,11 @@ static int cfq_slice_idle = HZ / 125;
+ 	((struct cfq_io_context *) (rq)->elevator_private)
+ #define RQ_CFQQ(rq)		((rq)->elevator_private2)
+ 
+-static struct kmem_cache *cfq_pool;
+ static struct kmem_cache *cfq_ioc_pool;
+ 
+ static DEFINE_PER_CPU(unsigned long, ioc_count);
+ static struct completion *ioc_gone;
+ 
+-#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
+ #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+ #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
+ 
+@@ -58,105 +62,6 @@ static struct completion *ioc_gone;
+ 
+ #define sample_valid(samples)	((samples) > 80)
+ 
+-/*
+- * Most of our rbtree usage is for sorting with min extraction, so
+- * if we cache the leftmost node we don't have to walk down the tree
+- * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
+- * move this into the elevator for the rq sorting as well.
+- */
+-struct cfq_rb_root {
+-	struct rb_root rb;
+-	struct rb_node *left;
+-};
+-#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
+-
+-/*
+- * Per block device queue structure
+- */
+-struct cfq_data {
+-	struct request_queue *queue;
+-
+-	/*
+-	 * rr list of queues with requests and the count of them
+-	 */
+-	struct cfq_rb_root service_tree;
+-	unsigned int busy_queues;
+-
+-	int rq_in_driver;
+-	int sync_flight;
+-	int hw_tag;
+-
+-	/*
+-	 * idle window management
+-	 */
+-	struct timer_list idle_slice_timer;
+-	struct work_struct unplug_work;
+-
+-	struct cfq_queue *active_queue;
+-	struct cfq_io_context *active_cic;
+-
+-	/*
+-	 * async queue for each priority case
+-	 */
+-	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+-	struct cfq_queue *async_idle_cfqq;
+-
+-	sector_t last_position;
+-	unsigned long last_end_request;
+-
+-	/*
+-	 * tunables, see top of file
+-	 */
+-	unsigned int cfq_quantum;
+-	unsigned int cfq_fifo_expire[2];
+-	unsigned int cfq_back_penalty;
+-	unsigned int cfq_back_max;
+-	unsigned int cfq_slice[2];
+-	unsigned int cfq_slice_async_rq;
+-	unsigned int cfq_slice_idle;
+-
+-	struct list_head cic_list;
+-};
+-
+-/*
+- * Per process-grouping structure
+- */
+-struct cfq_queue {
+-	/* reference count */
+-	atomic_t ref;
+-	/* various state flags, see below */
+-	unsigned int flags;
+-	/* parent cfq_data */
+-	struct cfq_data *cfqd;
+-	/* service_tree member */
+-	struct rb_node rb_node;
+-	/* service_tree key */
+-	unsigned long rb_key;
+-	/* sorted list of pending requests */
+-	struct rb_root sort_list;
+-	/* if fifo isn't expired, next request to serve */
+-	struct request *next_rq;
+-	/* requests queued in sort_list */
+-	int queued[2];
+-	/* currently allocated requests */
+-	int allocated[2];
+-	/* fifo list of requests in sort_list */
+-	struct list_head fifo;
+-
+-	unsigned long slice_end;
+-	long slice_resid;
+-
+-	/* pending metadata requests */
+-	int meta_pending;
+-	/* number of requests that are on the dispatch list or inside driver */
+-	int dispatched;
+-
+-	/* io prio of this group */
+-	unsigned short ioprio, org_ioprio;
+-	unsigned short ioprio_class, org_ioprio_class;
+-
+-};
+-
+ enum cfqq_state_flags {
+ 	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
+ 	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
+@@ -201,6 +106,67 @@ CFQ_CFQQ_FNS(sync);
+ static void cfq_dispatch_insert(struct request_queue *, struct request *);
+ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
+ 				       struct io_context *, gfp_t);
++static void cfq_put_queue(struct cfq_queue *cfqq);
++
++static void __cfq_put_async_queues(struct cfq_bc_data *cfq_bc)
++{
++	int i;
++
++	for (i = 0; i < CFQ_PRIO_LISTS; i++) {
++		if (cfq_bc->async_cfqq[0][i]) {
++			cfq_put_queue(cfq_bc->async_cfqq[0][i]);
++			cfq_bc->async_cfqq[0][i] = NULL;
++		}
++		if (cfq_bc->async_cfqq[1][i]) {
++			cfq_put_queue(cfq_bc->async_cfqq[1][i]);
++			cfq_bc->async_cfqq[1][i] = NULL;
++		}
++	}
++	if (cfq_bc->async_idle_cfqq) {
++		cfq_put_queue(cfq_bc->async_idle_cfqq);
++		cfq_bc->async_idle_cfqq = NULL;
++	}
++}
++
++#ifdef CONFIG_BC_IO_SCHED
++static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync)
++{
++	int mode;
++
++	mode = sync ? cfqd->virt_mode : cfqd->write_virt_mode;
++	return mode ? &get_io_ub()->iopriv : &get_ub0()->iopriv;
++}
++
++static inline void cfq_put_async_queues(struct cfq_data *cfqd)
++{
++	struct user_beancounter *ub;
++	struct cfq_bc_data *cfq_bc;
++
++	rcu_read_lock();
++	for_each_beancounter(ub) {
++		write_lock(&ub->iopriv.cfq_bc_list_lock);
++		cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd);
++		if (!cfq_bc) {
++			write_unlock(&ub->iopriv.cfq_bc_list_lock);
++			continue;
++		}
++		__cfq_put_async_queues(cfq_bc);
++		write_unlock(&ub->iopriv.cfq_bc_list_lock);
++	}
++	rcu_read_unlock();
++}
++#else
++static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync)
++{
++	return NULL;
++}
++
++static inline void cfq_put_async_queues(struct cfq_data *cfqd)
++{
++	__cfq_put_async_queues(&cfqd->cfq_bc);
++}
++#endif
++
+ static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
+ 						struct io_context *);
+ 
+@@ -287,6 +253,11 @@ static inline int cfq_slice_used(struct cfq_queue *cfqq)
+ 	return 1;
+ }
+ 
++static inline struct user_beancounter *ub_by_iopriv(struct ub_iopriv *iopriv)
++{
++	return container_of(iopriv, struct user_beancounter, iopriv);
++}
++
+ /*
+  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+  * We choose the request that is closest to the head right now. Distance
+@@ -450,6 +421,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
+ static void cfq_service_tree_add(struct cfq_data *cfqd,
+ 				    struct cfq_queue *cfqq, int add_front)
+ {
++	struct cfq_bc_data *cfq_bc = cfqq->cfq_bc;
+ 	struct rb_node **p, *parent;
+ 	struct cfq_queue *__cfqq;
+ 	unsigned long rb_key;
+@@ -457,7 +429,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
+ 
+ 	if (cfq_class_idle(cfqq)) {
+ 		rb_key = CFQ_IDLE_DELAY;
+-		parent = rb_last(&cfqd->service_tree.rb);
++		parent = rb_last(&cfq_bc->service_tree.rb);
+ 		if (parent && parent != &cfqq->rb_node) {
+ 			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
+ 			rb_key += __cfqq->rb_key;
+@@ -477,12 +449,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
+ 		if (rb_key == cfqq->rb_key)
+ 			return;
+ 
+-		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++		cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree);
+ 	}
+ 
+ 	left = 1;
+ 	parent = NULL;
+-	p = &cfqd->service_tree.rb.rb_node;
++	p = &cfq_bc->service_tree.rb.rb_node;
+ 	while (*p) {
+ 		struct rb_node **n;
+ 
+@@ -514,11 +486,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd,
+ 	}
+ 
+ 	if (left)
+-		cfqd->service_tree.left = &cfqq->rb_node;
++		cfq_bc->service_tree.left = &cfqq->rb_node;
+ 
+ 	cfqq->rb_key = rb_key;
+ 	rb_link_node(&cfqq->rb_node, parent, p);
+-	rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
++	rb_insert_color(&cfqq->rb_node, &cfq_bc->service_tree.rb);
+ }
+ 
+ /*
+@@ -542,6 +514,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ 	BUG_ON(cfq_cfqq_on_rr(cfqq));
+ 	cfq_mark_cfqq_on_rr(cfqq);
+ 	cfqd->busy_queues++;
++	bc_inc_rqnum(cfqq);
+ 
+ 	cfq_resort_rr_list(cfqd, cfqq);
+ }
+@@ -552,14 +525,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+  */
+ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
++	struct cfq_bc_data *cfq_bc;
++
+ 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
+ 	cfq_clear_cfqq_on_rr(cfqq);
+ 
++	cfq_bc = cfqq->cfq_bc;
++
+ 	if (!RB_EMPTY_NODE(&cfqq->rb_node))
+-		cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
++		cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree);
+ 
+ 	BUG_ON(!cfqd->busy_queues);
+ 	cfqd->busy_queues--;
++	bc_dec_rqnum(cfqq);
+ }
+ 
+ /*
+@@ -675,8 +653,7 @@ static void cfq_remove_request(struct request *rq)
+ 	}
+ }
+ 
+-static int cfq_merge(struct request_queue *q, struct request **req,
+-		     struct bio *bio)
++static int cfq_merge(struct request_queue *q, struct request **req, struct bio *bio)
+ {
+ 	struct cfq_data *cfqd = q->elevator->elevator_data;
+ 	struct request *__rq;
+@@ -800,10 +777,16 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out)
+  */
+ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+ {
+-	if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
++	struct cfq_bc_data *cfq_bc;
++
++	cfq_bc = cfqd->active_cfq_bc;
++	if (!cfq_bc)
+ 		return NULL;
+ 
+-	return cfq_rb_first(&cfqd->service_tree);
++	if (RB_EMPTY_ROOT(&cfq_bc->service_tree.rb))
++		return NULL;
++
++	return cfq_rb_first(&cfq_bc->service_tree);
+ }
+ 
+ /*
+@@ -811,9 +794,17 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
+  */
+ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
+ {
+-	struct cfq_queue *cfqq;
++	struct cfq_queue *cfqq = NULL;
++	struct cfq_bc_data *cfq_bc;
++
++	bc_schedule_active(cfqd);
++
++	cfq_bc = cfqd->active_cfq_bc;
++	if (!cfq_bc)
++		goto out;
+ 
+ 	cfqq = cfq_get_next_queue(cfqd);
++out:
+ 	__cfq_set_active_queue(cfqd, cfqq);
+ 	return cfqq;
+ }
+@@ -904,6 +895,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
+ 
+ 	cfq_remove_request(rq);
+ 	cfqq->dispatched++;
++	cfqq->cfq_bc->on_dispatch++;
+ 	elv_dispatch_sort(q, rq);
+ 
+ 	if (cfq_cfqq_sync(cfqq))
+@@ -961,7 +953,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
+ 	/*
+ 	 * The active queue has run out of time, expire it and select new.
+ 	 */
+-	if (cfq_slice_used(cfqq))
++	if (cfq_slice_used(cfqq) || bc_expired(cfqd))
+ 		goto expire;
+ 
+ 	/*
+@@ -1060,14 +1052,33 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
+  * Drain our current requests. Used for barriers and when switching
+  * io schedulers on-the-fly.
+  */
+-static int cfq_forced_dispatch(struct cfq_data *cfqd)
++static int __cfq_forced_dispatch(struct cfq_bc_data *cfq_bc)
+ {
+ 	struct cfq_queue *cfqq;
+ 	int dispatched = 0;
+ 
+-	while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
++	while ((cfqq = cfq_rb_first(&cfq_bc->service_tree)) != NULL)
+ 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+ 
++	return dispatched;
++}
++
++static int cfq_forced_dispatch(struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++	struct cfq_bc_data *cfq_bc_tmp;
++	int dispatched;
++
++	dispatched = 0;
++	/*
++	 * Iterate with the _safe variant because
++	 * __cfq_forced_dispatch() implicitly list_del()s entries
++	 */
++	list_for_each_entry_safe(cfq_bc, cfq_bc_tmp,
++		&cfqd->act_cfq_bc_head, act_cfq_bc_list) {
++		dispatched += __cfq_forced_dispatch(cfq_bc);
++	}
++
+ 	cfq_slice_expired(cfqd, 0);
+ 
+ 	BUG_ON(cfqd->busy_queues);
+@@ -1243,6 +1254,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
+ 	if (ioc->ioc_data == cic)
+ 		rcu_assign_pointer(ioc->ioc_data, NULL);
+ 
++	/*
++	 * cic->cfqq[ASYNC] is always NULL; async queues are put when
++	 * the owning bc dies or the device is unplugged
++	 */
+ 	if (cic->cfqq[ASYNC]) {
+ 		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
+ 		cic->cfqq[ASYNC] = NULL;
+@@ -1351,6 +1366,10 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
+ 
+ 	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+ 
++	/*
++	 * cic->cfqq[ASYNC] is always NULL; the ioprio change
++	 * for async queues happens automatically
++	 */
+ 	cfqq = cic->cfqq[ASYNC];
+ 	if (cfqq) {
+ 		struct cfq_queue *new_cfqq;
+@@ -1380,8 +1399,11 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync,
+ {
+ 	struct cfq_queue *cfqq, *new_cfqq = NULL;
+ 	struct cfq_io_context *cic;
++	struct ub_iopriv *iopriv;
++	struct cfq_bc_data *cfq_bc = NULL;
+ 
+ retry:
++	iopriv = cfqq_ub_iopriv(cfqd, is_sync);
+ 	cic = cfq_cic_lookup(cfqd, ioc);
+ 	/* cic always exists here */
+ 	cfqq = cic_to_cfqq(cic, is_sync);
+@@ -1399,18 +1421,32 @@ retry:
+ 			 */
+ 			spin_unlock_irq(cfqd->queue->queue_lock);
+ 			new_cfqq = kmem_cache_alloc_node(cfq_pool,
+-					gfp_mask | __GFP_NOFAIL | __GFP_ZERO,
++					gfp_mask|__GFP_NOFAIL|__GFP_ZERO,
+ 					cfqd->queue->node);
++			if (new_cfqq) {
++				cfq_bc = bc_findcreate_cfq_bc(iopriv,
++							cfqd, gfp_mask);
++				if (!cfq_bc) {
++					kmem_cache_free(cfq_pool, new_cfqq);
++					new_cfqq = NULL;
++				}
++			}
+ 			spin_lock_irq(cfqd->queue->queue_lock);
+ 			goto retry;
+ 		} else {
+ 			cfqq = kmem_cache_alloc_node(cfq_pool,
+-					gfp_mask | __GFP_ZERO,
+-					cfqd->queue->node);
++					gfp_mask|__GFP_ZERO, cfqd->queue->node);
+ 			if (!cfqq)
+ 				goto out;
++			cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask);
++			if (!cfq_bc) {
++				kmem_cache_free(cfq_pool, cfqq);
++				cfqq = NULL;
++				goto out;
++			}
+ 		}
+ 
++		cfqq->cfq_bc = cfq_bc;
+ 		RB_CLEAR_NODE(&cfqq->rb_node);
+ 		INIT_LIST_HEAD(&cfqq->fifo);
+ 
+@@ -1438,15 +1474,15 @@ out:
+ }
+ 
+ static struct cfq_queue **
+-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
++cfq_async_queue_prio(struct cfq_bc_data *cfq_bc, int ioprio_class, int ioprio)
+ {
+ 	switch (ioprio_class) {
+ 	case IOPRIO_CLASS_RT:
+-		return &cfqd->async_cfqq[0][ioprio];
++		return &cfq_bc->async_cfqq[0][ioprio];
+ 	case IOPRIO_CLASS_BE:
+-		return &cfqd->async_cfqq[1][ioprio];
++		return &cfq_bc->async_cfqq[1][ioprio];
+ 	case IOPRIO_CLASS_IDLE:
+-		return &cfqd->async_idle_cfqq;
++		return &cfq_bc->async_idle_cfqq;
+ 	default:
+ 		BUG();
+ 	}
+@@ -1460,9 +1496,16 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc,
+ 	const int ioprio_class = task_ioprio_class(ioc);
+ 	struct cfq_queue **async_cfqq = NULL;
+ 	struct cfq_queue *cfqq = NULL;
++	struct cfq_bc_data *cfq_bc;
++	struct ub_iopriv *iopriv;
++
++	iopriv = cfqq_ub_iopriv(cfqd, is_sync);
+ 
+ 	if (!is_sync) {
+-		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
++		cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask);
++		if (!cfq_bc)
++			return NULL;
++		async_cfqq = cfq_async_queue_prio(cfq_bc, ioprio_class, ioprio);
+ 		cfqq = *async_cfqq;
+ 	}
+ 
+@@ -1840,6 +1883,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ 	WARN_ON(!cfqq->dispatched);
+ 	cfqd->rq_in_driver--;
+ 	cfqq->dispatched--;
++	cfqq->cfq_bc->on_dispatch--;
+ 
+ 	if (cfq_cfqq_sync(cfqq))
+ 		cfqd->sync_flight--;
+@@ -1952,6 +1996,7 @@ static void cfq_put_request(struct request *rq)
+ 		rq->elevator_private = NULL;
+ 		rq->elevator_private2 = NULL;
+ 
++		put_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv));
+ 		cfq_put_queue(cfqq);
+ 	}
+ }
+@@ -1968,14 +2013,19 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+ 	const int is_sync = rq_is_sync(rq);
+ 	struct cfq_queue *cfqq;
+ 	unsigned long flags;
++	struct ub_iopriv *iopriv;
++	struct cfq_bc_data *cfq_bc = NULL;
+ 
+ 	might_sleep_if(gfp_mask & __GFP_WAIT);
+ 
+ 	cic = cfq_get_io_context(cfqd, gfp_mask);
++	iopriv = cfqq_ub_iopriv(cfqd, is_sync);
++	if (!is_sync)
++		cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask);
+ 
+ 	spin_lock_irqsave(q->queue_lock, flags);
+ 
+-	if (!cic)
++	if (!cic || (!is_sync && cfq_bc == NULL))
+ 		goto queue_fail;
+ 
+ 	cfqq = cic_to_cfqq(cic, is_sync);
+@@ -1996,6 +2046,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+ 
+ 	rq->elevator_private = cic;
+ 	rq->elevator_private2 = cfqq;
++	get_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv));
+ 	return 0;
+ 
+ queue_fail:
+@@ -2070,21 +2121,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
+ 	kblockd_flush_work(&cfqd->unplug_work);
+ }
+ 
+-static void cfq_put_async_queues(struct cfq_data *cfqd)
+-{
+-	int i;
+-
+-	for (i = 0; i < IOPRIO_BE_NR; i++) {
+-		if (cfqd->async_cfqq[0][i])
+-			cfq_put_queue(cfqd->async_cfqq[0][i]);
+-		if (cfqd->async_cfqq[1][i])
+-			cfq_put_queue(cfqd->async_cfqq[1][i]);
+-	}
+-
+-	if (cfqd->async_idle_cfqq)
+-		cfq_put_queue(cfqd->async_idle_cfqq);
+-}
+-
+ static void cfq_exit_queue(elevator_t *e)
+ {
+ 	struct cfq_data *cfqd = e->elevator_data;
+@@ -2111,6 +2147,8 @@ static void cfq_exit_queue(elevator_t *e)
+ 
+ 	cfq_shutdown_timer_wq(cfqd);
+ 
++	bc_cfq_exit_queue(cfqd);
++
+ 	kfree(cfqd);
+ }
+ 
+@@ -2118,11 +2156,19 @@ static void *cfq_init_queue(struct request_queue *q)
+ {
+ 	struct cfq_data *cfqd;
+ 
+-	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
++	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL|__GFP_ZERO, q->node);
+ 	if (!cfqd)
+ 		return NULL;
+ 
+-	cfqd->service_tree = CFQ_RB_ROOT;
++	INIT_LIST_HEAD(&cfqd->act_cfq_bc_head);
++#ifndef CONFIG_BC_IO_SCHED
++	cfq_init_cfq_bc(&cfqd->cfq_bc);
++	/*
++	 * Add ub0 to the active list so that the forced-dispatch case is
++	 * served uniformly. Note that nobody removes ub0 from this list.
++	 */
++	list_add_tail(&cfqd->cfq_bc.act_cfq_bc_list, &cfqd->act_cfq_bc_head);
++#endif
+ 	INIT_LIST_HEAD(&cfqd->cic_list);
+ 
+ 	cfqd->queue = q;
+@@ -2143,6 +2189,9 @@ static void *cfq_init_queue(struct request_queue *q)
+ 	cfqd->cfq_slice[1] = cfq_slice_sync;
+ 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
+ 	cfqd->cfq_slice_idle = cfq_slice_idle;
++	cfqd->cfq_ub_slice = cfq_ub_slice;
++	cfqd->virt_mode = 1;
++	cfqd->write_virt_mode = 1;
+ 
+ 	return cfqd;
+ }
+@@ -2211,6 +2260,9 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
+ SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
+ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
++SHOW_FUNCTION(cfq_ub_slice_show, cfqd->cfq_ub_slice, 1);
++SHOW_FUNCTION(cfq_virt_mode_show, cfqd->virt_mode, 0);
++SHOW_FUNCTION(cfq_write_virt_mode_show, cfqd->write_virt_mode, 0);
+ #undef SHOW_FUNCTION
+ 
+ #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+@@ -2242,6 +2294,9 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
+ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
+ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
+ 		UINT_MAX, 0);
++STORE_FUNCTION(cfq_ub_slice_store, &cfqd->cfq_ub_slice, 1, UINT_MAX, 1);
++STORE_FUNCTION(cfq_virt_mode_store, &cfqd->virt_mode, 0, 1, 0);
++STORE_FUNCTION(cfq_write_virt_mode_store, &cfqd->write_virt_mode, 0, 1, 0);
+ #undef STORE_FUNCTION
+ 
+ #define CFQ_ATTR(name) \
+@@ -2257,6 +2312,9 @@ static struct elv_fs_entry cfq_attrs[] = {
+ 	CFQ_ATTR(slice_async),
+ 	CFQ_ATTR(slice_async_rq),
+ 	CFQ_ATTR(slice_idle),
++	CFQ_ATTR(ub_slice),
++	CFQ_ATTR(virt_mode),
++	CFQ_ATTR(write_virt_mode),
+ 	__ATTR_NULL
+ };
+ 
+@@ -2280,6 +2338,7 @@ static struct elevator_type iosched_cfq = {
+ 		.elevator_init_fn =		cfq_init_queue,
+ 		.elevator_exit_fn =		cfq_exit_queue,
+ 		.trim =				cfq_free_io_context,
++		.put_queue =			cfq_put_queue,
+ 	},
+ 	.elevator_attrs =	cfq_attrs,
+ 	.elevator_name =	"cfq",
+diff --git a/block/elevator.c b/block/elevator.c
+index 902dd13..7241736 100644
+--- a/block/elevator.c
++++ b/block/elevator.c
+@@ -40,6 +40,9 @@
+ static DEFINE_SPINLOCK(elv_list_lock);
+ static LIST_HEAD(elv_list);
+ 
++struct kmem_cache *cfq_pool;
++EXPORT_SYMBOL_GPL(cfq_pool);
++
+ /*
+  * Merge hash stuff.
+  */
+@@ -1028,12 +1031,12 @@ void elv_unregister(struct elevator_type *e)
+ 	 */
+ 	if (e->ops.trim) {
+ 		read_lock(&tasklist_lock);
+-		do_each_thread(g, p) {
++		do_each_thread_all(g, p) {
+ 			task_lock(p);
+ 			if (p->io_context)
+ 				e->ops.trim(p->io_context);
+ 			task_unlock(p);
+-		} while_each_thread(g, p);
++		} while_each_thread_all(g, p);
+ 		read_unlock(&tasklist_lock);
+ 	}
+ 
+diff --git a/block/genhd.c b/block/genhd.c
+index b922d48..901cf04 100644
+--- a/block/genhd.c
++++ b/block/genhd.c
+@@ -513,6 +513,7 @@ static void disk_release(struct device *dev)
+ struct class block_class = {
+ 	.name		= "block",
+ };
++EXPORT_SYMBOL(block_class);
+ 
+ static struct device_type disk_type = {
+ 	.name		= "disk",
+diff --git a/drivers/base/class.c b/drivers/base/class.c
+index e085af0..b7fbb22 100644
+--- a/drivers/base/class.c
++++ b/drivers/base/class.c
+@@ -18,6 +18,8 @@
+ #include <linux/err.h>
+ #include <linux/slab.h>
+ #include <linux/genhd.h>
++#include <linux/sched.h>
++#include <linux/ve.h>
+ #include "base.h"
+ 
+ #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)
+@@ -71,8 +73,14 @@ static struct kobj_type class_ktype = {
+ };
+ 
+ /* Hotplug events for classes go to the class_obj subsys */
+-static struct kset *class_kset;
++struct kset *class_kset;
++EXPORT_SYMBOL_GPL(class_kset);
+ 
++#ifndef CONFIG_VE
++#define visible_class_kset class_kset
++#else
++#define visible_class_kset (get_exec_env()->class_kset)
++#endif
+ 
+ int class_create_file(struct class *cls, const struct class_attribute *attr)
+ {
+@@ -151,9 +159,9 @@ int class_register(struct class *cls)
+ #if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
+ 	/* let the block class directory show up in the root of sysfs */
+ 	if (cls != &block_class)
+-		cls->subsys.kobj.kset = class_kset;
++		cls->subsys.kobj.kset = visible_class_kset;
+ #else
+-	cls->subsys.kobj.kset = class_kset;
++	cls->subsys.kobj.kset = visible_class_kset;
+ #endif
+ 	cls->subsys.kobj.ktype = &class_ktype;
+ 
+@@ -379,13 +387,20 @@ void class_interface_unregister(struct class_interface *class_intf)
+ 	class_put(parent);
+ }
+ 
+-int __init classes_init(void)
++int classes_init(void)
+ {
+-	class_kset = kset_create_and_add("class", NULL, NULL);
+-	if (!class_kset)
++	visible_class_kset = kset_create_and_add("class", NULL, NULL);
++	if (!visible_class_kset)
+ 		return -ENOMEM;
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(classes_init);
++
++void classes_fini(void)
++{
++	kset_unregister(visible_class_kset);
++}
++EXPORT_SYMBOL_GPL(classes_fini);
+ 
+ EXPORT_SYMBOL_GPL(class_create_file);
+ EXPORT_SYMBOL_GPL(class_remove_file);
+diff --git a/drivers/base/core.c b/drivers/base/core.c
+index ee0a51a..660ecc0 100644
+--- a/drivers/base/core.c
++++ b/drivers/base/core.c
+@@ -21,6 +21,8 @@
+ #include <linux/genhd.h>
+ #include <linux/kallsyms.h>
+ #include <linux/semaphore.h>
++#include <linux/sched.h>
++#include <linux/ve.h>
+ 
+ #include "base.h"
+ #include "power/power.h"
+@@ -417,9 +419,13 @@ static ssize_t show_dev(struct device *dev, struct device_attribute *attr,
+ static struct device_attribute devt_attr =
+ 	__ATTR(dev, S_IRUGO, show_dev, NULL);
+ 
+-/* kset to create /sys/devices/  */
+ struct kset *devices_kset;
+ 
++/* kset to create /sys/devices/  */
++#ifdef CONFIG_VE
++#define ve_devices_kset	(get_exec_env()->devices_kset)
++#endif
++
+ /**
+  * device_create_file - create sysfs attribute file for device.
+  * @dev: device.
+@@ -529,7 +535,7 @@ static void klist_children_put(struct klist_node *n)
+  */
+ void device_initialize(struct device *dev)
+ {
+-	dev->kobj.kset = devices_kset;
++	dev->kobj.kset = ve_devices_kset;
+ 	kobject_init(&dev->kobj, &device_ktype);
+ 	klist_init(&dev->klist_children, klist_children_get,
+ 		   klist_children_put);
+@@ -566,7 +572,7 @@ static struct kobject *virtual_device_parent(struct device *dev)
+ 
+ 	if (!virtual_dir)
+ 		virtual_dir = kobject_create_and_add("virtual",
+-						     &devices_kset->kobj);
++						     &ve_devices_kset->kobj);
+ 
+ 	return virtual_dir;
+ }
+@@ -1069,13 +1075,23 @@ struct device *device_find_child(struct device *parent, void *data,
+ 	return child;
+ }
+ 
+-int __init devices_init(void)
++int devices_init(void)
+ {
+-	devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
+-	if (!devices_kset)
++	ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
++	if (!ve_devices_kset)
+ 		return -ENOMEM;
++	if (ve_is_super(get_exec_env()))
++		devices_kset = ve_devices_kset;
++
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(devices_init);
++
++void devices_fini(void)
++{
++	kset_unregister(devices_kset);
++}
++EXPORT_SYMBOL_GPL(devices_fini);
+ 
+ EXPORT_SYMBOL_GPL(device_for_each_child);
+ EXPORT_SYMBOL_GPL(device_find_child);
+diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
+index d9a0a53..ab32c26 100644
+--- a/drivers/char/keyboard.c
++++ b/drivers/char/keyboard.c
+@@ -160,6 +160,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 1] =
+ static int sysrq_down;
+ static int sysrq_alt_use;
+ #endif
++int sysrq_key_scancode = KEY_SYSRQ;
+ static int sysrq_alt;
+ 
+ /*
+@@ -1065,6 +1066,9 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
+ {
+ 	int code;
+ 
++	if (keycode == sysrq_key_scancode && sysrq_alt)
++		goto sysrq;
++
+ 	switch (keycode) {
+ 		case KEY_PAUSE:
+ 			put_queue(vc, 0xe1);
+@@ -1083,6 +1087,7 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
+ 			break;
+ 
+ 		case KEY_SYSRQ:
++sysrq:
+ 			/*
+ 			 * Real AT keyboards (that's what we're trying
+ 			 * to emulate here emit 0xe0 0x2a 0xe0 0x37 when
+@@ -1179,7 +1184,8 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
+ 				printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode);
+ 
+ #ifdef CONFIG_MAGIC_SYSRQ	       /* Handle the SysRq Hack */
+-	if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) {
++	if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) &&
++				(sysrq_down || (down == 1 && sysrq_alt))) {
+ 		if (!sysrq_down) {
+ 			sysrq_down = down;
+ 			sysrq_alt_use = sysrq_alt;
+diff --git a/drivers/char/pty.c b/drivers/char/pty.c
+index 0a05c03..9c0ccce 100644
+--- a/drivers/char/pty.c
++++ b/drivers/char/pty.c
+@@ -29,16 +29,22 @@
+ #include <linux/bitops.h>
+ #include <linux/devpts_fs.h>
+ 
++#include <bc/misc.h>
++
+ /* These are global because they are accessed in tty_io.c */
+ #ifdef CONFIG_UNIX98_PTYS
+ struct tty_driver *ptm_driver;
+-static struct tty_driver *pts_driver;
++struct tty_driver *pts_driver;
++EXPORT_SYMBOL(ptm_driver);
++EXPORT_SYMBOL(pts_driver);
+ #endif
+ 
+ static void pty_close(struct tty_struct * tty, struct file * filp)
+ {
+ 	if (!tty)
+ 		return;
++
++	ub_pty_uncharge(tty);
+ 	if (tty->driver->subtype == PTY_TYPE_MASTER) {
+ 		if (tty->count > 1)
+ 			printk("master pty_close: count = %d!!\n", tty->count);
+@@ -58,8 +64,12 @@ static void pty_close(struct tty_struct * tty, struct file * filp)
+ 	if (tty->driver->subtype == PTY_TYPE_MASTER) {
+ 		set_bit(TTY_OTHER_CLOSED, &tty->flags);
+ #ifdef CONFIG_UNIX98_PTYS
+-		if (tty->driver == ptm_driver)
++		if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) {
++			struct ve_struct *old_env;
++			old_env = set_exec_env(tty->owner_env);
+ 			devpts_pty_kill(tty->index);
++			(void)set_exec_env(old_env);
++		}
+ #endif
+ 		tty_vhangup(tty->link);
+ 	}
+@@ -212,6 +222,10 @@ static int pty_open(struct tty_struct *tty, struct file * filp)
+ 	if (tty->link->count != 1)
+ 		goto out;
+ 
++	retval = -ENOMEM;
++	if (ub_pty_charge(tty))
++		goto out;
++
+ 	clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
+ 	set_bit(TTY_THROTTLED, &tty->flags);
+ 	set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+@@ -239,7 +253,9 @@ static const struct tty_operations pty_ops = {
+ 
+ /* Traditional BSD devices */
+ #ifdef CONFIG_LEGACY_PTYS
+-static struct tty_driver *pty_driver, *pty_slave_driver;
++struct tty_driver *pty_driver, *pty_slave_driver;
++EXPORT_SYMBOL(pty_driver);
++EXPORT_SYMBOL(pty_slave_driver);
+ 
+ static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file,
+ 			 unsigned int cmd, unsigned long arg)
+@@ -452,6 +468,9 @@ static void __init unix98_pty_init(void)
+ 
+ 	pty_table[1].data = &ptm_driver->refcount;
+ 	register_sysctl_table(pty_root_table);
++#ifdef CONFIG_VE
++	get_ve0()->ptm_driver = ptm_driver;
++#endif
+ }
+ #else
+ static inline void unix98_pty_init(void) { }
+diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
+index dbce126..cb46ae2 100644
+--- a/drivers/char/sysrq.c
++++ b/drivers/char/sysrq.c
+@@ -36,6 +36,8 @@
+ #include <linux/kexec.h>
+ #include <linux/irq.h>
+ #include <linux/hrtimer.h>
++#include <linux/kallsyms.h>
++#include <linux/slab.h>
+ #include <linux/oom.h>
+ 
+ #include <asm/ptrace.h>
+@@ -241,9 +243,16 @@ static struct sysrq_key_op sysrq_showallcpus_op = {
+ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
+ {
+ 	struct pt_regs *regs = get_irq_regs();
++
++	bust_spinlocks(1);
+ 	if (regs)
+ 		show_regs(regs);
++	bust_spinlocks(0);
++#if defined(__i386__) || defined(__x86_64__)
++	smp_nmi_call_function(smp_show_regs, NULL, 1);
++#endif
+ }
++
+ static struct sysrq_key_op sysrq_showregs_op = {
+ 	.handler	= sysrq_handle_showregs,
+ 	.help_msg	= "showPc",
+@@ -277,6 +286,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
+ static void sysrq_handle_showmem(int key, struct tty_struct *tty)
+ {
+ 	show_mem();
++	show_slab_info();
+ }
+ static struct sysrq_key_op sysrq_showmem_op = {
+ 	.handler	= sysrq_handle_showmem,
+@@ -292,7 +302,7 @@ static void send_sig_all(int sig)
+ {
+ 	struct task_struct *p;
+ 
+-	for_each_process(p) {
++	for_each_process_all(p) {
+ 		if (p->mm && !is_global_init(p))
+ 			/* Not swapper, init nor kernel thread */
+ 			force_sig(sig, p);
+@@ -354,7 +364,267 @@ static struct sysrq_key_op sysrq_unrt_op = {
+ /* Key Operations table and lock */
+ static DEFINE_SPINLOCK(sysrq_key_table_lock);
+ 
+-static struct sysrq_key_op *sysrq_key_table[36] = {
++#define SYSRQ_KEY_TABLE_LENGTH 37
++static struct sysrq_key_op **sysrq_key_table;
++static struct sysrq_key_op *sysrq_default_key_table[];
++
++#ifdef CONFIG_SYSRQ_DEBUG
++#define SYSRQ_NAMELEN_MAX	64
++#define SYSRQ_DUMP_LINES	32
++
++static struct sysrq_key_op *sysrq_debug_key_table[];
++static struct sysrq_key_op *sysrq_input_key_table[];
++static unsigned long *dump_address;
++static int orig_console_loglevel;
++static void (*sysrq_input_return)(char *) = NULL;
++
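++/*
++ * Dump SYSRQ_DUMP_LINES rows of four longs starting at dump_address;
++ * reads go through __get_user() under KERNEL_DS so a bad address is
++ * reported instead of faulting.
++ */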
++static void dump_mem(void)
++{
++	unsigned long value[4];
++	mm_segment_t old_fs;
++	int line, err;
++
++	old_fs = get_fs();
++	set_fs(KERNEL_DS);
++	err = 0;
++
++	for (line = 0; line < SYSRQ_DUMP_LINES; line++) {
++		err |= __get_user(value[0], dump_address++);
++		err |= __get_user(value[1], dump_address++);
++		err |= __get_user(value[2], dump_address++);
++		err |= __get_user(value[3], dump_address++);
++		if (err) {
++			printk("Invalid address %p\n", dump_address - 4);
++			break;
++		}
++#if BITS_PER_LONG == 32
++		printk("0x%p: %08lx %08lx %08lx %08lx\n",
++				dump_address - 4,
++				value[0], value[1], value[2], value[3]);
++#else
++		printk("0x%p: %016lx %016lx %016lx %016lx\n",
++				dump_address - 4,
++				value[0], value[1], value[2], value[3]);
++#endif
++	}
++	set_fs(old_fs);
++}
++
++static void write_mem(unsigned long val)
++{
++	mm_segment_t old_fs;
++	unsigned long old_val;
++
++	old_fs = get_fs();
++	set_fs(KERNEL_DS);
++	if (__get_user(old_val, dump_address)) {
++		printk("Invalid address %p\n", dump_address);
++		goto out;
++	}
++
++#if BITS_PER_LONG == 32
++	printk("Changing [%p] from %08lx to %08lx\n",
++			dump_address, old_val, val);
++#else
++	printk("Changing [%p] from %016lx to %016lx\n",
++			dump_address, old_val, val);
++#endif
++	__put_user(val, dump_address);
++out:
++	set_fs(old_fs);
++}
++
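++/*
++ * Minimal line editor for debug mode: accumulate [a-z0-9_-] keys into a
++ * static buffer and pass the finished string to sysrq_input_return when
++ * Enter (0x0d) arrives.
++ */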
++static void handle_read(int key, struct tty_struct *tty)
++{
++	static int pos;
++	static int upper_case;
++	static char str[SYSRQ_NAMELEN_MAX];
++
++	if (key == 0) {
++		/* actually 0 is not shift only... */
++		upper_case = 1;
++		return;
++	}
++
++	if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) {
++		/* enter */
++		sysrq_key_table = sysrq_debug_key_table;
++		str[pos] = '\0';
++		/* actually 0 is not only shift... */
++		printk("\n");
++		if (sysrq_input_return == NULL)
++			printk("No return handler!!!\n");
++		else
++			sysrq_input_return(str);
++		return;
++	}
++
++	/* check for allowed symbols */
++	if (key == '-') {
++		if (upper_case)
++			key = '_';
++		goto correct;
++	}
++	if (key >= 'a' && key <= 'z') {
++		if (upper_case)
++			key = key - 'a' + 'A';
++		goto correct;
++	}
++	if (key >= '0' && key <= '9')
++		goto correct;
++
++	upper_case = 0;
++	return;
++
++correct:
++	str[pos] = key;
++	printk("%c", (char)key);
++	pos++;
++	upper_case = 0;
++}
++
++static struct sysrq_key_op input_read = {
++	.handler	= handle_read,
++	.help_msg	= "",
++	.action_msg	= NULL,
++};
++
++static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
++	[0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read,
++};
++
++static void return_dump_mem(char *str)
++{
++	unsigned long address;
++	char *end;
++
++	address = simple_strtoul(str, &end, 0);
++	if (*end != '\0') {
++		printk("Bad address [%s]\n", str);
++		return;
++	}
++
++	dump_address = (unsigned long *)address;
++	dump_mem();
++}
++
++static void handle_dump_mem(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_dump_mem;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_dump_mem = {
++	.handler	= handle_dump_mem,
++	.help_msg	= "Dump",
++	.action_msg	= "Enter address:",
++};
++
++static void return_resolve(char *str)
++{
++	unsigned long address;
++
++	address = kallsyms_lookup_name(str);
++	printk("%s : %lx\n", str, address);
++	if (address) {
++		dump_address = (unsigned long *)address;
++		printk("Now you can dump it via X\n");
++	}
++}
++
++static void handle_resolve(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_resolve;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_resolve = {
++	.handler	= handle_resolve,
++	.help_msg	= "Resolve",
++	.action_msg	= "Enter symbol name:",
++};
++
++static void return_write_mem(char *str)
++{
++	unsigned long address;
++	unsigned long value;
++	char *end;
++
++	address = simple_strtoul(str, &end, 0);
++	if (*end != '-') {
++		printk("Bad address in %s\n", str);
++		return;
++	}
++	value = simple_strtoul(end + 1, &end, 0);
++	if (*end != '\0') {
++		printk("Bad value in %s\n", str);
++		return;
++	}
++
++	dump_address = (unsigned long *)address;
++	write_mem(value);
++}
++
++static void handle_write_mem(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_write_mem;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_write_mem = {
++	.handler	= handle_write_mem,
++	.help_msg	= "Writemem",
++	.action_msg	= "Enter address-value:",
++};
++
++static void handle_next(int key, struct tty_struct *tty)
++{
++	dump_mem();
++}
++
++static struct sysrq_key_op debug_next = {
++	.handler	= handle_next,
++	.help_msg	= "neXt",
++	.action_msg	= "continuing",
++};
++
++static void handle_quit(int key, struct tty_struct *tty)
++{
++	sysrq_key_table = sysrq_default_key_table;
++	console_loglevel = orig_console_loglevel;
++}
++
++static struct sysrq_key_op debug_quit = {
++	.handler	= handle_quit,
++	.help_msg	= "Quit",
++	.action_msg	= "Thank you for using the debugger",
++};
++
++static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
++	[13] = &debug_dump_mem,		/* d */
++	[26] = &debug_quit,		/* q */
++	[27] = &debug_resolve,		/* r */
++	[32] = &debug_write_mem,	/* w */
++	[33] = &debug_next,		/* x */
++};
++
++static void sysrq_handle_debug(int key, struct tty_struct *tty)
++{
++	orig_console_loglevel = console_loglevel;
++	console_loglevel = 8;
++	sysrq_key_table = sysrq_debug_key_table;
++	printk("Welcome to sysrq debugging mode\n"
++			"Press H for help\n");
++}
++
++static struct sysrq_key_op sysrq_debug_op = {
++	.handler	= sysrq_handle_debug,
++	.help_msg	= "debuG",
++	.action_msg	= "Select desired action",
++};
++#endif
++
++static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+ 	&sysrq_loglevel_op,		/* 0 */
+ 	&sysrq_loglevel_op,		/* 1 */
+ 	&sysrq_loglevel_op,		/* 2 */
+@@ -377,7 +647,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
+ 	&sysrq_term_op,			/* e */
+ 	&sysrq_moom_op,			/* f */
+ 	/* g: May be registered by ppc for kgdb */
++#ifdef CONFIG_SYSRQ_DEBUG
++	&sysrq_debug_op,		/* g */
++#else
+ 	NULL,				/* g */
++#endif
+ 	NULL,				/* h */
+ 	&sysrq_kill_op,			/* i */
+ 	NULL,				/* j */
+@@ -404,9 +678,12 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
+ 	NULL,				/* x */
+ 	/* y: May be registered on sparc64 for global register dump */
+ 	NULL,				/* y */
+-	NULL				/* z */
++	NULL,				/* z */
++	NULL,				/* for debugger */
+ };
+ 
++static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table;
++
+ /* key2index calculation, -1 on invalid index */
+ static int sysrq_key_table_key2index(int key)
+ {
+@@ -416,6 +693,10 @@ static int sysrq_key_table_key2index(int key)
+ 		retval = key - '0';
+ 	else if ((key >= 'a') && (key <= 'z'))
+ 		retval = key + 10 - 'a';
++#ifdef CONFIG_SYSRQ_DEBUG
++	else if (key == 0 || key == 0x0d || key == '-')
++		retval = SYSRQ_KEY_TABLE_LENGTH - 1;
++#endif
+ 	else
+ 		retval = -1;
+ 	return retval;
+@@ -457,7 +738,6 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask)
+ 	spin_lock_irqsave(&sysrq_key_table_lock, flags);
+ 	orig_log_level = console_loglevel;
+ 	console_loglevel = 7;
+-	printk(KERN_INFO "SysRq : ");
+ 
+         op_p = __sysrq_get_key_op(key);
+         if (op_p) {
+@@ -466,16 +746,17 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask)
+ 		 * should not) and is the invoked operation enabled?
+ 		 */
+ 		if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
+-			printk("%s\n", op_p->action_msg);
++			if (op_p->action_msg)
++				printk("%s\n", op_p->action_msg);
+ 			console_loglevel = orig_log_level;
+ 			op_p->handler(key, tty);
+ 		} else {
+ 			printk("This sysrq operation is disabled.\n");
+ 		}
+ 	} else {
+-		printk("HELP : ");
++		printk("SysRq HELP : ");
+ 		/* Only print the help msg once per handler */
+-		for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
++		for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) {
+ 			if (sysrq_key_table[i]) {
+ 				int j;
+ 
+diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
+index 7501310..fb3a725 100644
+--- a/drivers/char/tty_io.c
++++ b/drivers/char/tty_io.c
+@@ -95,6 +95,8 @@
+ #include <linux/wait.h>
+ #include <linux/bitops.h>
+ #include <linux/delay.h>
++#include <linux/nsproxy.h>
++#include <linux/ve.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -105,6 +107,7 @@
+ 
+ #include <linux/kmod.h>
+ #include <linux/nsproxy.h>
++#include <bc/kmem.h>
+ 
+ #undef TTY_DEBUG_HANGUP
+ 
+@@ -129,6 +132,7 @@ EXPORT_SYMBOL(tty_std_termios);
+    into this file */
+ 
+ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
++EXPORT_SYMBOL(tty_drivers);
+ 
+ /* Mutex to protect creating and releasing a tty. This is shared with
+    vt.c for deeply disgusting hack reasons */
+@@ -136,7 +140,11 @@ DEFINE_MUTEX(tty_mutex);
+ EXPORT_SYMBOL(tty_mutex);
+ 
+ #ifdef CONFIG_UNIX98_PTYS
++#ifdef CONFIG_VE
++#define ptm_driver	(get_exec_env()->ptm_driver)
++#else
+ extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
++#endif
+ static int ptmx_open(struct inode *, struct file *);
+ #endif
+ 
+@@ -172,7 +180,7 @@ static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty);
+ 
+ static struct tty_struct *alloc_tty_struct(void)
+ {
+-	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL);
++	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC);
+ }
+ 
+ static void tty_buffer_free_all(struct tty_struct *);
+@@ -1146,9 +1154,29 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index)
+ 		if (device < base || device >= base + p->num)
+ 			continue;
+ 		*index = device - base;
+-		return p;
++#ifdef CONFIG_VE
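++		/*
++		 * Only pty devices are virtualized per VE: any other driver
++		 * stays globally visible, while a pty driver must belong to
++		 * the caller's VE (or both must live in VE0).
++		 */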
++		if (in_interrupt())
++			goto found;
++		if (p->major != PTY_MASTER_MAJOR && p->major != PTY_SLAVE_MAJOR
++#ifdef CONFIG_UNIX98_PTYS
++		    && (p->major < UNIX98_PTY_MASTER_MAJOR ||
++			p->major > UNIX98_PTY_MASTER_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1) &&
++		       (p->major < UNIX98_PTY_SLAVE_MAJOR ||
++			p->major > UNIX98_PTY_SLAVE_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1)
++#endif
++		)
++			goto found;
++		if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env()))
++			goto found;
++		if (!ve_accessible_strict(p->owner_env, get_exec_env()))
++			continue;
++#endif
++		goto found;
+ 	}
+ 	return NULL;
++
++found:
++	return p;
+ }
+ 
+ #ifdef CONFIG_CONSOLE_POLL
+@@ -2070,13 +2098,21 @@ static void tty_line_name(struct tty_driver *driver, int index, char *p)
+  */
+ 
+ static int init_dev(struct tty_driver *driver, int idx,
+-	struct tty_struct **ret_tty)
++	struct tty_struct *i_tty, struct tty_struct **ret_tty)
+ {
+ 	struct tty_struct *tty, *o_tty;
+ 	struct ktermios *tp, **tp_loc, *o_tp, **o_tp_loc;
+ 	struct ktermios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc;
++	struct ve_struct * owner;
+ 	int retval = 0;
+ 
++	owner = driver->owner_env;
++
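++	/* A pre-resolved tty (the /dev/tty reopen path) skips the lookup below. */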
++	if (i_tty) {
++		tty = i_tty;
++		goto fast_track;
++	}
++
+ 	/* check whether we're reopening an existing tty */
+ 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 		tty = devpts_get_tty(idx);
+@@ -2126,6 +2162,7 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	tty->ops = driver->ops;
+ 	tty->index = idx;
+ 	tty_line_name(driver, idx, tty->name);
++	tty->owner_env = owner;
+ 
+ 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 		tp_loc = &tty->termios;
+@@ -2136,14 +2173,14 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	}
+ 
+ 	if (!*tp_loc) {
+-		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
++		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 		if (!tp)
+ 			goto free_mem_out;
+ 		*tp = driver->init_termios;
+ 	}
+ 
+ 	if (!*ltp_loc) {
+-		ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL);
++		ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 		if (!ltp)
+ 			goto free_mem_out;
+ 	}
+@@ -2157,6 +2194,7 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 		o_tty->ops = driver->ops;
+ 		o_tty->index = idx;
+ 		tty_line_name(driver->other, idx, o_tty->name);
++		o_tty->owner_env = owner;
+ 
+ 		if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 			o_tp_loc = &o_tty->termios;
+@@ -2167,14 +2205,14 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 		}
+ 
+ 		if (!*o_tp_loc) {
+-			o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
++			o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 			if (!o_tp)
+ 				goto free_mem_out;
+ 			*o_tp = driver->other->init_termios;
+ 		}
+ 
+ 		if (!*o_ltp_loc) {
+-			o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL);
++			o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 			if (!o_ltp)
+ 				goto free_mem_out;
+ 		}
+@@ -2190,6 +2228,10 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 			*o_ltp_loc = o_ltp;
+ 		o_tty->termios = *o_tp_loc;
+ 		o_tty->termios_locked = *o_ltp_loc;
++#ifdef CONFIG_VE
++		if (driver->other->refcount == 0)
++			(void)get_ve(owner);
++#endif
+ 		driver->other->refcount++;
+ 		if (driver->subtype == PTY_TYPE_MASTER)
+ 			o_tty->count++;
+@@ -2213,6 +2255,10 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 		*ltp_loc = ltp;
+ 	tty->termios = *tp_loc;
+ 	tty->termios_locked = *ltp_loc;
++#ifdef CONFIG_VE
++	if (driver->refcount == 0)
++		(void)get_ve(owner);
++#endif
+ 	/* Compatibility until drivers always set this */
+ 	tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
+ 	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
+@@ -2337,7 +2383,8 @@ static void release_one_tty(struct tty_struct *tty, int idx)
+ 
+ 	tty->magic = 0;
+ 	tty->driver->refcount--;
+-
++	if (tty->driver->refcount == 0)
++		put_ve(tty->owner_env);
+ 	file_list_lock();
+ 	list_del_init(&tty->tty_files);
+ 	file_list_unlock();
+@@ -2667,7 +2714,7 @@ static void release_dev(struct file *filp)
+ 
+ static int tty_open(struct inode *inode, struct file *filp)
+ {
+-	struct tty_struct *tty;
++	struct tty_struct *tty, *c_tty;
+ 	int noctty, retval;
+ 	struct tty_driver *driver;
+ 	int index;
+@@ -2680,6 +2727,7 @@ retry_open:
+ 	noctty = filp->f_flags & O_NOCTTY;
+ 	index  = -1;
+ 	retval = 0;
++	c_tty = NULL;
+ 
+ 	mutex_lock(&tty_mutex);
+ 
+@@ -2691,6 +2739,7 @@ retry_open:
+ 		}
+ 		driver = tty->driver;
+ 		index = tty->index;
++		c_tty = tty;
+ 		filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
+ 		/* noctty = 1; */
+ 		goto got_driver;
+@@ -2698,6 +2747,12 @@ retry_open:
+ #ifdef CONFIG_VT
+ 	if (device == MKDEV(TTY_MAJOR, 0)) {
+ 		extern struct tty_driver *console_driver;
++#ifdef CONFIG_VE
++		if (!ve_is_super(get_exec_env())) {
++			mutex_unlock(&tty_mutex);
++			return -ENODEV;
++		}
++#endif
+ 		driver = console_driver;
+ 		index = fg_console;
+ 		noctty = 1;
+@@ -2705,6 +2760,12 @@ retry_open:
+ 	}
+ #endif
+ 	if (device == MKDEV(TTYAUX_MAJOR, 1)) {
++#ifdef CONFIG_VE
++		if (!ve_is_super(get_exec_env())) {
++			mutex_unlock(&tty_mutex);
++			return -ENODEV;
++		}
++#endif
+ 		driver = console_device(&index);
+ 		if (driver) {
+ 			/* Don't let /dev/console block */
+@@ -2722,7 +2783,7 @@ retry_open:
+ 		return -ENODEV;
+ 	}
+ got_driver:
+-	retval = init_dev(driver, index, &tty);
++	retval = init_dev(driver, index, c_tty, &tty);
+ 	mutex_unlock(&tty_mutex);
+ 	if (retval)
+ 		return retval;
+@@ -2806,7 +2867,7 @@ static int ptmx_open(struct inode *inode, struct file *filp)
+ 		return index;
+ 
+ 	mutex_lock(&tty_mutex);
+-	retval = init_dev(ptm_driver, index, &tty);
++	retval = init_dev(ptm_driver, index, NULL, &tty);
+ 	mutex_unlock(&tty_mutex);
+ 
+ 	if (retval)
+@@ -3049,6 +3110,8 @@ static int tioccons(struct file *file)
+ {
+ 	if (!capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
++	if (!ve_is_super(get_exec_env()))
++		return -EACCES;
+ 	if (file->f_op->write == redirected_tty_write) {
+ 		struct file *f;
+ 		spin_lock(&redirect_lock);
+@@ -3639,7 +3702,7 @@ void __do_SAK(struct tty_struct *tty)
+ 	/* Now kill any processes that happen to have the
+ 	 * tty open.
+ 	 */
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		if (p->signal->tty == tty) {
+ 			printk(KERN_NOTICE "SAK: killed process %d"
+ 			    " (%s): task_session_nr(p)==tty->session\n",
+@@ -3671,7 +3734,7 @@ void __do_SAK(struct tty_struct *tty)
+ 			spin_unlock(&p->files->file_lock);
+ 		}
+ 		task_unlock(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 	read_unlock(&tasklist_lock);
+ #endif
+ }
+@@ -4005,6 +4068,7 @@ int tty_register_driver(struct tty_driver *driver)
+ 	}
+ 
+ 	mutex_lock(&tty_mutex);
++	driver->owner_env = get_exec_env();
+ 	list_add(&driver->tty_drivers, &tty_drivers);
+ 	mutex_unlock(&tty_mutex);
+ 
+@@ -4202,3 +4266,43 @@ static int __init tty_init(void)
+ 	return 0;
+ }
+ module_init(tty_init);
++
++#ifdef CONFIG_UNIX98_PTYS
++int init_ve_tty_class(void)
++{
++	struct class * ve_tty_class;
++	struct device * ve_ptmx_dev_class;
++
++	ve_tty_class = class_create(THIS_MODULE, "tty");
++	if (IS_ERR(ve_tty_class))
++		return -ENOMEM;
++
++	ve_ptmx_dev_class = device_create(ve_tty_class, NULL,
++				MKDEV(TTYAUX_MAJOR, 2), "ptmx");
++	if (IS_ERR(ve_ptmx_dev_class)) {
++		class_destroy(ve_tty_class);
++		return PTR_ERR(ve_ptmx_dev_class);
++	}
++
++	get_exec_env()->tty_class = ve_tty_class;
++	return 0;
++}
++
++void fini_ve_tty_class(void)
++{
++	struct class *ve_tty_class = get_exec_env()->tty_class;
++
++	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2));
++	class_destroy(ve_tty_class);
++}
++#else
++int init_ve_tty_class(void)
++{
++	return 0;
++}
++void fini_ve_tty_class(void)
++{
++}
++#endif
++EXPORT_SYMBOL(init_ve_tty_class);
++EXPORT_SYMBOL(fini_ve_tty_class);
+diff --git a/drivers/net/Makefile b/drivers/net/Makefile
+index dcbfe84..097d877 100644
+--- a/drivers/net/Makefile
++++ b/drivers/net/Makefile
+@@ -27,6 +27,10 @@ gianfar_driver-objs := gianfar.o \
+ obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o
+ ucc_geth_driver-objs := ucc_geth.o ucc_geth_mii.o ucc_geth_ethtool.o
+ 
++obj-$(CONFIG_VE_NETDEV) += vznetdev.o
++vznetdev-objs := open_vznet.o venet_core.o
++obj-$(CONFIG_VE_ETHDEV) += vzethdev.o
++
+ #
+ # link order important here
+ #
+diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
+index 41b774b..78395c0 100644
+--- a/drivers/net/loopback.c
++++ b/drivers/net/loopback.c
+@@ -134,6 +134,12 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ 	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+ 
++#ifdef CONFIG_VE
++	if (unlikely(get_exec_env()->disable_net)) {
++		kfree_skb(skb);
++		return 0;
++	}
++#endif
+ 	skb_orphan(skb);
+ 
+ 	skb->protocol = eth_type_trans(skb,dev);
+@@ -240,7 +246,8 @@ static void loopback_setup(struct net_device *dev)
+ 		| NETIF_F_NO_CSUM
+ 		| NETIF_F_HIGHDMA
+ 		| NETIF_F_LLTX
+-		| NETIF_F_NETNS_LOCAL;
++		| NETIF_F_NETNS_LOCAL
++		| NETIF_F_VIRTUAL;
+ 	dev->ethtool_ops	= &loopback_ethtool_ops;
+ 	dev->header_ops		= &eth_header_ops;
+ 	dev->init = loopback_dev_init;
+diff --git a/drivers/net/open_vznet.c b/drivers/net/open_vznet.c
+new file mode 100644
+index 0000000..79bf640
+--- /dev/null
++++ b/drivers/net/open_vznet.c
+@@ -0,0 +1,244 @@
++/*
++ *  open_vznet.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Virtual Networking device used to change VE ownership on packets
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++
++#include <linux/inet.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <linux/venet.h>
++
++void veip_stop(struct ve_struct *ve)
++{
++	struct list_head *p, *tmp;
++
++	write_lock_irq(&veip_hash_lock);
++	if (ve->veip == NULL)
++		goto unlock;
++	list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
++		struct ip_entry_struct *ptr;
++		ptr = list_entry(p, struct ip_entry_struct, ve_list);
++		ptr->active_env = NULL;
++		list_del(&ptr->ve_list);
++		list_del(&ptr->ip_hash);
++		kfree(ptr);
++	}
++	veip_put(ve->veip);
++	ve->veip = NULL;
++	if (!ve_is_super(ve))
++		module_put(THIS_MODULE);
++unlock:
++	write_unlock_irq(&veip_hash_lock);
++}
++
++int veip_start(struct ve_struct *ve)
++{
++	int err, get;
++
++	err = 0;
++	write_lock_irq(&veip_hash_lock);
++	get = ve->veip == NULL;
++	ve->veip = veip_findcreate(ve->veid);
++	if (ve->veip == NULL)
++		err = -ENOMEM;
++	write_unlock_irq(&veip_hash_lock);
++	if (err == 0 && get && !ve_is_super(ve))
++		__module_get(THIS_MODULE);
++	return err;
++}
++
++int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
++{
++	struct ip_entry_struct *entry, *found;
++	int err;
++
++	entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
++	if (entry == NULL)
++		return -ENOMEM;
++
++	if (ve->veip == NULL) {
++		/* This can happen if we load venet AFTER ve was started */
++	       	err = veip_start(ve);
++		err = veip_start(ve);
++			goto out;
++	}
++
++	write_lock_irq(&veip_hash_lock);
++	err = -EADDRINUSE;
++	found = venet_entry_lookup(addr);
++	if (found != NULL)
++		goto out_unlock;
++
++	entry->active_env = ve;
++	entry->addr = *addr;
++	ip_entry_hash(entry, ve->veip);
++
++	err = 0;
++	entry = NULL;
++out_unlock:
++	write_unlock_irq(&veip_hash_lock);
++out:
++	if (entry != NULL)
++		kfree(entry);
++	return err;
++}
++
++int veip_entry_del(envid_t veid, struct ve_addr_struct *addr)
++{
++	struct ip_entry_struct *found;
++	int err;
++
++	err = -EADDRNOTAVAIL;
++	write_lock_irq(&veip_hash_lock);
++	found = venet_entry_lookup(addr);
++	if (found == NULL)
++		goto out;
++	if (found->active_env->veid != veid)
++		goto out;
++
++	err = 0;
++	found->active_env = NULL;
++
++	list_del(&found->ip_hash);
++	list_del(&found->ve_list);
++	kfree(found);
++out:
++	write_unlock_irq(&veip_hash_lock);
++	return err;
++}
++
++static int skb_extract_addr(struct sk_buff *skb,
++		struct ve_addr_struct *addr, int dir)
++{
++	switch (skb->protocol) {
++	case __constant_htons(ETH_P_IP):
++		addr->family = AF_INET;
++		addr->key[0] = 0;
++		addr->key[1] = 0;
++		addr->key[2] = 0;
++		addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
++		return 0;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	case __constant_htons(ETH_P_IPV6):
++		addr->family = AF_INET6;
++		memcpy(&addr->key, dir ?
++				ipv6_hdr(skb)->daddr.s6_addr32 :
++				ipv6_hdr(skb)->saddr.s6_addr32,
++				sizeof(addr->key));
++		return 0;
++#endif
++	}
++
++	return -EAFNOSUPPORT;
++}
++
++static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir)
++{
++	struct ip_entry_struct *entry;
++	struct ve_addr_struct addr;
++
++	if (skb_extract_addr(skb, &addr, dir) < 0)
++		return NULL;
++
++	entry = venet_entry_lookup(&addr);
++	if (entry == NULL)
++		return NULL;
++
++	return entry->active_env;
++}
++
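++/*
++ * Reassign skb->owner_env at the venet boundary: a packet leaving a VE
++ * must carry one of that VE's registered source IPs, while a packet from
++ * the host is handed to the VE that owns the destination IP.
++ */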
++int venet_change_skb_owner(struct sk_buff *skb)
++{
++	struct ve_struct *ve, *ve_old;
++
++	ve_old = skb->owner_env;
++
++	read_lock(&veip_hash_lock);
++	if (!ve_is_super(ve_old)) {
++		/* from VE to host */
++		ve = venet_find_ve(skb, 0);
++		if (ve == NULL)
++			goto out_drop;
++		if (!ve_accessible_strict(ve, ve_old))
++			goto out_source;
++		skb->owner_env = get_ve0();
++	} else {
++		/* from host to VE */
++		ve = venet_find_ve(skb, 1);
++		if (ve == NULL)
++			goto out_drop;
++		skb->owner_env = ve;
++	}
++	read_unlock(&veip_hash_lock);
++
++	return 0;
++
++out_drop:
++	read_unlock(&veip_hash_lock);
++	return -ESRCH;
++
++out_source:
++	read_unlock(&veip_hash_lock);
++	if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
++		printk(KERN_WARNING "Dropped packet with wrong source, "
++		       "veid=%u src-IP=%u.%u.%u.%u "
++		       "dst-IP=%u.%u.%u.%u\n",
++		       skb->owner_env->veid,
++		       NIPQUAD(ip_hdr(skb)->saddr),
++		       NIPQUAD(ip_hdr(skb)->daddr));
++	}
++	return -EACCES;
++}
++
++#ifdef CONFIG_PROC_FS
++int veip_seq_show(struct seq_file *m, void *v)
++{
++	struct list_head *p;
++	struct ip_entry_struct *entry;
++	char s[40];
++
++	p = (struct list_head *)v;
++	if (p == ip_entry_hash_table) {
++		seq_puts(m, "Version: 2.5\n");
++		return 0;
++	}
++	entry = list_entry(p, struct ip_entry_struct, ip_hash);
++	veaddr_print(s, sizeof(s), &entry->addr);
++	seq_printf(m, "%39s %10u\n", s, 0);
++	return 0;
++}
++#endif
++
++__exit void veip_cleanup(void)
++{
++	int i;
++
++	write_lock_irq(&veip_hash_lock);
++	for (i = 0; i < VEIP_HASH_SZ; i++)
++		while (!list_empty(ip_entry_hash_table + i)) {
++			struct ip_entry_struct *entry;
++
++			entry = list_first_entry(ip_entry_hash_table + i,
++					struct ip_entry_struct, ip_hash);
++			list_del(&entry->ip_hash);
++			kfree(entry);
++		}
++	write_unlock_irq(&veip_hash_lock);
++}
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
++MODULE_LICENSE("GPL v2");
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index b9018bf..d9a5222 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -82,45 +82,20 @@ static int debug;
+ #define DBG1( a... )
+ #endif
+ 
+-struct tun_struct {
+-	struct list_head        list;
+-	unsigned long 		flags;
+-	int			attached;
+-	uid_t			owner;
+-	gid_t			group;
+-
+-	wait_queue_head_t	read_wait;
+-	struct sk_buff_head	readq;
+-
+-	struct net_device	*dev;
+-
+-	struct fasync_struct    *fasync;
+-
+-	unsigned long if_flags;
+-	u8 dev_addr[ETH_ALEN];
+-	u32 chr_filter[2];
+-	u32 net_filter[2];
+-
+-#ifdef TUN_DEBUG
+-	int debug;
+-#endif
+-};
+-
+ /* Network device part of the driver */
+ 
+-static unsigned int tun_net_id;
+-struct tun_net {
+-	struct list_head dev_list;
+-};
++unsigned int tun_net_id;
++EXPORT_SYMBOL(tun_net_id);
+ 
+ static const struct ethtool_ops tun_ethtool_ops;
+ 
+ /* Net device open. */
+-static int tun_net_open(struct net_device *dev)
++int tun_net_open(struct net_device *dev)
+ {
+ 	netif_start_queue(dev);
+ 	return 0;
+ }
++EXPORT_SYMBOL(tun_net_open);
+ 
+ /* Net device close. */
+ static int tun_net_close(struct net_device *dev)
+@@ -223,7 +198,7 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu)
+ }
+ 
+ /* Initialize net device. */
+-static void tun_net_init(struct net_device *dev)
++void tun_net_init(struct net_device *dev)
+ {
+ 	struct tun_struct *tun = netdev_priv(dev);
+ 
+@@ -255,6 +230,7 @@ static void tun_net_init(struct net_device *dev)
+ 		break;
+ 	}
+ }
++EXPORT_SYMBOL(tun_net_init);
+ 
+ /* Character device part */
+ 
+@@ -477,7 +453,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ 	return ret;
+ }
+ 
+-static void tun_setup(struct net_device *dev)
++void tun_setup(struct net_device *dev)
+ {
+ 	struct tun_struct *tun = netdev_priv(dev);
+ 
+@@ -494,6 +470,7 @@ static void tun_setup(struct net_device *dev)
+ 	dev->destructor = free_netdev;
+ 	dev->features |= NETIF_F_NETNS_LOCAL;
+ }
++EXPORT_SYMBOL(tun_setup);
+ 
+ static struct tun_struct *tun_get_by_name(struct tun_net *tn, const char *name)
+ {
+@@ -526,7 +503,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
+ 		      current->euid != tun->owner) ||
+ 		     (tun->group != -1 &&
+ 		      current->egid != tun->group)) &&
+-		     !capable(CAP_NET_ADMIN))
++		     !capable(CAP_NET_ADMIN) &&
++		     !capable(CAP_VE_NET_ADMIN))
+ 			return -EPERM;
+ 	}
+ 	else if (__dev_get_by_name(net, ifr->ifr_name))
+@@ -601,6 +579,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
+ 	file->private_data = tun;
+ 	tun->attached = 1;
+ 	get_net(dev_net(tun->dev));
++	tun->bind_file = file;
+ 
+ 	/* Make sure persistent devices do not get stuck in
+ 	 * xoff state.
+@@ -816,12 +795,13 @@ static int tun_chr_fasync(int fd, struct file *file, int on)
+ 	return 0;
+ }
+ 
+-static int tun_chr_open(struct inode *inode, struct file * file)
++int tun_chr_open(struct inode *inode, struct file * file)
+ {
+ 	DBG1(KERN_INFO "tunX: tun_chr_open\n");
+ 	file->private_data = NULL;
+ 	return 0;
+ }
++EXPORT_SYMBOL(tun_chr_open);
+ 
+ static int tun_chr_close(struct inode *inode, struct file *file)
+ {
+diff --git a/drivers/net/venet_core.c b/drivers/net/venet_core.c
+new file mode 100644
+index 0000000..6b21630
+--- /dev/null
++++ b/drivers/net/venet_core.c
+@@ -0,0 +1,768 @@
++/*
++ *  venet_core.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Common part for Virtuozzo virtual network devices
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/fs.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/socket.h>
++#include <linux/errno.h>
++#include <linux/fcntl.h>
++#include <linux/in.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/nsproxy.h>
++#include <linux/tcp.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <net/addrconf.h>
++
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/unistd.h>
++
++#include <linux/inet.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <net/sock.h>
++#include <linux/if_ether.h>	/* For the statistics structure. */
++#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
++#include <linux/ethtool.h>
++#include <linux/venet.h>
++#include <linux/ve_proto.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_venet.h>
++
++struct list_head ip_entry_hash_table[VEIP_HASH_SZ];
++DEFINE_RWLOCK(veip_hash_lock);
++LIST_HEAD(veip_lh);
++
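++/* VEIP_HASH_SZ is assumed to be a power of two, so the mask below picks
++ * a bucket from the low bits of the (host-order) address. */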
++#define ip_entry_hash_function(ip)  (ntohl(ip) & (VEIP_HASH_SZ - 1))
++
++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
++{
++	list_add(&entry->ip_hash,
++			ip_entry_hash_table +
++			ip_entry_hash_function(entry->addr.key[3]));
++	list_add(&entry->ve_list, &veip->ip_lh);
++}
++
++void veip_put(struct veip_struct *veip)
++{
++	if (!list_empty(&veip->ip_lh))
++		return;
++	if (!list_empty(&veip->src_lh))
++		return;
++	if (!list_empty(&veip->dst_lh))
++		return;
++
++	list_del(&veip->list);
++	kfree(veip);
++}
++
++struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr)
++{
++	struct ip_entry_struct *entry;
++
++	list_for_each_entry (entry, ip_entry_hash_table +
++			ip_entry_hash_function(addr->key[3]), ip_hash)
++		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
++			return entry;
++	return NULL;
++}
++
++struct veip_struct *veip_find(envid_t veid)
++{
++	struct veip_struct *ptr;
++
++	list_for_each_entry(ptr, &veip_lh, list) {
++		if (ptr->veid != veid)
++			continue;
++		return ptr;
++	}
++	return NULL;
++}
++
++struct veip_struct *veip_findcreate(envid_t veid)
++{
++	struct veip_struct *ptr;
++
++	ptr = veip_find(veid);
++	if (ptr != NULL)
++		return ptr;
++
++	ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC);
++	if (ptr == NULL)
++		return NULL;
++	memset(ptr, 0, sizeof(struct veip_struct));
++	INIT_LIST_HEAD(&ptr->ip_lh);
++	INIT_LIST_HEAD(&ptr->src_lh);
++	INIT_LIST_HEAD(&ptr->dst_lh);
++	ptr->veid = veid;
++	list_add(&ptr->list, &veip_lh);
++	return ptr;
++}
++
++static int convert_sockaddr(struct sockaddr *addr, int addrlen,
++		struct ve_addr_struct *veaddr)
++{
++	int err;
++
++	switch (addr->sa_family) {
++	case AF_INET: {
++		struct sockaddr_in *sin;
++
++		err = -EINVAL;
++		if (addrlen != sizeof(struct sockaddr_in))
++			break;
++
++		err = 0;
++		sin = (struct sockaddr_in *)addr;
++		veaddr->family = AF_INET;
++		veaddr->key[0] = 0;
++		veaddr->key[1] = 0;
++		veaddr->key[2] = 0;
++		veaddr->key[3] = sin->sin_addr.s_addr;
++		break;
++	}
++	case AF_INET6: {
++		struct sockaddr_in6 *sin;
++
++		err = -EINVAL;
++		if (addrlen != sizeof(struct sockaddr_in6))
++			break;
++
++		err = 0;
++		sin = (struct sockaddr_in6 *)addr;
++		veaddr->family = AF_INET6;
++		memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key));
++		break;
++	}
++	default:
++		err = -EAFNOSUPPORT;
++	}
++	return err;
++}
++
++int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
++		struct ve_addr_struct *veaddr)
++{
++	int err;
++	char addr[MAX_SOCK_ADDR];
++
++	err = move_addr_to_kernel(uaddr, addrlen, &addr);
++	if (err < 0)
++		goto out;
++
++	err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr);
++out:
++	return err;
++}
++
++void veaddr_print(char *str, int len, struct ve_addr_struct *a)
++{
++	if (a->family == AF_INET)
++		snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3]));
++	else
++		snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x",
++				ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF,
++				ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF,
++				ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF,
++				ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF
++			);
++}
++
++/*
++ * Device functions
++ */
++
++static int venet_open(struct net_device *dev)
++{
++	if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE))
++		return -EBUSY;
++	return 0;
++}
++
++static int venet_close(struct net_device *master)
++{
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++	return 0;
++}
++
++static void venet_destructor(struct net_device *dev)
++{
++	struct venet_stats *stats = (struct venet_stats *)dev->priv;
++	if (stats == NULL)
++		return;
++	free_percpu(stats->real_stats);
++	kfree(stats);
++	dev->priv = NULL;
++}
++
++/*
++ * The higher levels take care of making this non-reentrant (it's
++ * called with bh's disabled).
++ */
++static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++	struct net_device_stats *stats;
++	struct net_device *rcv = NULL;
++	int length;
++
++	stats = venet_stats(dev, smp_processor_id());
++	if (unlikely(get_exec_env()->disable_net))
++		goto outf;
++
++	if (skb->protocol == __constant_htons(ETH_P_IP)) {
++		struct iphdr *iph;
++		iph = ip_hdr(skb);
++		if (ipv4_is_multicast(iph->daddr))
++			goto outf;
++	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
++		struct ipv6hdr *ip6h;
++		ip6h = ipv6_hdr(skb);
++		if (ipv6_addr_is_multicast(&ip6h->daddr))
++			goto outf;
++		skb_orphan(skb);
++	} else {
++		goto outf;
++	}
++
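++	/* Hand the skb to the destination VE; unroutable packets and
++	 * packets for stopped VEs are dropped and counted below. */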
++	if (venet_change_skb_owner(skb) < 0)
++		goto outf;
++
++	if (unlikely(skb->owner_env->disable_net))
++		goto outf;
++
++	rcv = skb->owner_env->_venet_dev;
++	if (!rcv)
++		/* VE going down */
++		goto outf;
++
++	dev_hold(rcv);
++
++	if (!(rcv->flags & IFF_UP)) {
++		/* Target VE does not want to receive packets */
++		dev_put(rcv);
++		goto outf;
++	}
++
++	skb->pkt_type = PACKET_HOST;
++	skb->dev = rcv;
++
++	skb_reset_mac_header(skb);
++	memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
++
++	dst_release(skb->dst);
++	skb->dst = NULL;
++#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
++	nf_conntrack_put(skb->nfct);
++	skb->nfct = NULL;
++#endif
++	length = skb->len;
++
++	netif_rx(skb);
++
++	stats->tx_bytes += length;
++	stats->tx_packets++;
++	if (rcv) {
++		struct net_device_stats *rcv_stats;
++
++		rcv_stats = venet_stats(rcv, smp_processor_id());
++		rcv_stats->rx_bytes += length;
++		rcv_stats->rx_packets++;
++		dev_put(rcv);
++	}
++
++	return 0;
++
++outf:
++	kfree_skb(skb);
++	++stats->tx_dropped;
++	return 0;
++}
++
++static struct net_device_stats *get_stats(struct net_device *dev)
++{
++	int i;
++	struct venet_stats *stats;
++
++	stats = (struct venet_stats *)dev->priv;
++	memset(&stats->stats, 0, sizeof(struct net_device_stats));
++	for (i = 0; i < NR_CPUS; i++) {
++		struct net_device_stats *dev_stats;
++
++		if (!cpu_possible(i))
++			continue;
++		dev_stats = venet_stats(dev, i);
++		stats->stats.rx_bytes   += dev_stats->rx_bytes;
++		stats->stats.tx_bytes   += dev_stats->tx_bytes;
++		stats->stats.rx_packets += dev_stats->rx_packets;
++		stats->stats.tx_packets += dev_stats->tx_packets;
++	}
++
++	return &stats->stats;
++}
++
++/* Initialize the rest of the LOOPBACK device. */
++int venet_init_dev(struct net_device *dev)
++{
++	struct venet_stats *stats;
++
++	dev->hard_start_xmit = venet_xmit;
++	stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL);
++	if (stats == NULL)
++		goto fail;
++	stats->real_stats = alloc_percpu(struct net_device_stats);
++	if (stats->real_stats == NULL)
++		goto fail_free;
++	dev->priv = stats;
++
++	dev->get_stats = get_stats;
++	dev->open = venet_open;
++	dev->stop = venet_close;
++	dev->destructor = venet_destructor;
++
++	/*
++	 *	Fill in the generic fields of the device structure.
++	 */
++	dev->type		= ARPHRD_VOID;
++	dev->hard_header_len 	= ETH_HLEN;
++	dev->mtu		= 1500; /* eth_mtu */
++	dev->tx_queue_len	= 0;
++
++	memset(dev->broadcast, 0xFF, ETH_ALEN);
++
++	/* New-style flags. */
++	dev->flags		= IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
++	return 0;
++
++fail_free:
++	kfree(stats);
++fail:
++	return -ENOMEM;
++}
++
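++/*
++ * Apply an ethtool feature callback to every venet device in every VE,
++ * keeping offload settings consistent across the whole virtual network.
++ */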
++static int
++venet_set_op(struct net_device *dev, u32 data,
++	     int (*fop)(struct net_device *, u32))
++{
++
++	struct ve_struct *ve;
++	int ret = 0;
++
++	read_lock(&ve_list_lock);
++	for_each_ve(ve) {
++		struct ve_struct *ve_old;
++
++		ve_old = set_exec_env(ve);
++		read_lock(&dev_base_lock);
++		for_each_netdev(ve->ve_netns, dev) {
++			if (dev->hard_start_xmit == venet_xmit)
++				ret = fop(dev, data);
++		}
++		read_unlock(&dev_base_lock);
++		set_exec_env(ve_old);
++
++		if (ret < 0)
++			break;
++	}
++	read_unlock(&ve_list_lock);
++	return ret;
++}
++
++static unsigned long common_features;
++
++static int venet_op_set_sg(struct net_device *dev, u32 data)
++{
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
++	if (data)
++		common_features |= NETIF_F_SG;
++	else
++		common_features &= ~NETIF_F_SG;
++
++	return venet_set_op(dev, data, ethtool_op_set_sg);
++}
++
++static int venet_op_set_tx_csum(struct net_device *dev, u32 data)
++{
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
++	if (data)
++		common_features |= NETIF_F_IP_CSUM;
++	else
++		common_features &= ~NETIF_F_IP_CSUM;
++
++	return venet_set_op(dev, data, ethtool_op_set_tx_csum);
++}
++
++#define venet_op_set_rx_csum venet_op_set_tx_csum
++
++static struct ethtool_ops venet_ethtool_ops = {
++	.get_sg = ethtool_op_get_sg,
++	.set_sg = venet_op_set_sg,
++	.get_tx_csum = ethtool_op_get_tx_csum,
++	.set_tx_csum = venet_op_set_tx_csum,
++	.get_rx_csum = ethtool_op_get_tx_csum,
++	.set_rx_csum = venet_op_set_rx_csum,
++	.get_tso = ethtool_op_get_tso,
++};
++
++static void venet_setup(struct net_device *dev)
++{
++	dev->init = venet_init_dev;
++	/*
++	 * No other features are set, because:
++	 *  - checksumming is required, and nobody else will do our job
++	 */
++	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
++	       NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
++
++	dev->features |= common_features;
++
++	SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
++}
++
++#ifdef CONFIG_PROC_FS
++static int veinfo_seq_show(struct seq_file *m, void *v)
++{
++	struct ve_struct *ve;
++	struct ip_entry_struct *entry;
++
++	ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
++
++	seq_printf(m, "%10u %5u %5u", ve->veid,
++			ve->class_id, atomic_read(&ve->pcounter));
++	read_lock(&veip_hash_lock);
++	if (ve->veip == NULL)
++		goto unlock;
++	list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) {
++		char addr[40];
++
++		if (entry->active_env == NULL)
++			continue;
++
++		veaddr_print(addr, sizeof(addr), &entry->addr);
++		if (entry->addr.family == AF_INET)
++			seq_printf(m, " %15s", addr);
++		else
++			seq_printf(m, " %39s", addr);
++	}
++unlock:
++	read_unlock(&veip_hash_lock);
++	seq_putc(m, '\n');
++	return 0;
++}
++
++static struct seq_operations veinfo_seq_op = {
++	.start	= ve_seq_start,
++	.next	=  ve_seq_next,
++	.stop	=  ve_seq_stop,
++	.show	=  veinfo_seq_show,
++};
++
++static int veinfo_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &veinfo_seq_op);
++}
++
++static struct file_operations proc_veinfo_operations = {
++	.open		= veinfo_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++
++static void *veip_seq_start(struct seq_file *m, loff_t *pos)
++{
++	loff_t l;
++	struct list_head *p;
++	int i;
++
++	l = *pos;
++	write_lock_irq(&veip_hash_lock);
++	if (l == 0)
++		return ip_entry_hash_table;
++	for (i = 0; i < VEIP_HASH_SZ; i++) {
++		list_for_each(p, ip_entry_hash_table + i) {
++			if (--l == 0)
++				return p;
++		}
++	}
++	return NULL;
++}
++
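++/*
++ * Advance across hash buckets: a pointer outside the table array is a
++ * real entry and is returned; reaching a bucket head means that bucket
++ * is exhausted, so step to the next one.
++ */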
++static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	struct list_head *p;
++
++	p = (struct list_head *)v;
++	while (1) {
++		p = p->next;
++		if (p < ip_entry_hash_table ||
++		    p >= ip_entry_hash_table + VEIP_HASH_SZ) {
++			(*pos)++;
++			return p;
++		}
++		if (++p >= ip_entry_hash_table + VEIP_HASH_SZ)
++			return NULL;
++	}
++	return NULL;
++}
++
++static void veip_seq_stop(struct seq_file *m, void *v)
++{
++	write_unlock_irq(&veip_hash_lock);
++}
++
++static struct seq_operations veip_seq_op = {
++	.start	= veip_seq_start,
++	.next	= veip_seq_next,
++	.stop	= veip_seq_stop,
++	.show	= veip_seq_show,
++};
++
++static int veip_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &veip_seq_op);
++}
++
++static struct file_operations proc_veip_operations = {
++	.open		= veip_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++#endif
++
++static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr,
++		int addrlen)
++{
++	int err;
++	struct ve_struct *ve;
++	struct ve_addr_struct addr;
++
++	err = -EPERM;
++	if (!capable(CAP_SETVEID))
++		goto out;
++
++	err = sockaddr_to_veaddr(uaddr, addrlen, &addr);
++	if (err < 0)
++		goto out;
++
++	switch (op)
++	{
++		case VE_IP_ADD:
++			ve = get_ve_by_id(veid);
++			err = -ESRCH;
++			if (!ve)
++				goto out;
++
++			down_read(&ve->op_sem);
++			if (ve->is_running)
++				err = veip_entry_add(ve, &addr);
++			up_read(&ve->op_sem);
++			put_ve(ve);
++			break;
++
++		case VE_IP_DEL:
++			err = veip_entry_del(veid, &addr);
++			break;
++		default:
++			err = -EINVAL;
++	}
++
++out:
++	return err;
++}
++
++int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	err = -ENOTTY;
++	switch(cmd) {
++	case VENETCTL_VE_IP_MAP: {
++		struct vzctl_ve_ip_map s;
++		err = -EFAULT;
++		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++			break;
++		err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
++		break;
++	}
++	}
++	return err;
++}
++
++#ifdef CONFIG_COMPAT
++int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	switch(cmd) {
++	case VENETCTL_COMPAT_VE_IP_MAP: {
++		struct compat_vzctl_ve_ip_map cs;
++
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
++			break;
++
++		err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr),
++				cs.addrlen);
++		break;
++	}
++	default:
++		err = venet_ioctl(file, cmd, arg);
++		break;
++	}
++	return err;
++}
++#endif
++
++static struct vzioctlinfo venetcalls = {
++	.type		= VENETCTLTYPE,
++	.ioctl		= venet_ioctl,
++#ifdef CONFIG_COMPAT
++	.compat_ioctl	= compat_venet_ioctl,
++#endif
++	.owner		= THIS_MODULE,
++};
++
++int venet_dev_start(struct ve_struct *ve)
++{
++	struct net_device *dev_venet;
++	int err;
++
++	dev_venet = alloc_netdev(0, "venet%d", venet_setup);
++	if (!dev_venet)
++		return -ENOMEM;
++	dev_net_set(dev_venet, ve->ve_netns);
++	err = dev_alloc_name(dev_venet, dev_venet->name);
++	if (err < 0)
++		goto err;
++	if ((err = register_netdev(dev_venet)) != 0)
++		goto err;
++	ve->_venet_dev = dev_venet;
++	return 0;
++err:
++	free_netdev(dev_venet);
++	printk(KERN_ERR "VENET initialization error err=%d\n", err);
++	return err;
++}
++
++static int venet_start(void *data)
++{
++	struct ve_struct *env;
++	int err;
++
++	env = (struct ve_struct *)data;
++	if (env->veip)
++		return -EEXIST;
++
++	err = veip_start(env);
++	if (err != 0)
++		return err;
++
++	err = venet_dev_start(env);
++	if (err)
++		goto err_free;
++	return 0;
++
++err_free:
++	veip_stop(env);
++	return err;
++}
++
++static void venet_stop(void *data)
++{
++	struct ve_struct *env;
++	struct net_device *dev;
++
++	env = (struct ve_struct *)data;
++	veip_stop(env);
++
++	dev = env->_venet_dev;
++	if (dev == NULL)
++		return;
++
++	unregister_netdev(dev);
++	env->_venet_dev = NULL;
++	free_netdev(dev);
++}
++
++static struct ve_hook venet_ve_hook = {
++	.init	  = venet_start,
++	.fini	  = venet_stop,
++	.owner	  = THIS_MODULE,
++	.priority = HOOK_PRIO_NET,
++};
++
++__init int venet_init(void)
++{
++#ifdef CONFIG_PROC_FS
++	struct proc_dir_entry *de;
++#endif
++	int i, err;
++
++	if (get_ve0()->_venet_dev != NULL)
++		return -EEXIST;
++
++	for (i = 0; i < VEIP_HASH_SZ; i++)
++		INIT_LIST_HEAD(ip_entry_hash_table + i);
++
++	err = venet_start(get_ve0());
++	if (err)
++		return err;
++
++#ifdef CONFIG_PROC_FS
++	de = proc_create("veinfo", S_IFREG | S_IRUSR, glob_proc_vz_dir,
++			&proc_veinfo_operations);
++	if (de == NULL)
++		printk(KERN_WARNING "venet: can't make veinfo proc entry\n");
++
++	de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir,
++			&proc_veip_operations);
++	if (de == NULL)
++		printk(KERN_WARNING "venet: can't make veip proc entry\n");
++#endif
++
++	ve_hook_register(VE_SS_CHAIN, &venet_ve_hook);
++	vzioctl_register(&venetcalls);
++	return 0;
++}
++
++__exit void venet_exit(void)
++{
++	vzioctl_unregister(&venetcalls);
++	ve_hook_unregister(&venet_ve_hook);
++
++#ifdef CONFIG_PROC_FS
++	remove_proc_entry("veip", proc_vz_dir);
++	remove_proc_entry("veinfo", glob_proc_vz_dir);
++#endif
++	venet_stop(get_ve0());
++	veip_cleanup();
++}
++
++module_init(venet_init);
++module_exit(venet_exit);
+diff --git a/drivers/net/vzethdev.c b/drivers/net/vzethdev.c
+new file mode 100644
+index 0000000..1414618
+--- /dev/null
++++ b/drivers/net/vzethdev.c
+@@ -0,0 +1,692 @@
++/*
++ *  veth.c
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Virtual ethernet device used to change VE ownership on packets
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/fs.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/socket.h>
++#include <linux/errno.h>
++#include <linux/fcntl.h>
++#include <linux/in.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/nsproxy.h>
++#include <linux/tcp.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/unistd.h>
++
++#include <linux/inet.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <net/sock.h>
++#include <linux/if_ether.h>	/* For the statistics structure. */
++#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
++#include <linux/ethtool.h>
++#include <linux/ve_proto.h>
++#include <linux/veth.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_veth.h>
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/vzcalluser.h>
++#include <linux/nfcalls.h>
++
++static LIST_HEAD(veth_hwaddr_list);
++static DEFINE_RWLOCK(ve_hwaddr_lock);
++static DECLARE_MUTEX(hwaddr_sem);
++
++struct net_device * veth_dev_start(char *dev_addr, char *name);
++
++struct veth_struct *hwaddr_entry_lookup(char *name)
++{
++	struct veth_struct *entry;
++
++	list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) {
++		BUG_ON(entry->pair == NULL);
++		if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0)
++			return entry;
++	}
++	return NULL;
++}
++
++int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name,
++		char *dev_addr_ve, char *name_ve)
++{
++	struct net_device *dev_ve;
++	struct net_device *dev_ve0;
++	struct ve_struct *old_env;
++	char dev_name[IFNAMSIZ];
++	int err;
++
++	down(&hwaddr_sem);
++
++	if (name[0] == '\0')
++		snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid);
++	else {
++		memcpy(dev_name, name, IFNAMSIZ - 1);
++		dev_name[IFNAMSIZ - 1] = '\0';
++	}
++	dev_ve0 = veth_dev_start(dev_addr, dev_name);
++	if (IS_ERR(dev_ve0)) {
++		err = PTR_ERR(dev_ve0);
++		goto err;
++	}
++
++	old_env = set_exec_env(ve);
++	if (name_ve[0] == '\0')
++		sprintf(dev_name, "eth%%d");
++	else {
++		memcpy(dev_name, name_ve, IFNAMSIZ - 1);
++		dev_name[IFNAMSIZ - 1] = '\0';
++	}
++	dev_ve = veth_dev_start(dev_addr_ve, dev_name);
++	if (IS_ERR(dev_ve)) {
++		err = PTR_ERR(dev_ve);
++		goto err_ve;
++	}
++	set_exec_env(old_env);
++	veth_from_netdev(dev_ve)->pair = dev_ve0;
++	veth_from_netdev(dev_ve0)->pair = dev_ve;
++
++	write_lock(&ve_hwaddr_lock);
++	list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list);
++	write_unlock(&ve_hwaddr_lock);
++
++	up(&hwaddr_sem);
++	return 0;
++
++err_ve:
++	set_exec_env(old_env);
++	unregister_netdev(dev_ve0);
++err:
++	up(&hwaddr_sem);
++	return err;
++}
++
++void veth_pair_del(struct ve_struct *env, struct veth_struct *entry)
++{
++	struct net_device *dev;
++	struct ve_struct *old_env;
++
++	write_lock(&ve_hwaddr_lock);
++	list_del(&entry->hwaddr_list);
++	write_unlock(&ve_hwaddr_lock);
++
++	dev = entry->pair;
++	BUG_ON(entry->pair == NULL);
++
++	veth_from_netdev(dev)->pair = NULL;
++	entry->pair = NULL;
++	rtnl_lock();
++	old_env = set_exec_env(dev->owner_env);
++	dev_close(dev);
++
++	/*
++	 * Now device from VE0 does not send or receive anything,
++	 * i.e. dev->hard_start_xmit won't be called.
++	 */
++	set_exec_env(env);
++	unregister_netdevice(veth_to_netdev(entry));
++	set_exec_env(dev->owner_env);
++	unregister_netdevice(dev);
++	set_exec_env(old_env);
++	rtnl_unlock();
++}
++
++int veth_entry_del(struct ve_struct *ve, char *name)
++{
++	struct veth_struct *found;
++	int err;
++
++	err = -ENODEV;
++	down(&hwaddr_sem);
++	found = hwaddr_entry_lookup(name);
++	if (found == NULL)
++		goto out;
++	if (veth_to_netdev(found)->owner_env != ve)
++		goto out;
++
++	err = 0;
++	veth_pair_del(ve, found);
++
++out:
++	up(&hwaddr_sem);
++	return err;
++}
++
++int veth_allow_change_mac(envid_t veid, char *name, int allow)
++{
++	struct ve_struct *ve;
++	struct veth_struct *found;
++	int err;
++
++	err = -ESRCH;
++	ve = get_ve_by_id(veid);
++	if (!ve)
++		return err;
++
++	down_read(&ve->op_sem);
++	if (!ve->is_running)
++		goto out_ve;
++	err = -ENODEV;
++	down(&hwaddr_sem);
++	found = hwaddr_entry_lookup(name);
++	if (found == NULL)
++		goto out_sem;
++	if (veth_to_netdev(found)->owner_env != ve)
++		goto out_sem;
++
++	err = 0;
++	found->allow_mac_change = allow;
++
++out_sem:
++	up(&hwaddr_sem);
++out_ve:
++	up_read(&ve->op_sem);
++	put_ve(ve);
++	return err;
++}
++
++/*
++ * Device functions
++ */
++
++static int veth_open(struct net_device *dev)
++{
++	return 0;
++}
++
++static int veth_close(struct net_device *master)
++{
++	return 0;
++}
++
++static void veth_destructor(struct net_device *dev)
++{
++	free_percpu(veth_from_netdev(dev)->real_stats);
++	free_netdev(dev);
++}
++
++static struct net_device_stats *get_stats(struct net_device *dev)
++{
++	int i;
++	struct net_device_stats *stats;
++
++	stats = &veth_from_netdev(dev)->stats;
++	memset(stats, 0, sizeof(struct net_device_stats));
++	for (i = 0; i < NR_CPUS; i++) {
++		struct net_device_stats *dev_stats;
++
++		if (!cpu_possible(i))
++			continue;
++		dev_stats = veth_stats(dev, i);
++		stats->rx_bytes   += dev_stats->rx_bytes;
++		stats->tx_bytes   += dev_stats->tx_bytes;
++		stats->rx_packets += dev_stats->rx_packets;
++		stats->tx_packets += dev_stats->tx_packets;
++	}
++
++	return stats;
++}
++
++/*
++ * The higher levels take care of making this non-reentrant (it's
++ * called with bh's disabled).
++ */
++static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++	struct net_device_stats *stats;
++	struct net_device *rcv = NULL;
++	struct veth_struct *entry;
++	int length;
++
++	stats = veth_stats(dev, smp_processor_id());
++	if (unlikely(get_exec_env()->disable_net))
++		goto outf;
++
++	entry = veth_from_netdev(dev);
++	rcv = entry->pair;
++	if (!rcv)
++		/* VE going down */
++		goto outf;
++
++	if (!(rcv->flags & IFF_UP)) {
++		/* Target VE does not want to receive packets */
++		goto outf;
++	}
++
++	if (unlikely(rcv->owner_env->disable_net))
++		goto outf;
++	/* Filtering */
++	if (ve_is_super(dev->owner_env) &&
++			!veth_from_netdev(rcv)->allow_mac_change) {
++		/* from VE0 to VEX */
++		if (ve_is_super(rcv->owner_env))
++			goto out;
++		if (is_multicast_ether_addr(
++					((struct ethhdr *)skb->data)->h_dest))
++			goto out;
++		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest,
++					rcv->dev_addr))
++			goto outf;
++	} else if (!ve_is_super(dev->owner_env) &&
++			!entry->allow_mac_change) {
++		/* from VE to VE0 */
++		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source,
++					dev->dev_addr))
++			goto outf;
++	}
++
++out:
++	skb->owner_env = rcv->owner_env;
++
++	skb->dev = rcv;
++	skb->pkt_type = PACKET_HOST;
++	skb->protocol = eth_type_trans(skb, rcv);
++
++	if (skb->protocol != __constant_htons(ETH_P_IP))
++		skb_orphan(skb);
++
++	dst_release(skb->dst);
++	skb->dst = NULL;
++#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
++	nf_conntrack_put(skb->nfct);
++	skb->nfct = NULL;
++#endif
++	length = skb->len;
++
++	netif_rx(skb);
++
++	stats->tx_bytes += length;
++	stats->tx_packets++;
++	if (rcv) {
++		struct net_device_stats *rcv_stats;
++		rcv_stats = veth_stats(rcv, smp_processor_id());
++		rcv_stats->rx_bytes += length;
++		rcv_stats->rx_packets++;
++	}
++
++	return 0;
++
++outf:
++	kfree_skb(skb);
++	stats->tx_dropped++;
++	return 0;
++}
++
++static int veth_set_mac(struct net_device *dev, void *p)
++{
++	struct sockaddr *addr = p;
++
++	if (!ve_is_super(dev->owner_env) &&
++			!veth_from_netdev(dev)->allow_mac_change)
++		return -EPERM;
++	if (netif_running(dev))
++		return -EBUSY;
++	if (!is_valid_ether_addr(addr->sa_data))
++		return -EADDRNOTAVAIL;
++
++	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
++
++	return 0;
++}
++
++int veth_init_dev(struct net_device *dev)
++{
++	dev->hard_start_xmit = veth_xmit;
++	dev->get_stats = get_stats;
++	dev->open = veth_open;
++	dev->stop = veth_close;
++	dev->destructor = veth_destructor;
++
++	ether_setup(dev);
++	dev->set_mac_address = veth_set_mac;
++
++	/* remove the change_mtu handler installed by ether_setup() */
++	dev->change_mtu	= NULL;
++
++	dev->tx_queue_len = 0;
++
++	veth_from_netdev(dev)->real_stats =
++		alloc_percpu(struct net_device_stats);
++	if (veth_from_netdev(dev)->real_stats == NULL)
++		return -ENOMEM;
++
++	return 0;
++}
++
++static int
++veth_set_op(struct net_device *dev, u32 data,
++	     int (*fop)(struct net_device *, u32))
++{
++	struct net_device *pair;
++	int ret = 0;
++
++	ret = fop(dev, data);
++	if (ret < 0)
++		goto out;
++
++	pair = veth_from_netdev(dev)->pair;
++	if (pair)
++		ret = fop(pair, data);
++out:
++	return ret;
++}
++
++static int veth_op_set_sg(struct net_device *dev, u32 data)
++{
++	return veth_set_op(dev, data, ethtool_op_set_sg);
++}
++
++static int veth_op_set_tx_csum(struct net_device *dev, u32 data)
++{
++	return veth_set_op(dev, data, ethtool_op_set_tx_csum);
++}
++
++#define veth_op_set_rx_csum veth_op_set_tx_csum
++
++static struct ethtool_ops veth_ethtool_ops = {
++	.get_sg = ethtool_op_get_sg,
++	.set_sg = veth_op_set_sg,
++	.get_tx_csum = ethtool_op_get_tx_csum,
++	.set_tx_csum = veth_op_set_tx_csum,
++	.get_rx_csum = ethtool_op_get_tx_csum,
++	.set_rx_csum = veth_op_set_rx_csum,
++	.get_tso = ethtool_op_get_tso,
++};
++
++static void veth_setup(struct net_device *dev)
++{
++	dev->init = veth_init_dev;
++	/*
++	 * No other features, as they are:
++	 *  - checksumming is required, and nobody else will do our job
++	 */
++	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
++		NETIF_F_HIGHDMA;
++
++	SET_ETHTOOL_OPS(dev, &veth_ethtool_ops);
++}
++
++#ifdef CONFIG_PROC_FS
++#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
++#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5]
++static int vehwaddr_seq_show(struct seq_file *m, void *v)
++{
++	struct list_head *p;
++	struct veth_struct *entry;
++
++	p = (struct list_head *)v;
++	if (p == &veth_hwaddr_list) {
++		seq_puts(m, "Version: 1.0\n");
++		return 0;
++	}
++	entry = list_entry(p, struct veth_struct, hwaddr_list);
++	seq_printf(m, ADDR_FMT " %16s ",
++			ADDR_ARG(entry->pair->dev_addr), entry->pair->name);
++	seq_printf(m, ADDR_FMT " %16s %10u %5s\n",
++			ADDR_ARG(veth_to_netdev(entry)->dev_addr),
++			veth_to_netdev(entry)->name,
++			VEID(veth_to_netdev(entry)->owner_env),
++			entry->allow_mac_change ? "allow" : "deny");
++	return 0;
++}
++
++static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos)
++{
++	read_lock(&ve_hwaddr_lock);
++	return seq_list_start_head(&veth_hwaddr_list, *pos);
++}
++
++static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	return seq_list_next(v, &veth_hwaddr_list, pos);
++}
++
++static void vehwaddr_seq_stop(struct seq_file *m, void *v)
++{
++	read_unlock(&ve_hwaddr_lock);
++}
++
++static struct seq_operations vehwaddr_seq_op = {
++	.start	= vehwaddr_seq_start,
++	.next	= vehwaddr_seq_next,
++	.stop	= vehwaddr_seq_stop,
++	.show	= vehwaddr_seq_show,
++};
++
++static int vehwaddr_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &vehwaddr_seq_op);
++}
++
++static struct file_operations proc_vehwaddr_operations = {
++	.open		= vehwaddr_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++#endif
++
++int real_ve_hwaddr(envid_t veid, int op,
++		unsigned char *dev_addr, int addrlen, char *name,
++		unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve)
++{
++	int err;
++	struct ve_struct *ve;
++	char ve_addr[ETH_ALEN];
++
++	err = -EPERM;
++	if (!capable(CAP_NET_ADMIN))
++		goto out;
++
++	err = -EINVAL;
++	switch (op) {
++	case VE_ETH_ADD:
++		if (addrlen != ETH_ALEN)
++			goto out;
++		if (addrlen_ve != ETH_ALEN && addrlen_ve != 0)
++			goto out;
++		/* If the VE addr is not set, reuse dev_addr with bit 0x80 set in byte 3 */
++		if (addrlen_ve == 0 && (dev_addr[3] & 0x80))
++			goto out;
++		if (addrlen_ve == 0) {
++			memcpy(ve_addr, dev_addr, ETH_ALEN);
++			ve_addr[3] |= 0x80;
++		} else {
++			memcpy(ve_addr, dev_addr_ve, ETH_ALEN);
++		}
++
++		ve = get_ve_by_id(veid);
++		err = -ESRCH;
++		if (!ve)
++			goto out;
++
++		down_read(&ve->op_sem);
++		if (ve->is_running)
++			err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve);
++		up_read(&ve->op_sem);
++		put_ve(ve);
++		break;
++
++	case VE_ETH_DEL:
++		if (name[0] == '\0')
++			goto out;
++		ve = get_ve_by_id(veid);
++		err = -ESRCH;
++		if (!ve)
++			goto out;
++
++		down_read(&ve->op_sem);
++		if (ve->is_running)
++			err = veth_entry_del(ve, name);
++		up_read(&ve->op_sem);
++		put_ve(ve);
++		break;
++	case VE_ETH_ALLOW_MAC_CHANGE:
++	case VE_ETH_DENY_MAC_CHANGE:
++		err = veth_allow_change_mac(veid, name,
++						op == VE_ETH_ALLOW_MAC_CHANGE);
++		break;
++	}
++
++out:
++	return err;
++}
++
++int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	err = -ENOTTY;
++	switch (cmd) {
++	case VETHCTL_VE_HWADDR: {
++		struct vzctl_ve_hwaddr s;
++
++		err = -EFAULT;
++		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++			break;
++		err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen,
++				     s.dev_name, s.dev_addr_ve, s.addrlen_ve,
++				     s.dev_name_ve);
++	}
++	break;
++	}
++	return err;
++}
++
++static struct vzioctlinfo vethcalls = {
++	.type		= VETHCTLTYPE,
++	.ioctl		= veth_ioctl,
++	.compat_ioctl	= veth_ioctl,
++	.owner		= THIS_MODULE,
++};
++
++struct net_device * veth_dev_start(char *dev_addr, char *name)
++{
++	struct net_device *dev;
++	int err;
++
++	if (!is_valid_ether_addr(dev_addr))
++		return ERR_PTR(-EADDRNOTAVAIL);
++
++	dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup);
++	if (!dev)
++		return ERR_PTR(-ENOMEM);
++	dev->nd_net = get_exec_env()->ve_netns;
++	if (strchr(dev->name, '%')) {
++		err = dev_alloc_name(dev, dev->name);
++		if (err < 0)
++			goto err;
++	}
++	if ((err = register_netdev(dev)) != 0)
++		goto err;
++
++	memcpy(dev->dev_addr, dev_addr, ETH_ALEN);
++	dev->addr_len = ETH_ALEN;
++
++	return dev;
++err:
++	free_netdev(dev);
++	printk(KERN_ERR "%s initialization error err=%d\n", name, err);
++	return ERR_PTR(err);
++}
++
++static int veth_start(void *data)
++{
++	return 0;
++}
++
++static void veth_stop(void *data)
++{
++	struct ve_struct *env;
++	struct veth_struct *entry, *tmp;
++
++	env = (struct ve_struct *)data;
++	down(&hwaddr_sem);
++	list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list)
++		if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env))
++			veth_pair_del(env, entry);
++	up(&hwaddr_sem);
++}
++
++static struct ve_hook veth_ve_hook = {
++	.init	  = veth_start,
++	.fini	  = veth_stop,
++	.owner	  = THIS_MODULE,
++	.priority = HOOK_PRIO_NET,
++};
++
++__init int veth_init(void)
++{
++#ifdef CONFIG_PROC_FS
++	struct proc_dir_entry *de;
++
++	de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir,
++			&proc_vehwaddr_operations);
++	if (de == NULL)
++		printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n");
++#endif
++
++	ve_hook_register(VE_SS_CHAIN, &veth_ve_hook);
++	vzioctl_register(&vethcalls);
++	KSYMRESOLVE(veth_open);
++	KSYMMODRESOLVE(vzethdev);
++	return 0;
++}
++
++__exit void veth_exit(void)
++{
++	struct veth_struct *entry;
++	struct list_head *tmp, *n;
++	struct ve_struct *ve;
++
++	KSYMMODUNRESOLVE(vzethdev);
++	KSYMUNRESOLVE(veth_open);
++	vzioctl_unregister(&vethcalls);
++	ve_hook_unregister(&veth_ve_hook);
++#ifdef CONFIG_PROC_FS
++	remove_proc_entry("veth", proc_vz_dir);
++#endif
++
++	down(&hwaddr_sem);
++	list_for_each_safe(tmp, n, &veth_hwaddr_list) {
++		entry = list_entry(tmp, struct veth_struct, hwaddr_list);
++		ve = get_ve(veth_to_netdev(entry)->owner_env);
++
++		veth_pair_del(ve, entry);
++
++		put_ve(ve);
++	}
++	up(&hwaddr_sem);
++}
++
++module_init(veth_init);
++module_exit(veth_exit);
++
++MODULE_AUTHOR("Andrey Mirkin <amirkin at sw.ru>");
++MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device");
++MODULE_LICENSE("GPL v2");
++
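In the VE_ETH_ADD branch above, real_ve_hwaddr() derives the container-side MAC
when userspace does not supply one: the host-side address is reused with bit 0x80
set in byte 3, and host addresses that already carry that bit are rejected so the
two ends of a pair cannot collide. A standalone sketch of the rule:

#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

/* Same derivation as the VE_ETH_ADD branch of real_ve_hwaddr(). */
static int derive_ve_mac(const unsigned char *host, unsigned char *ve)
{
	if (host[3] & 0x80)	/* bit reserved for the derived side */
		return -1;
	memcpy(ve, host, ETH_ALEN);
	ve[3] |= 0x80;
	return 0;
}

int main(void)
{
	unsigned char host[ETH_ALEN] = { 0x00, 0x18, 0x51, 0x01, 0x02, 0x03 };
	unsigned char ve[ETH_ALEN];
	int i;

	if (derive_ve_mac(host, ve))
		return 1;
	/* prints 00:18:51:81:02:03 */
	for (i = 0; i < ETH_ALEN; i++)
		printf("%02x%c", ve[i], i < ETH_ALEN - 1 ? ':' : '\n');
	return 0;
}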
+diff --git a/fs/Kconfig b/fs/Kconfig
+index 2694648..be9d729 100644
+--- a/fs/Kconfig
++++ b/fs/Kconfig
+@@ -554,13 +554,22 @@ config QUOTA_NETLINK_INTERFACE
+ config PRINT_QUOTA_WARNING
+ 	bool "Print quota warnings to console (OBSOLETE)"
+ 	depends on QUOTA
+-	default y
++	default n
+ 	help
+ 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+ 	  hardlimit, etc.) will be printed to the process' controlling terminal.
+ 	  Note that this behavior is currently deprecated and may go away in
+ 	  future. Please use notification via netlink socket instead.
+ 
++config QUOTA_COMPAT
++	bool "Compatibility with older quotactl interface"
++	depends on QUOTA
++	help
++	  This option enables a compatibility layer for an older version
++	  of the quotactl interface with byte granularity (QUOTAON at 0x0100,
++	  GETQUOTA at 0x0D00).  Interface versions older than that one and
++	  versions with block granularity are still not supported.
++
+ config QFMT_V1
+ 	tristate "Old quota format support"
+ 	depends on QUOTA
+@@ -576,6 +585,40 @@ config QFMT_V2
+ 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
+ 	  need this functionality say Y here.
+ 
++config SIM_FS
++	tristate "VPS filesystem"
++	depends on VZ_QUOTA
++	default m
++	help
++	  This file system is a part of Virtuozzo.  It introduces a fake
++	  superblock and block device into a VE to hide the real device and
++	  to show statfs results taken from the quota.
++
++config VZ_QUOTA
++	tristate "Virtuozzo Disk Quota support"
++	select QUOTA
++	select QUOTA_COMPAT
++	select VZ_DEV
++	default m
++	help
++	  Virtuozzo Disk Quota imposes a disk quota on directories together
++	  with all of their files and subdirectories.  Such quotas are used
++	  to account and limit disk usage by a Virtuozzo VPS, but may also
++	  be used separately.
++
++config VZ_QUOTA_UNLOAD
++	bool "Unloadable Virtuozzo Disk Quota module"
++	depends on VZ_QUOTA=m
++	default n
++	help
++	  Make the Virtuozzo Disk Quota module unloadable.
++	  This does not work reliably yet.
++
++config VZ_QUOTA_UGID
++	bool "Per-user and per-group quota in Virtuozzo quota partitions"
++	depends on VZ_QUOTA!=n
++	default y
++
+ config QUOTACTL
+ 	bool
+ 	depends on XFS_QUOTA || QUOTA
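With the defaults declared above, and VZ_QUOTA selecting QUOTA, QUOTA_COMPAT and
VZ_DEV, an otherwise untouched configuration should come out roughly as:

CONFIG_QUOTA_COMPAT=y
CONFIG_SIM_FS=m
CONFIG_VZ_QUOTA=m
# CONFIG_VZ_QUOTA_UNLOAD is not set
CONFIG_VZ_QUOTA_UGID=y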
+diff --git a/fs/Makefile b/fs/Makefile
+index 1e7a11b..4c87b36 100644
+--- a/fs/Makefile
++++ b/fs/Makefile
+@@ -53,9 +53,15 @@ obj-$(CONFIG_QUOTA)		+= dquot.o
+ obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
+ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+ obj-$(CONFIG_QUOTACTL)		+= quota.o
++obj-$(CONFIG_VZ_QUOTA)		+= vzdquota.o
++vzdquota-y			+= vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o
+ 
+ obj-$(CONFIG_DNOTIFY)		+= dnotify.o
+ 
++obj-$(CONFIG_SIM_FS)		+= simfs.o
++
+ obj-$(CONFIG_PROC_FS)		+= proc/
+ obj-y				+= partitions/
+ obj-$(CONFIG_SYSFS)		+= sysfs/
+diff --git a/fs/aio.c b/fs/aio.c
+index 0fb3117..1a5d0d4 100644
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -43,13 +43,16 @@
+ #endif
+ 
+ /*------ sysctl variables----*/
+-static DEFINE_SPINLOCK(aio_nr_lock);
++DEFINE_SPINLOCK(aio_nr_lock);
++EXPORT_SYMBOL_GPL(aio_nr_lock);
+ unsigned long aio_nr;		/* current system wide number of aio requests */
++EXPORT_SYMBOL_GPL(aio_nr);
+ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+ /*----end sysctl variables---*/
+ 
+ static struct kmem_cache	*kiocb_cachep;
+-static struct kmem_cache	*kioctx_cachep;
++struct kmem_cache		*kioctx_cachep;
++EXPORT_SYMBOL_GPL(kioctx_cachep);
+ 
+ static struct workqueue_struct *aio_wq;
+ 
+@@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
+ static DEFINE_SPINLOCK(fput_lock);
+ static LIST_HEAD(fput_head);
+ 
+-static void aio_kick_handler(struct work_struct *);
++void aio_kick_handler(struct work_struct *);
+ static void aio_queue_work(struct kioctx *);
+ 
+ /* aio_setup
+@@ -327,7 +330,7 @@ static void aio_cancel_all(struct kioctx *ctx)
+ 	spin_unlock_irq(&ctx->ctx_lock);
+ }
+ 
+-static void wait_for_all_aios(struct kioctx *ctx)
++void wait_for_all_aios(struct kioctx *ctx)
+ {
+ 	struct task_struct *tsk = current;
+ 	DECLARE_WAITQUEUE(wait, tsk);
+@@ -350,6 +353,7 @@ static void wait_for_all_aios(struct kioctx *ctx)
+ out:
+ 	spin_unlock_irq(&ctx->ctx_lock);
+ }
++EXPORT_SYMBOL_GPL(wait_for_all_aios);
+ 
+ /* wait_on_sync_kiocb:
+  *	Waits on the given sync kiocb to complete.
+@@ -838,7 +842,7 @@ static inline void aio_run_all_iocbs(struct kioctx *ctx)
+  *      space.
+  * Run on aiod's context.
+  */
+-static void aio_kick_handler(struct work_struct *work)
++void aio_kick_handler(struct work_struct *work)
+ {
+ 	struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
+ 	mm_segment_t oldfs = get_fs();
+@@ -859,7 +863,7 @@ static void aio_kick_handler(struct work_struct *work)
+ 	if (requeue)
+ 		queue_delayed_work(aio_wq, &ctx->wq, 0);
+ }
+-
++EXPORT_SYMBOL_GPL(aio_kick_handler);
+ 
+ /*
+  * Called by kick_iocb to queue the kiocb for retry
+diff --git a/fs/autofs/init.c b/fs/autofs/init.c
+index cea5219..1217caf 100644
+--- a/fs/autofs/init.c
++++ b/fs/autofs/init.c
+@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = {
+ 	.name		= "autofs",
+ 	.get_sb		= autofs_get_sb,
+ 	.kill_sb	= autofs_kill_sb,
++	.fs_flags	= FS_VIRTUALIZED,
+ };
+ 
+ static int __init init_autofs_fs(void)
+diff --git a/fs/autofs/root.c b/fs/autofs/root.c
+index 8aacade..f273f47 100644
+--- a/fs/autofs/root.c
++++ b/fs/autofs/root.c
+@@ -362,7 +362,7 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
+ 
+ 	/* This allows root to remove symlinks */
+ 	lock_kernel();
+-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
++	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) {
+ 		unlock_kernel();
+ 		return -EACCES;
+ 	}
+@@ -556,7 +556,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
+ 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
+ 		return -ENOTTY;
+ 	
+-	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
++	if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	
+ 	switch(cmd) {
+diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
+index 723a1c5..01ac1e0 100644
+--- a/fs/autofs4/init.c
++++ b/fs/autofs4/init.c
+@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = {
+ 	.name		= "autofs",
+ 	.get_sb		= autofs_get_sb,
+ 	.kill_sb	= autofs4_kill_sb,
++	.fs_flags	= FS_VIRTUALIZED,
+ };
+ 
+ static int __init init_autofs4_fs(void)
+diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
+index edf5b6b..4e9cacc 100644
+--- a/fs/autofs4/root.c
++++ b/fs/autofs4/root.c
+@@ -785,7 +785,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
+ 	struct autofs_info *p_ino;
+ 	
+ 	/* This allows root to remove symlinks */
+-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
++	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
+ 		return -EACCES;
+ 
+ 	if (atomic_dec_and_test(&ino->count)) {
+@@ -1005,7 +1005,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
+ 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
+ 		return -ENOTTY;
+ 	
+-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
++	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	
+ 	switch(cmd) {
+diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
+index 75e5955..67d444c 100644
+--- a/fs/autofs4/waitq.c
++++ b/fs/autofs4/waitq.c
+@@ -136,6 +136,16 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+ 		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+ 
+ 		pktsz = sizeof(*packet);
++#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
++		/*
++		 * On x86_64 the autofs_v5_packet struct is padded with 4
++		 * extra bytes, which breaks autofs daemons running in ia32
++		 * emulation mode.  Reduce the size for 32-bit callers to
++		 * match what userspace expects.
++		 */
++		if (test_thread_flag(TIF_IA32))
++			pktsz -= 4;
++#endif
+ 
+ 		packet->wait_queue_token = wq->wait_queue_token;
+ 		packet->len = wq->len;
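The 4-byte discrepancy compensated for above is plain ABI padding:
autofs_v5_packet compiles to a different size for i386 than for x86_64, so a
32-bit daemon under ia32 emulation reads short. A toy struct (not the real
packet layout) shows the effect:

#include <stdio.h>

/* The long member is 8-byte aligned on x86_64 but 4-byte aligned on
 * i386, so the same declaration yields different sizes per ABI. */
struct toy_packet {
	int type;
	long token;
	int len;
};

int main(void)
{
	/* gcc on x86: 24 when built -m64, 12 when built -m32 */
	printf("sizeof(struct toy_packet) = %zu\n",
	       sizeof(struct toy_packet));
	return 0;
}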
+diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
+index ba4cddb..8430452 100644
+--- a/fs/binfmt_aout.c
++++ b/fs/binfmt_aout.c
+@@ -375,12 +375,12 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+ 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+ 		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
+ 		{
+-			printk(KERN_NOTICE "executable not page aligned\n");
++			ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
+ 		}
+ 
+ 		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
+ 		{
+-			printk(KERN_WARNING 
++			ve_printk(VE_LOG, KERN_WARNING 
+ 			       "fd_offset is not page aligned. Please convert program: %s\n",
+ 			       bprm->file->f_path.dentry->d_name.name);
+ 		}
+@@ -495,7 +495,7 @@ static int load_aout_library(struct file *file)
+ 
+ 		if (printk_ratelimit())
+ 		{
+-			printk(KERN_WARNING 
++			ve_printk(VE_LOG, KERN_WARNING 
+ 			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
+ 			       file->f_path.dentry->d_name.name);
+ 		}
+diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
+index d48ff5f..67a3eaa 100644
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -403,7 +403,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+ 	eppnt = elf_phdata;
+ 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+ 		if (eppnt->p_type == PT_LOAD) {
+-			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
++			int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO;
+ 			int elf_prot = 0;
+ 			unsigned long vaddr = 0;
+ 			unsigned long k, map_addr;
+@@ -785,7 +785,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+ 		if (elf_ppnt->p_flags & PF_X)
+ 			elf_prot |= PROT_EXEC;
+ 
+-		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
++		elf_flags = MAP_PRIVATE | MAP_DENYWRITE |
++				MAP_EXECUTABLE | MAP_EXECPRIO;
+ 
+ 		vaddr = elf_ppnt->p_vaddr;
+ 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+@@ -920,7 +921,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+ 	set_binfmt(&elf_format);
+ 
+ #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+-	retval = arch_setup_additional_pages(bprm, executable_stack);
++	retval = arch_setup_additional_pages(bprm, executable_stack, 0);
+ 	if (retval < 0) {
+ 		send_sig(SIGKILL, current, 0);
+ 		goto out;
+@@ -1517,7 +1518,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 	 * Allocate a structure for each thread.
+ 	 */
+ 	rcu_read_lock();
+-	do_each_thread(g, p)
++	do_each_thread_ve(g, p)
+ 		if (p->mm == dump_task->mm) {
+ 			t = kzalloc(offsetof(struct elf_thread_core_info,
+ 					     notes[info->thread_notes]),
+@@ -1539,7 +1540,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 				info->thread->next = t;
+ 			}
+ 		}
+-	while_each_thread(g, p);
++	while_each_thread_ve(g, p);
+ 	rcu_read_unlock();
+ 
+ 	/*
+@@ -1721,7 +1722,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 	if (signr) {
+ 		struct elf_thread_status *ets;
+ 		rcu_read_lock();
+-		do_each_thread(g, p)
++		do_each_thread_ve(g, p)
+ 			if (current->mm == p->mm && current != p) {
+ 				ets = kzalloc(sizeof(*ets), GFP_ATOMIC);
+ 				if (!ets) {
+@@ -1731,7 +1732,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 				ets->thread = p;
+ 				list_add(&ets->list, &info->thread_list);
+ 			}
+-		while_each_thread(g, p);
++		while_each_thread_ve(g, p);
+ 		rcu_read_unlock();
+ 		list_for_each(t, &info->thread_list) {
+ 			int sz;
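The binfmt_elf hunks swap do_each_thread()/while_each_thread() for their _ve
variants so core-dump note collection only visits tasks that belong to the
dumping container. A toy of the filtered walk (hypothetical fields; the real
macros traverse kernel task lists):

#include <stdio.h>

struct toy_task {
	int pid;
	int veid;	/* owning container */
};

static struct toy_task tasks[] = {
	{ 101, 0 }, { 202, 101 }, { 203, 101 }, { 310, 102 },
};

/* do_each_thread_ve()-style walk: skip tasks outside the current VE */
static void dump_threads_of_ve(int cur_veid)
{
	unsigned int i;

	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		if (tasks[i].veid != cur_veid)
			continue;
		printf("collecting notes for pid %d\n", tasks[i].pid);
	}
}

int main(void)
{
	dump_threads_of_ve(101);	/* only pids 202 and 203 match */
	return 0;
}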
+diff --git a/fs/block_dev.c b/fs/block_dev.c
+index 10d8a0a..fd077d4 100644
+--- a/fs/block_dev.c
++++ b/fs/block_dev.c
+@@ -1304,7 +1304,7 @@ int __invalidate_device(struct block_device *bdev)
+ 		 * hold).
+ 		 */
+ 		shrink_dcache_sb(sb);
+-		res = invalidate_inodes(sb);
++		res = invalidate_inodes_check(sb, 1);
+ 		drop_super(sb);
+ 	}
+ 	invalidate_bdev(bdev);
+diff --git a/fs/buffer.c b/fs/buffer.c
+index 0f51c0f..9585ec2 100644
+--- a/fs/buffer.c
++++ b/fs/buffer.c
+@@ -700,6 +700,8 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
+ static int __set_page_dirty(struct page *page,
+ 		struct address_space *mapping, int warn)
+ {
++	int acct = 0;
++
+ 	if (unlikely(!mapping))
+ 		return !TestSetPageDirty(page);
+ 
+@@ -714,12 +716,14 @@ static int __set_page_dirty(struct page *page,
+ 			__inc_zone_page_state(page, NR_FILE_DIRTY);
+ 			__inc_bdi_stat(mapping->backing_dev_info,
+ 					BDI_RECLAIMABLE);
+-			task_io_account_write(PAGE_CACHE_SIZE);
++			acct = 1;
+ 		}
+ 		radix_tree_tag_set(&mapping->page_tree,
+ 				page_index(page), PAGECACHE_TAG_DIRTY);
+ 	}
+ 	write_unlock_irq(&mapping->tree_lock);
++	if (acct)
++		task_io_account_write(page, PAGE_CACHE_SIZE, 0);
+ 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ 
+ 	return 1;
+diff --git a/fs/char_dev.c b/fs/char_dev.c
+index 68e510b..8fd1195 100644
+--- a/fs/char_dev.c
++++ b/fs/char_dev.c
+@@ -22,6 +22,8 @@
+ #include <linux/mutex.h>
+ #include <linux/backing-dev.h>
+ 
++#include <linux/ve_proto.h>
++
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+diff --git a/fs/compat.c b/fs/compat.c
+index ed43e17..9ab3698 100644
+--- a/fs/compat.c
++++ b/fs/compat.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/vfs.h>
++#include <linux/virtinfo.h>
+ #include <linux/ioctl.h>
+ #include <linux/init.h>
+ #include <linux/smb.h>
+@@ -73,6 +74,18 @@ int compat_printk(const char *fmt, ...)
+ 
+ #include "read_write.h"
+ 
++int ve_compat_printk(int dst, const char *fmt, ...)
++{
++	va_list ap;
++	int ret;
++	if (!compat_log)
++		return 0;
++	va_start(ap, fmt);
++	ret = ve_vprintk(dst, fmt, ap);
++	va_end(ap);
++	return ret;
++}
++
+ /*
+  * Not all architectures have sys_utime, so implement this in terms
+  * of sys_utimes.
+@@ -244,6 +257,8 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
+ 		struct kstatfs tmp;
+ 		error = vfs_statfs(nd.path.dentry, &tmp);
+ 		if (!error)
++			error = faudit_statfs(nd.path.mnt->mnt_sb, &tmp);
++		if (!error)
+ 			error = put_compat_statfs(buf, &tmp);
+ 		path_put(&nd.path);
+ 	}
+@@ -262,6 +277,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
+ 		goto out;
+ 	error = vfs_statfs(file->f_path.dentry, &tmp);
+ 	if (!error)
++		error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
++	if (!error)
+ 		error = put_compat_statfs(buf, &tmp);
+ 	fput(file);
+ out:
+@@ -312,6 +329,8 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
+ 		struct kstatfs tmp;
+ 		error = vfs_statfs(nd.path.dentry, &tmp);
+ 		if (!error)
++			error = faudit_statfs(nd.path.mnt->mnt_sb, &tmp);
++		if (!error)
+ 			error = put_compat_statfs64(buf, &tmp);
+ 		path_put(&nd.path);
+ 	}
+@@ -333,6 +352,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
+ 		goto out;
+ 	error = vfs_statfs(file->f_path.dentry, &tmp);
+ 	if (!error)
++		error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
++	if (!error)
+ 		error = put_compat_statfs64(buf, &tmp);
+ 	fput(file);
+ out:
+@@ -1351,6 +1372,10 @@ int compat_do_execve(char * filename,
+ 	struct file *file;
+ 	int retval;
+ 
++	retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL);
++	if (retval)
++		return retval;
++
+ 	retval = -ENOMEM;
+ 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ 	if (!bprm)
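The recurring statfs pattern in the hunks above (vfs_statfs() first, then
faudit_statfs() on success) lets the virtualization layer rewrite the numbers a
container observes, e.g. substituting quota limits for the host filesystem's
real capacity. A toy model of the two-stage fill, with hypothetical types:

#include <stdio.h>

struct toy_kstatfs {
	unsigned long f_blocks;
	unsigned long f_bfree;
};

/* stage 1: what the host filesystem reports */
static int toy_vfs_statfs(struct toy_kstatfs *st)
{
	st->f_blocks = 1UL << 20;
	st->f_bfree  = 1UL << 19;
	return 0;
}

/* stage 2: overwrite with the container's (pretend) quota view */
static int toy_faudit_statfs(struct toy_kstatfs *st)
{
	st->f_blocks = 1000;
	st->f_bfree  = 600;
	return 0;
}

int main(void)
{
	struct toy_kstatfs st;
	int err;

	err = toy_vfs_statfs(&st);
	if (!err)
		err = toy_faudit_statfs(&st);	/* same chaining as above */
	if (!err)
		printf("blocks=%lu free=%lu\n", st.f_blocks, st.f_bfree);
	return err;
}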
+diff --git a/fs/dcache.c b/fs/dcache.c
+index 6068c25..422d2b4 100644
+--- a/fs/dcache.c
++++ b/fs/dcache.c
+@@ -27,13 +27,20 @@
+ #include <linux/module.h>
+ #include <linux/mount.h>
+ #include <linux/file.h>
++#include <linux/namei.h>
+ #include <asm/uaccess.h>
+ #include <linux/security.h>
+ #include <linux/seqlock.h>
+ #include <linux/swap.h>
+ #include <linux/bootmem.h>
++#include <linux/kernel_stat.h>
++#include <linux/vzstat.h>
++#include <linux/fdtable.h>
++#include <net/inet_sock.h>
+ #include "internal.h"
+ 
++#include <bc/dcache.h>
++#include <bc/dcache_op.h>
+ 
+ int sysctl_vfs_cache_pressure __read_mostly = 100;
+ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+@@ -43,7 +50,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+ 
+ EXPORT_SYMBOL(dcache_lock);
+ 
+-static struct kmem_cache *dentry_cache __read_mostly;
++struct kmem_cache *dentry_cache __read_mostly;
+ 
+ #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
+ 
+@@ -146,6 +153,7 @@ static struct dentry *d_kill(struct dentry *dentry)
+ 
+ 	list_del(&dentry->d_u.d_child);
+ 	dentry_stat.nr_dentry--;	/* For d_free, below */
++	preempt_enable_no_resched();
+ 	/*drops the locks, at that point nobody can reach this dentry */
+ 	dentry_iput(dentry);
+ 	parent = dentry->d_parent;
+@@ -184,21 +192,31 @@ static struct dentry *d_kill(struct dentry *dentry)
+ 
+ void dput(struct dentry *dentry)
+ {
++	struct user_beancounter *ub;
++	unsigned long d_ubsize;
++
+ 	if (!dentry)
+ 		return;
+ 
+ repeat:
+ 	if (atomic_read(&dentry->d_count) == 1)
+ 		might_sleep();
+-	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
+-		return;
++	preempt_disable();
++	if (unlikely(ub_dentry_on)) {
++		spin_lock(&dcache_lock);
++		if (!atomic_dec_and_test(&dentry->d_count)) {
++			ub_dentry_uncharge_locked(dentry);
++			spin_unlock(&dcache_lock);
++			goto out_preempt;
++		}
++	} else {
++		if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
++			goto out_preempt;
++	}
+ 
+ 	spin_lock(&dentry->d_lock);
+-	if (atomic_read(&dentry->d_count)) {
+-		spin_unlock(&dentry->d_lock);
+-		spin_unlock(&dcache_lock);
+-		return;
+-	}
++	if (atomic_read(&dentry->d_count))
++		goto out_unlock;
+ 
+ 	/*
+ 	 * AV: ->d_delete() is _NOT_ allowed to block now.
+@@ -215,17 +233,30 @@ repeat:
+   		list_add(&dentry->d_lru, &dentry_unused);
+   		dentry_stat.nr_unused++;
+   	}
++out_unlock:
+  	spin_unlock(&dentry->d_lock);
++	ub_dentry_uncharge_locked(dentry);
+ 	spin_unlock(&dcache_lock);
++out_preempt:
++	preempt_enable();
+ 	return;
+ 
+ unhash_it:
+ 	__d_drop(dentry);
+ kill_it:
+ 	dentry_lru_remove(dentry);
++
++	ub = dentry->dentry_bc.d_ub;
++	d_ubsize = dentry->dentry_bc.d_ubsize;
+ 	dentry = d_kill(dentry);
+-	if (dentry)
++	preempt_disable();
++	if (unlikely(ub_dentry_on)) {
++		uncharge_dcache(ub, d_ubsize);
++		put_beancounter(ub);
++	}
++	if (dentry)
+ 		goto repeat;
++	preempt_enable();
+ }
+ 
+ /**
+@@ -291,6 +322,7 @@ static inline struct dentry * __dget_locked(struct dentry *dentry)
+ {
+ 	atomic_inc(&dentry->d_count);
+ 	dentry_lru_remove(dentry);
++	ub_dentry_charge_nofail(dentry);
+ 	return dentry;
+ }
+ 
+@@ -393,6 +425,7 @@ static void prune_one_dentry(struct dentry * dentry)
+ 	__acquires(dcache_lock)
+ {
+ 	__d_drop(dentry);
++	preempt_disable();
+ 	dentry = d_kill(dentry);
+ 
+ 	/*
+@@ -408,6 +441,7 @@ static void prune_one_dentry(struct dentry * dentry)
+ 			dentry->d_op->d_delete(dentry);
+ 		dentry_lru_remove(dentry);
+ 		__d_drop(dentry);
++		preempt_disable();
+ 		dentry = d_kill(dentry);
+ 		spin_lock(&dcache_lock);
+ 	}
+@@ -701,6 +735,8 @@ void shrink_dcache_for_umount(struct super_block *sb)
+ 
+ 	dentry = sb->s_root;
+ 	sb->s_root = NULL;
++	/* "/" was also charged in d_alloc_root() */
++	ub_dentry_uncharge(dentry);
+ 	atomic_dec(&dentry->d_count);
+ 	shrink_dcache_for_umount_subtree(dentry);
+ 
+@@ -860,12 +896,18 @@ void shrink_dcache_parent(struct dentry * parent)
+  */
+ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+ {
++	int res = -1;
++
++	KSTAT_PERF_ENTER(shrink_dcache)
+ 	if (nr) {
+ 		if (!(gfp_mask & __GFP_FS))
+-			return -1;
++			goto out;
+ 		prune_dcache(nr, NULL);
+ 	}
+-	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
++	res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
++out:
++	KSTAT_PERF_LEAVE(shrink_dcache)
++	return res;
+ }
+ 
+ static struct shrinker dcache_shrinker = {
+@@ -888,21 +930,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
+ 	struct dentry *dentry;
+ 	char *dname;
+ 
++	dname = NULL;
++	if (name->len > DNAME_INLINE_LEN-1) {
++		dname = kmalloc(name->len + 1, GFP_KERNEL);
++		if (!dname)
++			goto err_name;
++	}
++
++	ub_dentry_alloc_start();
++
+ 	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ 	if (!dentry)
+-		return NULL;
++		goto err_alloc;
+ 
+-	if (name->len > DNAME_INLINE_LEN-1) {
+-		dname = kmalloc(name->len + 1, GFP_KERNEL);
+-		if (!dname) {
+-			kmem_cache_free(dentry_cache, dentry); 
+-			return NULL;
+-		}
+-	} else  {
++	preempt_disable();
++	if (dname == NULL)
+ 		dname = dentry->d_iname;
+-	}	
+ 	dentry->d_name.name = dname;
+ 
++	if (ub_dentry_alloc(dentry))
++		goto err_charge;
++
+ 	dentry->d_name.len = name->len;
+ 	dentry->d_name.hash = name->hash;
+ 	memcpy(dname, name->name, name->len);
+@@ -933,12 +981,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
+ 	}
+ 
+ 	spin_lock(&dcache_lock);
+-	if (parent)
++	if (parent) {
+ 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
++		if (parent->d_flags & DCACHE_VIRTUAL)
++			dentry->d_flags |= DCACHE_VIRTUAL;
++	}
+ 	dentry_stat.nr_dentry++;
+ 	spin_unlock(&dcache_lock);
++	preempt_enable();
++	ub_dentry_alloc_end();
+ 
+ 	return dentry;
++
++err_charge:
++	preempt_enable();
++	kmem_cache_free(dentry_cache, dentry);
++err_alloc:
++	if (name->len > DNAME_INLINE_LEN - 1)
++		kfree(dname);
++	ub_dentry_alloc_end();
++err_name:
++	return NULL;
+ }
+ 
+ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
+@@ -1244,12 +1307,12 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+ 	unsigned int hash = name->hash;
+ 	const unsigned char *str = name->name;
+ 	struct hlist_head *head = d_hash(parent,hash);
+-	struct dentry *found = NULL;
+ 	struct hlist_node *node;
+-	struct dentry *dentry;
++	struct dentry *dentry, *found;
+ 
+ 	rcu_read_lock();
+ 	
++	found = NULL;
+ 	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
+ 		struct qstr *qstr;
+ 
+@@ -1286,6 +1349,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
+ 		if (!d_unhashed(dentry)) {
+ 			atomic_inc(&dentry->d_count);
+ 			found = dentry;
++			if (ub_dentry_charge(found))
++				goto charge_failure;
+ 		}
+ 		spin_unlock(&dentry->d_lock);
+ 		break;
+@@ -1295,6 +1360,14 @@ next:
+  	rcu_read_unlock();
+ 
+  	return found;
++
++charge_failure:
++	spin_unlock(&found->d_lock);
++	rcu_read_unlock();
++	/* dentry is now unhashed, just kill it */
++	dput(found);
++	/* ... and fail lookup */
++	return NULL;
+ }
+ 
+ /**
+@@ -1763,6 +1836,16 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+ }
+ 
+ /**
++ * d_root_check - checks if dentry is accessible from current's fs root
++ * @dentry: dentry to be verified
++ * @vfsmnt: vfsmnt to which the dentry belongs
++ */
++int d_root_check(struct path *path)
++{
++	return PTR_ERR(d_path(path, NULL, 0));
++}
++
++/**
+  * __d_path - return the path of a dentry
+  * @path: the dentry/vfsmount to report
+  * @root: root vfsmnt/dentry (may be modified by this function)
+@@ -1786,18 +1869,21 @@ char *__d_path(const struct path *path, struct path *root,
+ 	struct vfsmount *vfsmnt = path->mnt;
+ 	char *end = buffer + buflen;
+ 	char *retval;
++	int deleted;
++	struct vfsmount *oldmnt = vfsmnt;
+ 
+ 	spin_lock(&vfsmount_lock);
+-	prepend(&end, &buflen, "\0", 1);
+-	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+-		(prepend(&end, &buflen, " (deleted)", 10) != 0))
++	if (buffer) {
++		prepend(&end, &buflen, "\0", 1);
++		if (buflen < 1)
+ 			goto Elong;
++	}
++	deleted = (!IS_ROOT(dentry) && d_unhashed(dentry));
+ 
+-	if (buflen < 1)
+-		goto Elong;
+ 	/* Get '/' right */
+ 	retval = end-1;
+-	*retval = '/';
++	if (buffer)
++		*retval = '/';
+ 
+ 	for (;;) {
+ 		struct dentry * parent;
+@@ -1815,20 +1901,43 @@ char *__d_path(const struct path *path, struct path *root,
+ 		}
+ 		parent = dentry->d_parent;
+ 		prefetch(parent);
+-		if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+-		    (prepend(&end, &buflen, "/", 1) != 0))
++		if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
++		    (prepend(&end, &buflen, "/", 1) != 0)))
+ 			goto Elong;
+ 		retval = end;
+ 		dentry = parent;
+ 	}
+ 
+ out:
++	if (deleted && buffer &&
++			prepend(&end, &buflen, " (deleted)", 10) != 0)
++		goto Elong;
++out_err:
+ 	spin_unlock(&vfsmount_lock);
+-	return retval;
++	return buffer ? retval : NULL;
+ 
+ global_root:
++	/*
++	 * We traversed the tree upward and reached a root, but the given
++	 * lookup terminal point wasn't encountered.  It means either that the
++	 * dentry is out of our scope or belongs to an abstract space like
++	 * sock_mnt or pipe_mnt.  Check for it.
++	 *
++	 * There are different options to check it.
++	 * We may assume that any dentry tree is unreachable unless it's
++	 * connected to `root' (defined as fs root of init aka child reaper)
++	 * and expose all paths that are not connected to it.
++	 * The other option is to allow exposing of known abstract spaces
++	 * explicitly and hide the path information for other cases.
++	 * This approach is safer, so let's take it.  2001/04/22  SAW
++	 */
++	if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER)) {
++		retval = ERR_PTR(-EINVAL);
++		goto out_err;
++	}
++
+ 	retval += 1;	/* hit the slash */
+-	if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
++	if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0)
+ 		goto Elong;
+ 	root->mnt = vfsmnt;
+ 	root->dentry = dentry;
+@@ -1836,8 +1945,9 @@ global_root:
+ 
+ Elong:
+ 	retval = ERR_PTR(-ENAMETOOLONG);
+-	goto out;
++	goto out_err;
+ }
++EXPORT_SYMBOL(__d_path);
+ 
+ /**
+  * d_path - return the path of a dentry
+@@ -1864,8 +1974,11 @@ char *d_path(const struct path *path, char *buf, int buflen)
+ 	 * thus don't need to be hashed.  They also don't need a name until a
+ 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
+ 	 * below allows us to generate a name for these objects on demand:
++	 *
++	 * pipefs and socketfs methods assume a valid buffer; d_root_check()
++	 * supplies a NULL one for access checks.
+ 	 */
+-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
++	if (buf && path->dentry->d_op && path->dentry->d_op->d_dname)
+ 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+ 
+ 	read_lock(&current->fs->lock);
+@@ -1880,6 +1993,231 @@ char *d_path(const struct path *path, char *buf, int buflen)
+ 	return res;
+ }
+ 
++#ifdef CONFIG_VE
++#include <net/sock.h>
++#include <linux/ip.h>
++#include <linux/file.h>
++#include <linux/mnt_namespace.h>
++#include <linux/vzratelimit.h>
++
++static void mark_sub_tree_virtual(struct dentry *d)
++{
++	struct dentry *orig_root;
++
++	orig_root = d;
++	while (1) {
++		spin_lock(&d->d_lock);
++		d->d_flags |= DCACHE_VIRTUAL;
++		spin_unlock(&d->d_lock);
++
++		if (!list_empty(&d->d_subdirs)) {
++			d = list_entry(d->d_subdirs.next,
++					struct dentry, d_u.d_child);
++			continue;
++		}
++		if (d == orig_root)
++			break;
++		while (d == list_entry(d->d_parent->d_subdirs.prev,
++					struct dentry, d_u.d_child)) {
++			d = d->d_parent;
++			if (d == orig_root)
++				goto out;
++		}
++		d = list_entry(d->d_u.d_child.next,
++				struct dentry, d_u.d_child);
++	}
++out:
++	return;
++}
++
++void mark_tree_virtual(struct path *path)
++{
++	struct vfsmount *orig_rootmnt;
++	struct vfsmount *m = path->mnt;
++	struct dentry *d = path->dentry;
++
++	spin_lock(&dcache_lock);
++	spin_lock(&vfsmount_lock);
++	orig_rootmnt = m;
++	while (1) {
++		mark_sub_tree_virtual(d);
++		if (!list_empty(&m->mnt_mounts)) {
++			m = list_entry(m->mnt_mounts.next,
++					struct vfsmount, mnt_child);
++			d = m->mnt_root;
++			continue;
++		}
++		if (m == orig_rootmnt)
++			break;
++		while (m == list_entry(m->mnt_parent->mnt_mounts.prev,
++					struct vfsmount, mnt_child)) {
++			m = m->mnt_parent;
++			if (m == orig_rootmnt)
++				goto out;
++		}
++		m = list_entry(m->mnt_child.next,
++				struct vfsmount, mnt_child);
++		d = m->mnt_root;
++	}
++out:
++	spin_unlock(&vfsmount_lock);
++	spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(mark_tree_virtual);
++
++static struct vz_rate_info area_ri = { 20, 10*HZ };
++#define VE_AREA_ACC_CHECK	0x0001
++#define VE_AREA_ACC_DENY	0x0002
++#define VE_AREA_EXEC_CHECK	0x0010
++#define VE_AREA_EXEC_DENY	0x0020
++#define VE0_AREA_ACC_CHECK	0x0100
++#define VE0_AREA_ACC_DENY	0x0200
++#define VE0_AREA_EXEC_CHECK	0x1000
++#define VE0_AREA_EXEC_DENY	0x2000
++int ve_area_access_check = 0;
++
++static void print_connection_info(struct task_struct *tsk)
++{
++	struct files_struct *files;
++	struct fdtable *fdt;
++	int fd;
++
++	files = get_files_struct(tsk);
++	if (!files)
++		return;
++
++	spin_lock(&files->file_lock);
++	fdt = files_fdtable(files);
++	for (fd = 0; fd < fdt->max_fds; fd++) {
++		struct file *file;
++		struct inode *inode;
++		struct socket *socket;
++		struct sock *sk;
++		struct inet_sock *inet;
++
++		file = fdt->fd[fd];
++		if (file == NULL)
++			continue;
++
++		inode = file->f_dentry->d_inode;
++		if (!S_ISSOCK(inode->i_mode))
++			continue;
++
++		socket = SOCKET_I(inode);
++		if (socket == NULL)
++			continue;
++
++		sk = socket->sk;
++		if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
++		    || sk->sk_type != SOCK_STREAM)
++			continue;
++
++		inet = inet_sk(sk);
++		printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n",
++				NIPQUAD(inet->daddr), ntohs(inet->dport),
++				inet->num);
++	}
++	spin_unlock(&files->file_lock);
++	put_files_struct(files);
++}
++
++static void check_alert(struct path *path, char *str)
++{
++	struct task_struct *tsk;
++	unsigned long page;
++	struct super_block *sb;
++	char *p;
++
++	if (!vz_ratelimit(&area_ri))
++		return;
++
++	tsk = current;
++	p = ERR_PTR(-ENOMEM);
++	page = __get_free_page(GFP_KERNEL);
++	if (page) {
++		spin_lock(&dcache_lock);
++		p = __d_path(path, &tsk->fs->root, (char *)page, PAGE_SIZE);
++		spin_unlock(&dcache_lock);
++	}
++	if (IS_ERR(p))
++		p = "(undefined)";
++
++	sb = path->dentry->d_sb;
++	printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n"
++			"Task %d/%d[%s] from VE%d, execenv %d\n",
++			str, p,	sb->s_type->owner_env->veid,
++			sb->s_type->name, sb->s_dev,
++			tsk->pid, task_pid_vnr(tsk), tsk->comm,
++			VE_TASK_INFO(tsk)->owner_env->veid,
++			get_exec_env()->veid);
++
++	free_page(page);
++
++	print_connection_info(tsk);
++
++	read_lock(&tasklist_lock);
++	tsk = tsk->parent;
++	get_task_struct(tsk);
++	read_unlock(&tasklist_lock);
++
++	printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n",
++			tsk->pid, task_pid_vnr(tsk), tsk->comm,
++			VE_TASK_INFO(tsk)->owner_env->veid);
++
++	print_connection_info(tsk);
++	put_task_struct(tsk);
++	dump_stack();
++}
++#endif
++
++int check_area_access_ve(struct path *path)
++{
++#ifdef CONFIG_VE
++	int check, alert, deny;
++
++	if (ve_is_super(get_exec_env())) {
++		check = ve_area_access_check & VE0_AREA_ACC_CHECK;
++		alert = path->dentry->d_flags & DCACHE_VIRTUAL;
++		deny = ve_area_access_check & VE0_AREA_ACC_DENY;
++	} else {
++		check = ve_area_access_check & VE_AREA_ACC_CHECK;
++		alert = !(path->dentry->d_flags & DCACHE_VIRTUAL);
++		deny = ve_area_access_check & VE_AREA_ACC_DENY;
++	}
++
++	if (check && alert)
++		check_alert(path, "Access");
++	if (deny && alert)
++		return -EACCES;
++#endif
++	return 0;
++}
++
++#if 0
++int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt)
++{
++#ifdef CONFIG_VE
++	int check, alert, deny;
++
++	if (ve_is_super(get_exec_env())) {
++		check = ve_area_access_check & VE0_AREA_EXEC_CHECK;
++		alert = dentry->d_flags & DCACHE_VIRTUAL;
++		deny = ve_area_access_check & VE0_AREA_EXEC_DENY;
++	} else {
++		check = ve_area_access_check & VE_AREA_EXEC_CHECK;
++		alert = !(dentry->d_flags & DCACHE_VIRTUAL);
++		deny = ve_area_access_check & VE_AREA_EXEC_DENY;
++	}
++
++	if (check && alert)
++		check_alert(mnt, dentry, "Exec");
++	if (deny && alert)
++		return -EACCES;
++#endif
++	return 0;
++}
++#endif
++
+ /*
+  * Helper function for dentry_operations.d_dname() members
+  */
+@@ -2072,10 +2410,12 @@ resume:
+ 			goto repeat;
+ 		}
+ 		atomic_dec(&dentry->d_count);
++		ub_dentry_uncharge_locked(dentry);
+ 	}
+ 	if (this_parent != root) {
+ 		next = this_parent->d_u.d_child.next;
+ 		atomic_dec(&this_parent->d_count);
++		ub_dentry_uncharge_locked(this_parent);
+ 		this_parent = this_parent->d_parent;
+ 		goto resume;
+ 	}
+diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
+index 285b64a..d89511b 100644
+--- a/fs/devpts/inode.c
++++ b/fs/devpts/inode.c
+@@ -23,6 +23,7 @@
+ #include <linux/parser.h>
+ #include <linux/fsnotify.h>
+ #include <linux/seq_file.h>
++#include <linux/ve.h>
+ 
+ #define DEVPTS_SUPER_MAGIC 0x1cd1
+ 
+@@ -30,18 +31,26 @@
+ 
+ extern int pty_limit;			/* Config limit on Unix98 ptys */
+ static DEFINE_IDR(allocated_ptys);
++#ifdef CONFIG_VE
++#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys))
++#define ve_allocated_ptys	__ve_allocated_ptys(get_exec_env())
++#else
++#define __ve_allocated_ptys(ve) allocated_ptys
++#define ve_allocated_ptys	allocated_ptys
++#endif
+ static DEFINE_MUTEX(allocated_ptys_lock);
+ 
++struct devpts_config devpts_config = {.mode = 0600};
++
++#ifndef CONFIG_VE
+ static struct vfsmount *devpts_mnt;
+ static struct dentry *devpts_root;
+-
+-static struct {
+-	int setuid;
+-	int setgid;
+-	uid_t   uid;
+-	gid_t   gid;
+-	umode_t mode;
+-} config = {.mode = DEVPTS_DEFAULT_MODE};
++#define config	devpts_config
++#else
++#define devpts_mnt	(get_exec_env()->devpts_mnt)
++#define devpts_root	(get_exec_env()->devpts_root)
++#define config		(*(get_exec_env()->devpts_config))
++#endif
+ 
+ enum {
+ 	Opt_uid, Opt_gid, Opt_mode,
+@@ -93,7 +102,8 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
+ 			config.mode = option & S_IALLUGO;
+ 			break;
+ 		default:
+-			printk(KERN_ERR "devpts: called with bogus options\n");
++			ve_printk(VE_LOG, KERN_ERR
++					"devpts: called with bogus options\n");
+ 			return -EINVAL;
+ 		}
+ 	}
+@@ -157,13 +167,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
+ 	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+ }
+ 
+-static struct file_system_type devpts_fs_type = {
++struct file_system_type devpts_fs_type = {
+ 	.owner		= THIS_MODULE,
+ 	.name		= "devpts",
+ 	.get_sb		= devpts_get_sb,
+ 	.kill_sb	= kill_anon_super,
+ };
+ 
++EXPORT_SYMBOL(devpts_fs_type);
++
+ /*
+  * The normal naming convention is simply /dev/pts/<number>; this conforms
+  * to the System V naming convention
+@@ -183,12 +195,12 @@ int devpts_new_index(void)
+ 	int idr_ret;
+ 
+ retry:
+-	if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) {
++	if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) {
+ 		return -ENOMEM;
+ 	}
+ 
+ 	mutex_lock(&allocated_ptys_lock);
+-	idr_ret = idr_get_new(&allocated_ptys, NULL, &index);
++	idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index);
+ 	if (idr_ret < 0) {
+ 		mutex_unlock(&allocated_ptys_lock);
+ 		if (idr_ret == -EAGAIN)
+@@ -197,7 +209,7 @@ retry:
+ 	}
+ 
+ 	if (index >= pty_limit) {
+-		idr_remove(&allocated_ptys, index);
++		idr_remove(&ve_allocated_ptys, index);
+ 		mutex_unlock(&allocated_ptys_lock);
+ 		return -EIO;
+ 	}
+@@ -208,7 +220,7 @@ retry:
+ void devpts_kill_index(int idx)
+ {
+ 	mutex_lock(&allocated_ptys_lock);
+-	idr_remove(&allocated_ptys, idx);
++	idr_remove(&ve_allocated_ptys, idx);
+ 	mutex_unlock(&allocated_ptys_lock);
+ }
+ 
+@@ -278,6 +290,17 @@ void devpts_pty_kill(int number)
+ 	mutex_unlock(&devpts_root->d_inode->i_mutex);
+ }
+ 
++void prepare_tty(void)
++{
++#ifdef CONFIG_VE
++	get_ve0()->allocated_ptys = &allocated_ptys;
++	/*
++	 * in this case, tty_register_driver() sets up
++	 * owner_env correctly right from bootup
++	 */
++#endif
++}
++
+ static int __init init_devpts_fs(void)
+ {
+ 	int err = register_filesystem(&devpts_fs_type);
+@@ -286,11 +309,13 @@ static int __init init_devpts_fs(void)
+ 		if (IS_ERR(devpts_mnt))
+ 			err = PTR_ERR(devpts_mnt);
+ 	}
++	prepare_tty();
+ 	return err;
+ }
+ 
+ static void __exit exit_devpts_fs(void)
+ {
++	/* the code is never called, the argument is irrelevant */
+ 	unregister_filesystem(&devpts_fs_type);
+ 	mntput(devpts_mnt);
+ }
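The devpts change above replaces the single global allocated_ptys IDR with a
per-VE one, so every container gets an independent pty index space and index 0
can be live in several containers at once. A toy model with a bitmap standing in
for the IDR:

#include <stdio.h>

#define PTY_LIMIT 32	/* stand-in for the pty_limit sysctl */

/* one index space per container, as __ve_allocated_ptys() provides */
struct toy_ve {
	unsigned int ptys;	/* one bit per allocated index */
};

static int toy_new_index(struct toy_ve *ve)
{
	int i;

	for (i = 0; i < PTY_LIMIT; i++) {
		if (!(ve->ptys & (1u << i))) {
			ve->ptys |= 1u << i;
			return i;
		}
	}
	return -5;	/* -EIO, like the over-limit branch above */
}

static void toy_kill_index(struct toy_ve *ve, int idx)
{
	ve->ptys &= ~(1u << idx);
}

int main(void)
{
	struct toy_ve ve0 = { 0 }, ve101 = { 0 };

	/* both containers can hold index 0 simultaneously */
	printf("ve0 got %d, ve101 got %d\n",
	       toy_new_index(&ve0), toy_new_index(&ve101));
	toy_kill_index(&ve0, 0);
	toy_kill_index(&ve101, 0);
	return 0;
}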
+diff --git a/fs/direct-io.c b/fs/direct-io.c
+index 9e81add..f30ffef 100644
+--- a/fs/direct-io.c
++++ b/fs/direct-io.c
+@@ -666,7 +666,7 @@ submit_page_section(struct dio *dio, struct page *page,
+ 		/*
+ 		 * Read accounting is performed in submit_bio()
+ 		 */
+-		task_io_account_write(len);
++		task_io_account_write(page, len, 1);
+ 	}
+ 
+ 	/*
+diff --git a/fs/dquot.c b/fs/dquot.c
+index 5ac77da..6d488bd 100644
+--- a/fs/dquot.c
++++ b/fs/dquot.c
+@@ -162,7 +162,9 @@ static struct quota_format_type *find_quota_format(int id)
+ 	struct quota_format_type *actqf;
+ 
+ 	spin_lock(&dq_list_lock);
+-	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next);
++	for (actqf = quota_formats;
++		 actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL);
++						 actqf = actqf->qf_next);
+ 	if (!actqf || !try_module_get(actqf->qf_owner)) {
+ 		int qm;
+ 
+diff --git a/fs/eventpoll.c b/fs/eventpoll.c
+index 990c01d..4e99696 100644
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -31,6 +31,7 @@
+ #include <linux/eventpoll.h>
+ #include <linux/mount.h>
+ #include <linux/bitops.h>
++#include <linux/module.h>
+ #include <linux/mutex.h>
+ #include <linux/anon_inodes.h>
+ #include <asm/uaccess.h>
+@@ -102,11 +103,6 @@
+ 
+ #define EP_UNACTIVE_PTR ((void *) -1L)
+ 
+-struct epoll_filefd {
+-	struct file *file;
+-	int fd;
+-};
+-
+ /*
+  * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
+  * It is used to keep track on all tasks that are currently inside the wake_up() code
+@@ -129,79 +125,6 @@ struct poll_safewake {
+ 	spinlock_t lock;
+ };
+ 
+-/*
+- * Each file descriptor added to the eventpoll interface will
+- * have an entry of this type linked to the "rbr" RB tree.
+- */
+-struct epitem {
+-	/* RB tree node used to link this structure to the eventpoll RB tree */
+-	struct rb_node rbn;
+-
+-	/* List header used to link this structure to the eventpoll ready list */
+-	struct list_head rdllink;
+-
+-	/*
+-	 * Works together "struct eventpoll"->ovflist in keeping the
+-	 * single linked chain of items.
+-	 */
+-	struct epitem *next;
+-
+-	/* The file descriptor information this item refers to */
+-	struct epoll_filefd ffd;
+-
+-	/* Number of active wait queue attached to poll operations */
+-	int nwait;
+-
+-	/* List containing poll wait queues */
+-	struct list_head pwqlist;
+-
+-	/* The "container" of this item */
+-	struct eventpoll *ep;
+-
+-	/* List header used to link this item to the "struct file" items list */
+-	struct list_head fllink;
+-
+-	/* The structure that describe the interested events and the source fd */
+-	struct epoll_event event;
+-};
+-
+-/*
+- * This structure is stored inside the "private_data" member of the file
+- * structure and rapresent the main data sructure for the eventpoll
+- * interface.
+- */
+-struct eventpoll {
+-	/* Protect the this structure access */
+-	spinlock_t lock;
+-
+-	/*
+-	 * This mutex is used to ensure that files are not removed
+-	 * while epoll is using them. This is held during the event
+-	 * collection loop, the file cleanup path, the epoll file exit
+-	 * code and the ctl operations.
+-	 */
+-	struct mutex mtx;
+-
+-	/* Wait queue used by sys_epoll_wait() */
+-	wait_queue_head_t wq;
+-
+-	/* Wait queue used by file->poll() */
+-	wait_queue_head_t poll_wait;
+-
+-	/* List of ready file descriptors */
+-	struct list_head rdllist;
+-
+-	/* RB tree root used to store monitored fd structs */
+-	struct rb_root rbr;
+-
+-	/*
+-	 * This is a single linked list that chains all the "struct epitem" that
+-	 * happened while transfering ready events to userspace w/out
+-	 * holding ->lock.
+-	 */
+-	struct epitem *ovflist;
+-};
+-
+ /* Wait structure used by the poll hooks */
+ struct eppoll_entry {
+ 	/* List header used to link this structure to the "struct epitem" */
+@@ -229,7 +152,8 @@ struct ep_pqueue {
+ /*
+  * This mutex is used to serialize ep_free() and eventpoll_release_file().
+  */
+-static struct mutex epmutex;
++struct mutex epmutex;
++EXPORT_SYMBOL_GPL(epmutex);
+ 
+ /* Safe wake up implementation */
+ static struct poll_safewake psw;
+@@ -482,10 +406,11 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
+ }
+ 
+ /* File callbacks that implement the eventpoll file behaviour */
+-static const struct file_operations eventpoll_fops = {
++const struct file_operations eventpoll_fops = {
+ 	.release	= ep_eventpoll_release,
+ 	.poll		= ep_eventpoll_poll
+ };
++EXPORT_SYMBOL(eventpoll_fops);
+ 
+ /* Fast test to see if the file is an evenpoll file */
+ static inline int is_file_epoll(struct file *f)
+@@ -557,7 +482,7 @@ static int ep_alloc(struct eventpoll **pep)
+  * are protected by the "mtx" mutex, and ep_find() must be called with
+  * "mtx" held.
+  */
+-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+ {
+ 	int kcmp;
+ 	struct rb_node *rbp;
+@@ -583,6 +508,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+ 
+ 	return epir;
+ }
++EXPORT_SYMBOL_GPL(ep_find);
+ 
+ /*
+  * This is the callback that is passed to the wait queue wakeup
+@@ -695,7 +621,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
+ /*
+  * Must be called with "mtx" held.
+  */
+-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+ 		     struct file *tfile, int fd)
+ {
+ 	int error, revents, pwake = 0;
+@@ -792,6 +718,7 @@ error_unregister:
+ error_return:
+ 	return error;
+ }
++EXPORT_SYMBOL(ep_insert);
+ 
+ /*
+  * Modify the interest event mask by dropping an event if the new mask
+@@ -1078,6 +1005,7 @@ error_return:
+ 
+ 	return fd;
+ }
++EXPORT_SYMBOL(sys_epoll_create);
+ 
+ /*
+  * The following function implements the controller interface for
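+
+The eventpoll changes above un-static the core helpers and, by deleting
+the local structure definitions, imply that struct epitem and struct
+eventpoll move into a shared header so external (checkpoint/restore)
+modules can rebuild epoll state. A minimal consumer sketch, assuming the
+structures are visible via <linux/eventpoll.h>; restore_epoll_watch() is
+hypothetical:
+
+	/* Re-add one target fd to an epoll file while restoring a VE. */
+	static int restore_epoll_watch(struct file *epfile, struct file *tfile,
+				       int fd, struct epoll_event *ev)
+	{
+		struct eventpoll *ep = epfile->private_data;
+		int err = 0;
+
+		mutex_lock(&ep->mtx);	/* ep_find/ep_insert need "mtx" held */
+		if (ep_find(ep, tfile, fd) == NULL)
+			err = ep_insert(ep, ev, tfile, fd);
+		mutex_unlock(&ep->mtx);
+		return err;
+	}
+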
+diff --git a/fs/exec.c b/fs/exec.c
+index fd92343..2f1a48a 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -26,6 +26,7 @@
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/mman.h>
++#include <linux/virtinfo.h>
+ #include <linux/stat.h>
+ #include <linux/fcntl.h>
+ #include <linux/smp_lock.h>
+@@ -56,6 +57,8 @@
+ #include <asm/mmu_context.h>
+ #include <asm/tlb.h>
+ 
++#include <bc/vmpages.h>
++
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+@@ -71,6 +74,8 @@ int suid_dumpable = 0;
+ 
+ /* The maximal length of core_pattern is also specified in sysctl.c */
+ 
++int sysctl_at_vsyscall;
++
+ static LIST_HEAD(formats);
+ static DEFINE_RWLOCK(binfmt_lock);
+ 
+@@ -230,9 +235,13 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
+ 	struct vm_area_struct *vma = NULL;
+ 	struct mm_struct *mm = bprm->mm;
+ 
+-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
++	if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags,
++				NULL, UB_SOFT))
++		goto fail_charge;
++
++	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL_UBC);
+ 	if (!vma)
+-		goto err;
++		goto fail_alloc;
+ 
+ 	down_write(&mm->mmap_sem);
+ 	vma->vm_mm = mm;
+@@ -266,7 +275,9 @@ err:
+ 		bprm->vma = NULL;
+ 		kmem_cache_free(vm_area_cachep, vma);
+ 	}
+-
++fail_alloc:
++	ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL);
++fail_charge:
+ 	return err;
+ }
+ 
+@@ -709,10 +720,11 @@ int kernel_read(struct file *file, unsigned long offset,
+ 
+ EXPORT_SYMBOL(kernel_read);
+ 
+-static int exec_mmap(struct mm_struct *mm)
++static int exec_mmap(struct linux_binprm *bprm)
+ {
+ 	struct task_struct *tsk;
+-	struct mm_struct * old_mm, *active_mm;
++	struct mm_struct *old_mm, *active_mm, *mm;
++	int ret;
+ 
+ 	/* Notify parent that we're no longer interested in the old VM */
+ 	tsk = current;
+@@ -734,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm)
+ 			return -EINTR;
+ 		}
+ 	}
++
++	ret = 0;
++	mm = bprm->mm;
++	mm->vps_dumpable = 1;
+ 	task_lock(tsk);
+ 	active_mm = tsk->active_mm;
+ 	tsk->mm = mm;
+@@ -742,14 +758,24 @@ static int exec_mmap(struct mm_struct *mm)
+ 	task_unlock(tsk);
+ 	mm_update_next_owner(old_mm);
+ 	arch_pick_mmap_layout(mm);
++	bprm->mm = NULL;		/* We're using it now */
++
++#ifdef CONFIG_VZ_GENCALLS
++	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP,
++				bprm) & NOTIFY_FAIL) {
++		/* similar to binfmt_elf */
++		send_sig(SIGKILL, current, 0);
++		ret = -ENOMEM;
++	}
++#endif
+ 	if (old_mm) {
+ 		up_read(&old_mm->mmap_sem);
+ 		BUG_ON(active_mm != old_mm);
+ 		mmput(old_mm);
+-		return 0;
++		return ret;
+ 	}
+ 	mmdrop(active_mm);
+-	return 0;
++	return ret;
+ }
+ 
+ /*
+@@ -847,6 +873,10 @@ static int de_thread(struct task_struct *tsk)
+ 		transfer_pid(leader, tsk, PIDTYPE_PGID);
+ 		transfer_pid(leader, tsk, PIDTYPE_SID);
+ 		list_replace_rcu(&leader->tasks, &tsk->tasks);
++#ifdef CONFIG_VE
++		list_replace_rcu(&leader->ve_task_info.vetask_list,
++				&tsk->ve_task_info.vetask_list);
++#endif
+ 
+ 		tsk->group_leader = tsk;
+ 		leader->group_leader = tsk;
+@@ -964,12 +994,10 @@ int flush_old_exec(struct linux_binprm * bprm)
+ 	/*
+ 	 * Release all of the old mmap stuff
+ 	 */
+-	retval = exec_mmap(bprm->mm);
++	retval = exec_mmap(bprm);
+ 	if (retval)
+ 		goto out;
+ 
+-	bprm->mm = NULL;		/* We're using it now */
+-
+ 	/* This is the point of no return */
+ 	current->sas_ss_sp = current->sas_ss_size = 0;
+ 
+@@ -1275,6 +1303,10 @@ int do_execve(char * filename,
+ 	struct files_struct *displaced;
+ 	int retval;
+ 
++	retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL);
++	if (retval)
++		return retval;
++
+ 	retval = unshare_files(&displaced);
+ 	if (retval)
+ 		goto out_ret;
+@@ -1543,7 +1575,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+ 		goto done;
+ 
+ 	rcu_read_lock();
+-	for_each_process(g) {
++	for_each_process_ve(g) {
+ 		if (g == tsk->group_leader)
+ 			continue;
+ 
+@@ -1677,7 +1709,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+ 	/*
+ 	 * If another thread got here first, or we are not dumpable, bail out.
+ 	 */
+-	if (mm->core_waiters || !get_dumpable(mm)) {
++	if (mm->core_waiters || !get_dumpable(mm) || mm->vps_dumpable != 1) {
+ 		up_write(&mm->mmap_sem);
+ 		goto fail;
+ 	}
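+
+The __bprm_mm_init() hunk above is the canonical user-beancounter
+accounting pattern that recurs throughout the patch: charge first,
+allocate with a *_UBC gfp flag, and roll the charge back if the
+allocation fails. Schematically (obj, cachep and flags stand in for the
+caller's values):
+
+	if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
+		return -ENOMEM;		/* refuse before allocating */
+	obj = kmem_cache_zalloc(cachep, GFP_KERNEL_UBC);
+	if (!obj) {
+		ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
+		return -ENOMEM;		/* undo the charge on failure */
+	}
+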
+diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
+index 80c97fd..c03ef38 100644
+--- a/fs/ext2/namei.c
++++ b/fs/ext2/namei.c
+@@ -31,6 +31,7 @@
+  */
+ 
+ #include <linux/pagemap.h>
++#include <linux/quotaops.h>
+ #include "ext2.h"
+ #include "xattr.h"
+ #include "acl.h"
+@@ -257,6 +258,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
+ 	struct page * page;
+ 	int err = -ENOENT;
+ 
++	DQUOT_INIT(inode);
++
+ 	de = ext2_find_entry (dir, dentry, &page);
+ 	if (!de)
+ 		goto out;
+@@ -299,6 +302,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
+ 	struct ext2_dir_entry_2 * old_de;
+ 	int err = -ENOENT;
+ 
++	if (new_inode)
++		DQUOT_INIT(new_inode);
++
+ 	old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
+ 	if (!old_de)
+ 		goto out;
+diff --git a/fs/ext2/super.c b/fs/ext2/super.c
+index ef50cbc..dc11cf2 100644
+--- a/fs/ext2/super.c
++++ b/fs/ext2/super.c
+@@ -1400,7 +1400,7 @@ static struct file_system_type ext2_fs_type = {
+ 	.name		= "ext2",
+ 	.get_sb		= ext2_get_sb,
+ 	.kill_sb	= kill_block_super,
+-	.fs_flags	= FS_REQUIRES_DEV,
++	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
+ };
+ 
+ static int __init init_ext2_fs(void)
+diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
+index 0d0c701..d4d3c11 100644
+--- a/fs/ext3/ioctl.c
++++ b/fs/ext3/ioctl.c
+@@ -87,7 +87,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+ 		 * the relevant capability.
+ 		 */
+ 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+-			if (!capable(CAP_SYS_RESOURCE)) {
++			if (!capable(CAP_SYS_ADMIN)) {
+ 				mutex_unlock(&inode->i_mutex);
+ 				err = -EPERM;
+ 				goto flags_out;
+diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
+index 0b8cf80..c39f682 100644
+--- a/fs/ext3/namei.c
++++ b/fs/ext3/namei.c
+@@ -1345,7 +1345,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+ 	if (err)
+ 		ext3_std_error(dir->i_sb, err);
+ 	brelse(bh);
+-	return 0;
++	return err;
+ }
+ 
+ /*
+diff --git a/fs/ext3/super.c b/fs/ext3/super.c
+index 2845425..1682ad4 100644
+--- a/fs/ext3/super.c
++++ b/fs/ext3/super.c
+@@ -2903,7 +2903,7 @@ static struct file_system_type ext3_fs_type = {
+ 	.name		= "ext3",
+ 	.get_sb		= ext3_get_sb,
+ 	.kill_sb	= kill_block_super,
+-	.fs_flags	= FS_REQUIRES_DEV,
++	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
+ };
+ 
+ static int __init init_ext3_fs(void)
+diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
+index 7a6c2f1..ec237c2 100644
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -79,7 +79,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ 		 * the relevant capability.
+ 		 */
+ 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+-			if (!capable(CAP_SYS_RESOURCE))
++			if (!capable(CAP_SYS_ADMIN))
+ 				goto flags_out;
+ 		}
+ 
+diff --git a/fs/fcntl.c b/fs/fcntl.c
+index bfd7765..24017e2 100644
+--- a/fs/fcntl.c
++++ b/fs/fcntl.c
+@@ -181,6 +181,7 @@ out_fput:
+ 	fput(file);
+ 	goto out;
+ }
++EXPORT_SYMBOL_GPL(sys_dup2);
+ 
+ asmlinkage long sys_dup(unsigned int fildes)
+ {
+@@ -199,6 +200,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
+ 	struct inode * inode = filp->f_path.dentry->d_inode;
+ 	int error = 0;
+ 
++	if (!capable(CAP_SYS_RAWIO) && !odirect_enable)
++		arg &= ~O_DIRECT;
++
+ 	/*
+ 	 * O_APPEND cannot be cleared if the file is marked as append-only
+ 	 * and the file is open for write.
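+
+With the setfl() change above, a task lacking CAP_SYS_RAWIO gets O_DIRECT
+silently masked out of F_SETFL unless the patch-introduced odirect_enable
+knob is set. A userspace probe for that behaviour might look like this
+(a sketch; note the fcntl call itself still succeeds):
+
+	int fl = fcntl(fd, F_GETFL);
+	if (fcntl(fd, F_SETFL, fl | O_DIRECT) == 0 &&
+	    !(fcntl(fd, F_GETFL) & O_DIRECT))
+		fprintf(stderr, "O_DIRECT masked by container policy\n");
+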
+diff --git a/fs/file.c b/fs/file.c
+index 7b3887e..aad77ec 100644
+--- a/fs/file.c
++++ b/fs/file.c
+@@ -8,6 +8,7 @@
+ 
+ #include <linux/fs.h>
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/time.h>
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+@@ -19,6 +20,8 @@
+ #include <linux/rcupdate.h>
+ #include <linux/workqueue.h>
+ 
++#include <bc/kmem.h>
++
+ struct fdtable_defer {
+ 	spinlock_t lock;
+ 	struct work_struct wq;
+@@ -40,9 +43,9 @@ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
+ static inline void * alloc_fdmem(unsigned int size)
+ {
+ 	if (size <= PAGE_SIZE)
+-		return kmalloc(size, GFP_KERNEL);
++		return kmalloc(size, GFP_KERNEL_UBC);
+ 	else
+-		return vmalloc(size);
++		return ub_vmalloc(size);
+ }
+ 
+ static inline void free_fdarr(struct fdtable *fdt)
+@@ -161,7 +164,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
+ 	if (unlikely(nr > sysctl_nr_open))
+ 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+ 
+-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
++	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC);
+ 	if (!fdt)
+ 		goto out;
+ 	fdt->max_fds = nr;
+@@ -196,7 +199,7 @@ out:
+  * Return <0 error code on error; 1 on successful completion.
+  * The files->file_lock should be held on entry, and will be held on exit.
+  */
+-static int expand_fdtable(struct files_struct *files, int nr)
++int expand_fdtable(struct files_struct *files, int nr)
+ 	__releases(files->file_lock)
+ 	__acquires(files->file_lock)
+ {
+@@ -236,6 +239,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
+ 	}
+ 	return 1;
+ }
++EXPORT_SYMBOL_GPL(expand_fdtable);
+ 
+ /*
+  * Expand files.
+diff --git a/fs/file_table.c b/fs/file_table.c
+index 8308422..7cdcec5 100644
+--- a/fs/file_table.c
++++ b/fs/file_table.c
+@@ -21,9 +21,14 @@
+ #include <linux/fsnotify.h>
+ #include <linux/sysctl.h>
+ #include <linux/percpu_counter.h>
++#include <linux/ve.h>
+ 
+ #include <asm/atomic.h>
+ 
++#include <bc/beancounter.h>
++#include <bc/kmem.h>
++#include <bc/misc.h>
++
+ /* sysctl tunables... */
+ struct files_stat_struct files_stat = {
+ 	.max_files = NR_FILE
+@@ -37,13 +42,16 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
+ static inline void file_free_rcu(struct rcu_head *head)
+ {
+ 	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
++	put_ve(f->owner_env);
+ 	kmem_cache_free(filp_cachep, f);
+ }
+ 
+ static inline void file_free(struct file *f)
+ {
+-	percpu_counter_dec(&nr_files);
+ 	file_check_state(f);
++	if (f->f_ub == get_ub0())
++		percpu_counter_dec(&nr_files);
++	ub_file_uncharge(f);
+ 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+ }
+ 
+@@ -97,11 +105,14 @@ struct file *get_empty_filp(void)
+ 	struct task_struct *tsk;
+ 	static int old_max;
+ 	struct file * f;
++	int acct;
+ 
++	acct = (get_exec_ub() == get_ub0());
+ 	/*
+ 	 * Privileged users can go above max_files
+ 	 */
+-	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
++	if (acct && get_nr_files() >= files_stat.max_files &&
++			!capable(CAP_SYS_ADMIN)) {
+ 		/*
+ 		 * percpu_counters are inaccurate.  Do an expensive check before
+ 		 * we go and fail.
+@@ -114,7 +125,13 @@ struct file *get_empty_filp(void)
+ 	if (f == NULL)
+ 		goto fail;
+ 
+-	percpu_counter_inc(&nr_files);
++	if (ub_file_charge(f))
++		goto fail_ch;
++	if (acct)
++		percpu_counter_inc(&nr_files);
++
++	f->owner_env = get_ve(get_exec_env());
++
+ 	if (security_file_alloc(f))
+ 		goto fail_sec;
+ 
+@@ -141,6 +158,10 @@ fail_sec:
+ 	file_free(f);
+ fail:
+ 	return NULL;
++
++fail_ch:
++	kmem_cache_free(filp_cachep, f);
++	return NULL;
+ }
+ 
+ EXPORT_SYMBOL(get_empty_filp);
+diff --git a/fs/filesystems.c b/fs/filesystems.c
+index f37f872..3dca4a7 100644
+--- a/fs/filesystems.c
++++ b/fs/filesystems.c
+@@ -12,6 +12,9 @@
+ #include <linux/kmod.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
++#include <linux/sched.h>	/* for 'current' */
++#include <linux/mount.h>
++#include <linux/ve.h>
+ #include <asm/uaccess.h>
+ 
+ /*
+@@ -21,8 +24,8 @@
+  *	During the unload module must call unregister_filesystem().
+  *	We can access the fields of list element if:
+  *		1) spinlock is held or
+- *		2) we hold the reference to the module.
+- *	The latter can be guaranteed by call of try_module_get(); if it
++ *		2) we hold the reference to the element.
++ *	The latter can be guaranteed by a call to try_get_filesystem(); if it
+  *	returned 0 we must skip the element, otherwise we got the reference.
+  *	Once the reference is obtained we can drop the spinlock.
+  */
+@@ -30,24 +33,46 @@
+ static struct file_system_type *file_systems;
+ static DEFINE_RWLOCK(file_systems_lock);
+ 
++int try_get_filesystem(struct file_system_type *fs)
++{
++	if (try_module_get(fs->owner)) {
++		(void)get_ve(fs->owner_env);
++		return 1;
++	}
++	return 0;
++}
++
+ /* WARNING: This can be used only if we _already_ own a reference */
+ void get_filesystem(struct file_system_type *fs)
+ {
++	(void)get_ve(fs->owner_env);
+ 	__module_get(fs->owner);
+ }
+ 
+ void put_filesystem(struct file_system_type *fs)
+ {
+ 	module_put(fs->owner);
++	put_ve(fs->owner_env);
++}
++
++static inline int check_ve_fstype(struct file_system_type *p,
++		struct ve_struct *env)
++{
++	return ((p->fs_flags & FS_VIRTUALIZED) ||
++			ve_accessible_strict(p->owner_env, env));
+ }
+ 
+-static struct file_system_type **find_filesystem(const char *name, unsigned len)
++static struct file_system_type **find_filesystem(const char *name, unsigned len,
++		struct ve_struct *env)
+ {
+ 	struct file_system_type **p;
+-	for (p=&file_systems; *p; p=&(*p)->next)
++	for (p=&file_systems; *p; p=&(*p)->next) {
++		if (!check_ve_fstype(*p, env))
++			continue;
+ 		if (strlen((*p)->name) == len &&
+ 		    strncmp((*p)->name, name, len) == 0)
+ 			break;
++	}
+ 	return p;
+ }
+ 
+@@ -73,8 +98,12 @@ int register_filesystem(struct file_system_type * fs)
+ 	if (fs->next)
+ 		return -EBUSY;
+ 	INIT_LIST_HEAD(&fs->fs_supers);
++	if (fs->owner_env == NULL)
++		fs->owner_env = get_ve0();
++	if (fs->proto == NULL)
++		fs->proto = fs;
+ 	write_lock(&file_systems_lock);
+-	p = find_filesystem(fs->name, strlen(fs->name));
++	p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env);
+ 	if (*p)
+ 		res = -EBUSY;
+ 	else
+@@ -118,6 +147,75 @@ int unregister_filesystem(struct file_system_type * fs)
+ 
+ EXPORT_SYMBOL(unregister_filesystem);
+ 
++#ifdef CONFIG_VE
++int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template,
++		struct file_system_type **p_fs_type, struct vfsmount **p_mnt)
++{
++	struct vfsmount *mnt;
++	struct file_system_type *local_fs_type;
++	int ret;
++
++	local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *),
++					GFP_KERNEL);
++	if (local_fs_type == NULL)
++		return -ENOMEM;
++
++	local_fs_type->name = template->name;
++	local_fs_type->fs_flags = template->fs_flags;
++	local_fs_type->get_sb = template->get_sb;
++	local_fs_type->kill_sb = template->kill_sb;
++	local_fs_type->owner = template->owner;
++	local_fs_type->owner_env = ve;
++	local_fs_type->proto = template;
++
++	get_filesystem(local_fs_type);	/* get_ve() inside */
++
++	ret = register_filesystem(local_fs_type);
++	if (ret)
++		goto reg_err;
++
++	if (p_mnt == NULL) 
++		goto done; 
++
++	mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL);
++	if (IS_ERR(mnt))
++		goto mnt_err;
++
++	*p_mnt = mnt;
++done:
++	*p_fs_type = local_fs_type;
++	return 0;
++
++mnt_err:
++	ret = PTR_ERR(mnt);
++	unregister_filesystem(local_fs_type); /* does not put */
++
++reg_err:
++	put_filesystem(local_fs_type);
++	kfree(local_fs_type);
++	printk(KERN_DEBUG
++	       "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret);
++	return ret;
++}
++
++EXPORT_SYMBOL(register_ve_fs_type);
++
++void unregister_ve_fs_type(struct file_system_type *local_fs_type,
++		struct vfsmount *local_fs_mount)
++{
++	if (local_fs_mount == NULL && local_fs_type == NULL)
++		return;
++
++	unregister_filesystem(local_fs_type);
++	umount_ve_fs_type(local_fs_type);
++	if (local_fs_mount)
++		kern_umount(local_fs_mount); /* alias to mntput, drop our ref */
++	put_filesystem(local_fs_type);
++}
++
++EXPORT_SYMBOL(unregister_ve_fs_type);
++#endif
++
+ static int fs_index(const char __user * __name)
+ {
+ 	struct file_system_type * tmp;
+@@ -131,11 +229,14 @@ static int fs_index(const char __user * __name)
+ 
+ 	err = -EINVAL;
+ 	read_lock(&file_systems_lock);
+-	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
++	for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) {
++		if (!check_ve_fstype(tmp, get_exec_env()))
++			continue;
+ 		if (strcmp(tmp->name,name) == 0) {
+ 			err = index;
+ 			break;
+ 		}
++		index++;
+ 	}
+ 	read_unlock(&file_systems_lock);
+ 	putname(name);
+@@ -148,9 +249,15 @@ static int fs_name(unsigned int index, char __user * buf)
+ 	int len, res;
+ 
+ 	read_lock(&file_systems_lock);
+-	for (tmp = file_systems; tmp; tmp = tmp->next, index--)
+-		if (index <= 0 && try_module_get(tmp->owner))
+-			break;
++	for (tmp = file_systems; tmp; tmp = tmp->next) {
++		if (!check_ve_fstype(tmp, get_exec_env()))
++			continue;
++		if (!index) {
++			if (try_get_filesystem(tmp))
++				break;
++		} else
++			index--;
++	}
+ 	read_unlock(&file_systems_lock);
+ 	if (!tmp)
+ 		return -EINVAL;
+@@ -168,8 +275,9 @@ static int fs_maxindex(void)
+ 	int index;
+ 
+ 	read_lock(&file_systems_lock);
+-	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
+-		;
++	for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next)
++		if (check_ve_fstype(tmp, get_exec_env()))
++			index++;
+ 	read_unlock(&file_systems_lock);
+ 	return index;
+ }
+@@ -205,9 +313,10 @@ int get_filesystem_list(char * buf)
+ 	read_lock(&file_systems_lock);
+ 	tmp = file_systems;
+ 	while (tmp && len < PAGE_SIZE - 80) {
+-		len += sprintf(buf+len, "%s\t%s\n",
+-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+-			tmp->name);
++		if (check_ve_fstype(tmp, get_exec_env()))
++			len += sprintf(buf+len, "%s\t%s\n",
++				(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
++				tmp->name);
+ 		tmp = tmp->next;
+ 	}
+ 	read_unlock(&file_systems_lock);
+@@ -221,14 +330,14 @@ struct file_system_type *get_fs_type(const char *name)
+ 	unsigned len = dot ? dot - name : strlen(name);
+ 
+ 	read_lock(&file_systems_lock);
+-	fs = *(find_filesystem(name, len));
+-	if (fs && !try_module_get(fs->owner))
++	fs = *(find_filesystem(name, len, get_exec_env()));
++	if (fs && !try_get_filesystem(fs))
+ 		fs = NULL;
+ 	read_unlock(&file_systems_lock);
+ 	if (!fs && (request_module("%.*s", len, name) == 0)) {
+ 		read_lock(&file_systems_lock);
+-		fs = *(find_filesystem(name, len));
+-		if (fs && !try_module_get(fs->owner))
++		fs = *(find_filesystem(name, len, get_exec_env()));
++		if (fs && !try_get_filesystem(fs))
+ 			fs = NULL;
+ 		read_unlock(&file_systems_lock);
+ 	}
+diff --git a/fs/fuse/control.c b/fs/fuse/control.c
+index 4f3cab3..755be17 100644
+--- a/fs/fuse/control.c
++++ b/fs/fuse/control.c
+@@ -10,6 +10,8 @@
+ 
+ #include <linux/init.h>
+ #include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/ve_proto.h>
+ 
+ #define FUSE_CTL_SUPER_MAGIC 0x65735543
+ 
+@@ -17,7 +19,11 @@
+  * This is non-NULL when the single instance of the control filesystem
+  * exists.  Protected by fuse_mutex
+  */
++#ifdef CONFIG_VE
++#define fuse_control_sb	(get_exec_env()->_fuse_control_sb)
++#else
+ static struct super_block *fuse_control_sb;
++#endif
+ 
+ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+ {
+@@ -211,12 +217,51 @@ static struct file_system_type fuse_ctl_fs_type = {
+ 	.kill_sb	= fuse_ctl_kill_sb,
+ };
+ 
++#ifdef CONFIG_VE
++static int fuse_ctl_start(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	if (ve->fuse_ctl_fs_type != NULL)
++		return -EBUSY;
++
++	return register_ve_fs_type(ve, &fuse_ctl_fs_type,
++			&ve->fuse_ctl_fs_type, NULL);
++}
++
++static void fuse_ctl_stop(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	if (ve->fuse_ctl_fs_type == NULL)
++		return;
++
++	unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL);
++	ve->fuse_ctl_fs_type = NULL;
++}
++
++static struct ve_hook fuse_ctl_ve_hook = {
++	.init		= fuse_ctl_start,
++	.fini		= fuse_ctl_stop,
++	.owner		= THIS_MODULE,
++	.priority	= HOOK_PRIO_FS,
++};
++#endif
++
+ int __init fuse_ctl_init(void)
+ {
+-	return register_filesystem(&fuse_ctl_fs_type);
++	int err;
++	
++	err = register_filesystem(&fuse_ctl_fs_type);
++	if (err == 0)
++		ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook);
++	return err;
+ }
+ 
+ void fuse_ctl_cleanup(void)
+ {
++	ve_hook_unregister(&fuse_ctl_ve_hook);
+ 	unregister_filesystem(&fuse_ctl_fs_type);
+ }
+diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
+index bae9486..253017e 100644
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -45,7 +45,11 @@
+ #define FUSE_ALLOW_OTHER         (1 << 1)
+ 
+ /** List of active connections */
++#ifdef CONFIG_VE
++#define fuse_conn_list	(get_exec_env()->_fuse_conn_list)
++#else
+ extern struct list_head fuse_conn_list;
++#endif
+ 
+ /** Global mutex protecting fuse_conn_list and the control filesystem */
+ extern struct mutex fuse_mutex;
+diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
+index 3141690..84975e4 100644
+--- a/fs/fuse/inode.c
++++ b/fs/fuse/inode.c
+@@ -18,13 +18,16 @@
+ #include <linux/statfs.h>
+ #include <linux/random.h>
+ #include <linux/sched.h>
++#include <linux/ve_proto.h>
+ 
+ MODULE_AUTHOR("Miklos Szeredi <miklos at szeredi.hu>");
+ MODULE_DESCRIPTION("Filesystem in Userspace");
+ MODULE_LICENSE("GPL");
+ 
+ static struct kmem_cache *fuse_inode_cachep;
++#ifndef CONFIG_VE
+ struct list_head fuse_conn_list;
++#endif
+ DEFINE_MUTEX(fuse_mutex);
+ 
+ #define FUSE_SUPER_MAGIC 0x65735546
+@@ -858,6 +861,41 @@ static void fuse_sysfs_cleanup(void)
+ 	kobject_put(fuse_kobj);
+ }
+ 
++#ifdef CONFIG_VE
++static int fuse_start(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	if (ve->fuse_fs_type != NULL)
++		return -EBUSY;
++
++	INIT_LIST_HEAD(&ve->_fuse_conn_list);
++	return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL);
++}
++
++static void fuse_stop(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	if (ve->fuse_fs_type == NULL)
++		return;
++
++	unregister_ve_fs_type(ve->fuse_fs_type, NULL);
++	kfree(ve->fuse_fs_type);
++	ve->fuse_fs_type = NULL;
++	BUG_ON(!list_empty(&ve->_fuse_conn_list));
++}
++
++static struct ve_hook fuse_ve_hook = {
++	.init		= fuse_start,
++	.fini		= fuse_stop,
++	.owner		= THIS_MODULE,
++	.priority	= HOOK_PRIO_FS,
++};
++#endif
++
+ static int __init fuse_init(void)
+ {
+ 	int res;
+@@ -882,6 +920,7 @@ static int __init fuse_init(void)
+ 	if (res)
+ 		goto err_sysfs_cleanup;
+ 
++	ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook);
+ 	return 0;
+ 
+  err_sysfs_cleanup:
+@@ -898,6 +937,7 @@ static void __exit fuse_exit(void)
+ {
+ 	printk(KERN_DEBUG "fuse exit\n");
+ 
++	ve_hook_unregister(&fuse_ve_hook);
+ 	fuse_ctl_cleanup();
+ 	fuse_sysfs_cleanup();
+ 	fuse_fs_cleanup();
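+
+The fuse hunks show the virtualization idiom used all over this patch: a
+file-scope global becomes a per-VE field, and a #define transparently
+redirects existing code to the current container's copy. In schematic
+form (some_global and _some_global are placeholder names):
+
+	#ifdef CONFIG_VE
+	#define some_global	(get_exec_env()->_some_global)
+	#else
+	static struct list_head some_global;
+	#endif
+
+Paired with a ve_hook whose ->init registers a per-VE clone of the
+filesystem type and whose ->fini tears it down, this is the complete
+per-container lifecycle for a virtualized filesystem.
+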
+diff --git a/fs/inode.c b/fs/inode.c
+index c36d948..57adf85 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -8,10 +8,13 @@
+ #include <linux/mm.h>
+ #include <linux/dcache.h>
+ #include <linux/init.h>
++#include <linux/kernel_stat.h>
+ #include <linux/quotaops.h>
+ #include <linux/slab.h>
+ #include <linux/writeback.h>
+ #include <linux/module.h>
++#include <linux/nsproxy.h>
++#include <linux/mnt_namespace.h>
+ #include <linux/backing-dev.h>
+ #include <linux/wait.h>
+ #include <linux/hash.h>
+@@ -22,6 +25,7 @@
+ #include <linux/bootmem.h>
+ #include <linux/inotify.h>
+ #include <linux/mount.h>
++#include <linux/vzstat.h>
+ 
+ /*
+  * This is needed for the following functions:
+@@ -97,7 +101,8 @@ static DEFINE_MUTEX(iprune_mutex);
+  */
+ struct inodes_stat_t inodes_stat;
+ 
+-static struct kmem_cache * inode_cachep __read_mostly;
++struct kmem_cache * inode_cachep __read_mostly;
++
+ 
+ static void wake_up_inode(struct inode *inode)
+ {
+@@ -108,11 +113,13 @@ static void wake_up_inode(struct inode *inode)
+ 	wake_up_bit(&inode->i_state, __I_LOCK);
+ }
+ 
++static struct address_space_operations vfs_empty_aops;
++struct inode_operations vfs_empty_iops;
++static struct file_operations vfs_empty_fops;
++EXPORT_SYMBOL(vfs_empty_iops);
++
+ static struct inode *alloc_inode(struct super_block *sb)
+ {
+-	static const struct address_space_operations empty_aops;
+-	static struct inode_operations empty_iops;
+-	static const struct file_operations empty_fops;
+ 	struct inode *inode;
+ 
+ 	if (sb->s_op->alloc_inode)
+@@ -127,8 +134,8 @@ static struct inode *alloc_inode(struct super_block *sb)
+ 		inode->i_blkbits = sb->s_blocksize_bits;
+ 		inode->i_flags = 0;
+ 		atomic_set(&inode->i_count, 1);
+-		inode->i_op = &empty_iops;
+-		inode->i_fop = &empty_fops;
++		inode->i_op = &vfs_empty_iops;
++		inode->i_fop = &vfs_empty_fops;
+ 		inode->i_nlink = 1;
+ 		atomic_set(&inode->i_writecount, 0);
+ 		inode->i_size = 0;
+@@ -152,15 +159,15 @@ static struct inode *alloc_inode(struct super_block *sb)
+ 		}
+ 
+ 		spin_lock_init(&inode->i_lock);
+-		lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
++		lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key);
+ 
+ 		mutex_init(&inode->i_mutex);
+-		lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
++		lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key);
+ 
+ 		init_rwsem(&inode->i_alloc_sem);
+-		lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
++		lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key);
+ 
+-		mapping->a_ops = &empty_aops;
++		mapping->a_ops = &vfs_empty_aops;
+  		mapping->host = inode;
+ 		mapping->flags = 0;
+ 		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+@@ -310,13 +317,76 @@ static void dispose_list(struct list_head *head)
+ 	spin_unlock(&inode_lock);
+ }
+ 
++static void show_header(struct inode *inode)
++{
++	struct super_block *sb = inode->i_sb;
++
++	printk("VFS: Busy inodes after unmount. "
++			"sb = %p, fs type = %s, sb count = %d, "
++			"sb->s_root = %s\n", sb,
++			(sb->s_type != NULL) ? sb->s_type->name : "",
++			sb->s_count,
++			(sb->s_root != NULL) ?
++			(char *)sb->s_root->d_name.name : "");
++}
++
++static void show_inode(struct inode *inode)
++{
++	struct dentry *d;
++	struct vfsmount *mnt;
++	int i;
++
++	printk("inode = %p, inode->i_count = %d, "
++			"inode->i_nlink = %d, "
++			"inode->i_mode = %d, "
++			"inode->i_state = %ld, "
++			"inode->i_flags = %d, "
++			"inode->i_devices.next = %p, "
++			"inode->i_devices.prev = %p, "
++			"inode->i_ino = %ld\n",
++			inode,
++			atomic_read(&inode->i_count),
++			inode->i_nlink,
++			inode->i_mode,
++			inode->i_state,
++			inode->i_flags,
++			inode->i_devices.next,
++			inode->i_devices.prev,
++			inode->i_ino);
++	printk("inode dump: ");
++	for (i = 0; i < sizeof(*inode); i++)
++		printk("%2.2x ", *((u_char *)inode + i));
++	printk("\n");
++	list_for_each_entry(d, &inode->i_dentry, d_alias) {
++		printk("  d_alias %s d_count=%d d_flags=%x\n",
++			d->d_name.name, atomic_read(&d->d_count), d->d_flags);
++		for (i = 0; i < sizeof(*d); i++)
++			printk("%2.2x ", *((u_char *)d + i));
++		printk("\n");
++	}
++
++	spin_lock(&vfsmount_lock);
++	list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) {
++		if (mnt->mnt_sb != inode->i_sb)
++			continue;
++		printk("mnt=%p count=%d flags=%x exp_mask=%x\n",
++				mnt, atomic_read(&mnt->mnt_count),
++				mnt->mnt_flags,
++				mnt->mnt_expiry_mark);
++		for (i = 0; i < sizeof(*mnt); i++)
++			printk("%2.2x ", *((u_char *)mnt + i));
++		printk("\n");
++	}
++	spin_unlock(&vfsmount_lock);
++}
++
+ /*
+  * Invalidate all inodes for a device.
+  */
+-static int invalidate_list(struct list_head *head, struct list_head *dispose)
++static int invalidate_list(struct list_head *head, struct list_head *dispose, int check)
+ {
+ 	struct list_head *next;
+-	int busy = 0, count = 0;
++	int busy = 0, count = 0, once = 1;
+ 
+ 	next = head->next;
+ 	for (;;) {
+@@ -343,6 +413,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
+ 			continue;
+ 		}
+ 		busy = 1;
++
++		if (check) {
++			if (once) {
++				once = 0;
++				show_header(inode);
++			}
++			show_inode(inode);
++		}
+ 	}
+ 	/* only unused inodes may be cached with i_count zero */
+ 	inodes_stat.nr_unused -= count;
+@@ -357,7 +435,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
+  *	fails because there are busy inodes then a non zero value is returned.
+  *	If the discard is successful all the inodes have been discarded.
+  */
+-int invalidate_inodes(struct super_block * sb)
++int invalidate_inodes_check(struct super_block * sb, int check)
+ {
+ 	int busy;
+ 	LIST_HEAD(throw_away);
+@@ -365,7 +443,7 @@ int invalidate_inodes(struct super_block * sb)
+ 	mutex_lock(&iprune_mutex);
+ 	spin_lock(&inode_lock);
+ 	inotify_unmount_inodes(&sb->s_inodes);
+-	busy = invalidate_list(&sb->s_inodes, &throw_away);
++	busy = invalidate_list(&sb->s_inodes, &throw_away, check);
+ 	spin_unlock(&inode_lock);
+ 
+ 	dispose_list(&throw_away);
+@@ -374,7 +452,7 @@ int invalidate_inodes(struct super_block * sb)
+ 	return busy;
+ }
+ 
+-EXPORT_SYMBOL(invalidate_inodes);
++EXPORT_SYMBOL(invalidate_inodes_check);
+ 
+ static int can_unuse(struct inode *inode)
+ {
+@@ -464,6 +542,7 @@ static void prune_icache(int nr_to_scan)
+  */
+ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+ {
++	KSTAT_PERF_ENTER(shrink_icache)
+ 	if (nr) {
+ 		/*
+ 		 * Nasty deadlock avoidance.  We may hold various FS locks,
+@@ -474,6 +553,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+ 			return -1;
+ 		prune_icache(nr);
+ 	}
++	KSTAT_PERF_LEAVE(shrink_icache)
+ 	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ }
+ 
+@@ -583,7 +663,7 @@ void unlock_new_inode(struct inode *inode)
+ 		 */
+ 		mutex_destroy(&inode->i_mutex);
+ 		mutex_init(&inode->i_mutex);
+-		lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key);
++		lockdep_set_class(&inode->i_mutex, &type->proto->i_mutex_dir_key);
+ 	}
+ #endif
+ 	/*
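+
+invalidate_inodes_check() above adds a verbose mode that dumps every busy
+inode, its dentry aliases and the matching vfsmounts when a superblock
+cannot be torn down, which is what you want when a container unmount
+fails. Presumably a header maps the old name onto the quiet variant,
+roughly (an assumed compatibility wrapper, not shown in this excerpt):
+
+	#define invalidate_inodes(sb)	invalidate_inodes_check(sb, 0)
+
+so existing callers compile unchanged while VE teardown paths can pass
+check=1 to get the diagnostic dump.
+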
+diff --git a/fs/inotify.c b/fs/inotify.c
+index 690e725..01ddb06 100644
+--- a/fs/inotify.c
++++ b/fs/inotify.c
+@@ -32,6 +32,7 @@
+ #include <linux/list.h>
+ #include <linux/writeback.h>
+ #include <linux/inotify.h>
++#include <linux/mount.h>
+ 
+ static atomic_t inotify_cookie;
+ 
+@@ -69,19 +70,6 @@ static atomic_t inotify_cookie;
+  * inotify_add_watch() to the final put_inotify_watch().
+  */
+ 
+-/*
+- * struct inotify_handle - represents an inotify instance
+- *
+- * This structure is protected by the mutex 'mutex'.
+- */
+-struct inotify_handle {
+-	struct idr		idr;		/* idr mapping wd -> watch */
+-	struct mutex		mutex;		/* protects this bad boy */
+-	struct list_head	watches;	/* list of watches */
+-	atomic_t		count;		/* reference count */
+-	u32			last_wd;	/* the last wd allocated */
+-	const struct inotify_operations *in_ops; /* inotify caller operations */
+-};
+ 
+ static inline void get_inotify_handle(struct inotify_handle *ih)
+ {
+@@ -118,6 +106,9 @@ void put_inotify_watch(struct inotify_watch *watch)
+ 		struct inotify_handle *ih = watch->ih;
+ 
+ 		iput(watch->inode);
++		path_put(&watch->path);
++		watch->path.dentry = NULL;
++		watch->path.mnt = NULL;
+ 		ih->in_ops->destroy_watch(watch);
+ 		put_inotify_handle(ih);
+ 	}
+@@ -476,6 +467,8 @@ void inotify_init_watch(struct inotify_watch *watch)
+ 	INIT_LIST_HEAD(&watch->i_list);
+ 	atomic_set(&watch->count, 0);
+ 	get_inotify_watch(watch); /* initial get */
++	watch->path.dentry = NULL;
++	watch->path.mnt = NULL;
+ }
+ EXPORT_SYMBOL_GPL(inotify_init_watch);
+ 
+@@ -616,8 +609,8 @@ EXPORT_SYMBOL_GPL(inotify_find_update_watch);
+  * Caller must ensure it only calls inotify_add_watch() once per watch.
+  * Calls inotify_handle_get_wd() so may sleep.
+  */
+-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+-		      struct inode *inode, u32 mask)
++s32 __inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
++		struct path *path, struct inode * inode, u32 mask)
+ {
+ 	int ret = 0;
+ 	int newly_watched;
+@@ -645,6 +638,10 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+ 	 * Save a reference to the inode and bump the ref count to make it
+ 	 * official.  We hold a reference to nameidata, which makes this safe.
+ 	 */
++	if (path) {
++		path_get(path);
++		watch->path = *path;
++	}
+ 	watch->inode = igrab(inode);
+ 
+ 	/* Add the watch to the handle's and the inode's list */
+@@ -666,6 +663,18 @@ out:
+ }
+ EXPORT_SYMBOL_GPL(inotify_add_watch);
+ 
++s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
++		      struct inode *inode, u32 mask)
++{
++	return __inotify_add_watch(ih, watch, NULL, inode, mask);
++}
++
++s32 inotify_add_watch_dget(struct inotify_handle *ih,
++		struct inotify_watch *watch, struct path *p, u32 mask)
++{
++	return __inotify_add_watch(ih, watch, p, p->dentry->d_inode, mask);
++}
++
+ /**
+  * inotify_clone_watch - put the watch next to existing one
+  * @old: already installed watch
+diff --git a/fs/inotify_user.c b/fs/inotify_user.c
+index 6676c06..dd51c6d 100644
+--- a/fs/inotify_user.c
++++ b/fs/inotify_user.c
+@@ -20,6 +20,7 @@
+  */
+ 
+ #include <linux/kernel.h>
++#include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/slab.h>
+ #include <linux/fs.h>
+@@ -66,47 +67,6 @@ static int inotify_max_queued_events __read_mostly;
+  * first event, or to inotify_destroy().
+  */
+ 
+-/*
+- * struct inotify_device - represents an inotify instance
+- *
+- * This structure is protected by the mutex 'mutex'.
+- */
+-struct inotify_device {
+-	wait_queue_head_t 	wq;		/* wait queue for i/o */
+-	struct mutex		ev_mutex;	/* protects event queue */
+-	struct mutex		up_mutex;	/* synchronizes watch updates */
+-	struct list_head 	events;		/* list of queued events */
+-	atomic_t		count;		/* reference count */
+-	struct user_struct	*user;		/* user who opened this dev */
+-	struct inotify_handle	*ih;		/* inotify handle */
+-	struct fasync_struct    *fa;            /* async notification */
+-	unsigned int		queue_size;	/* size of the queue (bytes) */
+-	unsigned int		event_count;	/* number of pending events */
+-	unsigned int		max_events;	/* maximum number of events */
+-};
+-
+-/*
+- * struct inotify_kernel_event - An inotify event, originating from a watch and
+- * queued for user-space.  A list of these is attached to each instance of the
+- * device.  In read(), this list is walked and all events that can fit in the
+- * buffer are returned.
+- *
+- * Protected by dev->ev_mutex of the device in which we are queued.
+- */
+-struct inotify_kernel_event {
+-	struct inotify_event	event;	/* the user-space event */
+-	struct list_head        list;	/* entry in inotify_device's list */
+-	char			*name;	/* filename, if any */
+-};
+-
+-/*
+- * struct inotify_user_watch - our version of an inotify_watch, we add
+- * a reference to the associated inotify_device.
+- */
+-struct inotify_user_watch {
+-	struct inotify_device	*dev;	/* associated device */
+-	struct inotify_watch	wdata;	/* inotify watch data */
+-};
+ 
+ #ifdef CONFIG_SYSCTL
+ 
+@@ -376,8 +336,7 @@ static int find_inode(const char __user *dirname, struct nameidata *nd,
+  *
+  * Callers must hold dev->up_mutex.
+  */
+-static int create_watch(struct inotify_device *dev, struct inode *inode,
+-			u32 mask)
++int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask)
+ {
+ 	struct inotify_user_watch *watch;
+ 	int ret;
+@@ -397,12 +356,13 @@ static int create_watch(struct inotify_device *dev, struct inode *inode,
+ 	atomic_inc(&dev->user->inotify_watches);
+ 
+ 	inotify_init_watch(&watch->wdata);
+-	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
++	ret = inotify_add_watch_dget(dev->ih, &watch->wdata, p, mask);
+ 	if (ret < 0)
+ 		free_inotify_user_watch(&watch->wdata);
+ 
+ 	return ret;
+ }
++EXPORT_SYMBOL(inotify_create_watch);
+ 
+ /* Device Interface */
+ 
+@@ -552,7 +512,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
+ 	return ret;
+ }
+ 
+-static const struct file_operations inotify_fops = {
++const struct file_operations inotify_fops = {
+ 	.poll           = inotify_poll,
+ 	.read           = inotify_read,
+ 	.fasync         = inotify_fasync,
+@@ -560,6 +520,7 @@ static const struct file_operations inotify_fops = {
+ 	.unlocked_ioctl = inotify_ioctl,
+ 	.compat_ioctl	= inotify_ioctl,
+ };
++EXPORT_SYMBOL(inotify_fops);
+ 
+ static const struct inotify_operations inotify_user_ops = {
+ 	.handle_event	= inotify_dev_queue_event,
+@@ -637,6 +598,7 @@ out_put_fd:
+ 	put_unused_fd(fd);
+ 	return ret;
+ }
++EXPORT_SYMBOL(sys_inotify_init);
+ 
+ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+ {
+@@ -673,7 +635,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+ 	mutex_lock(&dev->up_mutex);
+ 	ret = inotify_find_update_watch(dev->ih, inode, mask);
+ 	if (ret == -ENOENT)
+-		ret = create_watch(dev, inode, mask);
++		ret = inotify_create_watch(dev, &nd.path, mask);
+ 	mutex_unlock(&dev->up_mutex);
+ 
+ 	path_put(&nd.path);
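+
+Because watches now pin a struct path and inotify_create_watch() is
+exported, a checkpoint/restore module can re-create user watches from a
+saved pathname. A hedged sketch following the sys_inotify_add_watch()
+flow above (restore_watch() is hypothetical):
+
+	static int restore_watch(struct inotify_device *dev,
+				 const char *name, u32 mask)
+	{
+		struct nameidata nd;
+		int ret;
+
+		ret = path_lookup(name, LOOKUP_FOLLOW, &nd);
+		if (ret)
+			return ret;
+		mutex_lock(&dev->up_mutex);
+		ret = inotify_create_watch(dev, &nd.path, mask);
+		mutex_unlock(&dev->up_mutex);
+		path_put(&nd.path);
+		return ret;
+	}
+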
+diff --git a/fs/ioprio.c b/fs/ioprio.c
+index c4a1c3c..08b7f78 100644
+--- a/fs/ioprio.c
++++ b/fs/ioprio.c
+@@ -26,6 +26,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/security.h>
+ #include <linux/pid_namespace.h>
++#include <linux/nsproxy.h>
++#include <bc/io_prio.h>
+ 
+ static int set_task_ioprio(struct task_struct *task, int ioprio)
+ {
+@@ -71,8 +73,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
+ 	int data = IOPRIO_PRIO_DATA(ioprio);
+ 	struct task_struct *p, *g;
+ 	struct user_struct *user;
+-	struct pid *pgrp;
+ 	int ret;
++	struct pid *pgrp;
++
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
+ 
+ 	switch (class) {
+ 		case IOPRIO_CLASS_RT:
+@@ -130,17 +135,23 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
+ 			if (!user)
+ 				break;
+ 
+-			do_each_thread(g, p) {
++			do_each_thread_all(g, p) {
+ 				if (p->uid != who)
+ 					continue;
+ 				ret = set_task_ioprio(p, ioprio);
+ 				if (ret)
+ 					goto free_uid;
+-			} while_each_thread(g, p);
++			} while_each_thread_all(g, p);
+ free_uid:
+ 			if (who)
+ 				free_uid(user);
+ 			break;
++		case IOPRIO_WHO_UBC:
++			if (class != IOPRIO_CLASS_BE)
++				return -ERANGE;
++
++			ret = bc_set_ioprio(who, data);
++			break;
+ 		default:
+ 			ret = -EINVAL;
+ 	}
+@@ -185,9 +196,9 @@ asmlinkage long sys_ioprio_get(int which, int who)
+ {
+ 	struct task_struct *g, *p;
+ 	struct user_struct *user;
+-	struct pid *pgrp;
+ 	int ret = -ESRCH;
+ 	int tmpio;
++	struct pid *pgrp;
+ 
+ 	read_lock(&tasklist_lock);
+ 	switch (which) {
+@@ -223,7 +234,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
+ 			if (!user)
+ 				break;
+ 
+-			do_each_thread(g, p) {
++			do_each_thread_ve(g, p) {
+ 				if (p->uid != user->uid)
+ 					continue;
+ 				tmpio = get_task_ioprio(p);
+@@ -233,7 +244,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
+ 					ret = tmpio;
+ 				else
+ 					ret = ioprio_best(ret, tmpio);
+-			} while_each_thread(g, p);
++			} while_each_thread_ve(g, p);
+ 
+ 			if (who)
+ 				free_uid(user);
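+
+The new IOPRIO_WHO_UBC target lets the host assign a best-effort I/O
+priority to a whole beancounter. From userspace that would look roughly
+as follows; the IOPRIO_WHO_UBC value and the priority macros are
+assumptions taken from the patched headers, which this excerpt does not
+show:
+
+	#include <sys/syscall.h>
+	#include <unistd.h>
+
+	#define IOPRIO_CLASS_BE			2
+	#define IOPRIO_WHO_UBC			1000	/* assumed value */
+	#define IOPRIO_PRIO_VALUE(class, data)	(((class) << 13) | (data))
+
+	int set_ubc_ioprio(int ubc_id, int prio)
+	{
+		return syscall(SYS_ioprio_set, IOPRIO_WHO_UBC, ubc_id,
+			       IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, prio));
+	}
+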
+diff --git a/fs/locks.c b/fs/locks.c
+index dce8c74..4a37766 100644
+--- a/fs/locks.c
++++ b/fs/locks.c
+@@ -130,6 +130,8 @@
+ 
+ #include <asm/uaccess.h>
+ 
++#include <bc/misc.h>
++
+ #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
+ #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
+ #define IS_LEASE(fl)	(fl->fl_flags & FL_LEASE)
+@@ -146,9 +148,25 @@ static LIST_HEAD(blocked_list);
+ static struct kmem_cache *filelock_cache __read_mostly;
+ 
+ /* Allocate an empty lock structure. */
+-static struct file_lock *locks_alloc_lock(void)
++static struct file_lock *locks_alloc_lock(int charge)
+ {
+-	return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
++	struct file_lock *fl;
++
++	fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL);
++#ifdef CONFIG_BEANCOUNTERS
++	if (fl == NULL)
++		goto out;
++	fl->fl_charged = 0;
++	if (!charge)
++		goto out;
++	if (!ub_flock_charge(fl, 1))
++		goto out;
++
++	kmem_cache_free(filelock_cache, fl);
++	fl = NULL;
++out:
++#endif
++	return fl;
+ }
+ 
+ static void locks_release_private(struct file_lock *fl)
+@@ -173,6 +191,7 @@ static void locks_free_lock(struct file_lock *fl)
+ 	BUG_ON(!list_empty(&fl->fl_block));
+ 	BUG_ON(!list_empty(&fl->fl_link));
+ 
++	ub_flock_uncharge(fl);
+ 	locks_release_private(fl);
+ 	kmem_cache_free(filelock_cache, fl);
+ }
+@@ -276,7 +295,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
+ 	if (type < 0)
+ 		return type;
+ 	
+-	fl = locks_alloc_lock();
++	fl = locks_alloc_lock(type != F_UNLCK);
+ 	if (fl == NULL)
+ 		return -ENOMEM;
+ 
+@@ -463,7 +482,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl)
+ /* Allocate a file_lock initialised to this type of lease */
+ static struct file_lock *lease_alloc(struct file *filp, int type)
+ {
+-	struct file_lock *fl = locks_alloc_lock();
++	struct file_lock *fl = locks_alloc_lock(1);
+ 	int error = -ENOMEM;
+ 
+ 	if (fl == NULL)
+@@ -734,8 +753,13 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
+ 		goto find_conflict;
+ 
+ 	if (request->fl_type != F_UNLCK) {
++		/*
++		 * A non-F_UNLCK request must already be charged in
++		 * flock_make_lock(). Strictly speaking new_fl must be charged,
++		 * not the request, but we try to fail earlier.
++		 */
+ 		error = -ENOMEM;
+-		new_fl = locks_alloc_lock();
++		new_fl = locks_alloc_lock(0);
+ 		if (new_fl == NULL)
+ 			goto out;
+ 		error = 0;
+@@ -785,6 +809,10 @@ find_conflict:
+ 	}
+ 	if (request->fl_flags & FL_ACCESS)
+ 		goto out;
++
++	set_flock_charged(new_fl);
++	unset_flock_charged(request);
++
+ 	locks_copy_lock(new_fl, request);
+ 	locks_insert_lock(before, new_fl);
+ 	new_fl = NULL;
+@@ -816,8 +844,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
+ 	if (!(request->fl_flags & FL_ACCESS) &&
+ 	    (request->fl_type != F_UNLCK ||
+ 	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
+-		new_fl = locks_alloc_lock();
+-		new_fl2 = locks_alloc_lock();
++		if (request->fl_type != F_UNLCK)
++			new_fl = locks_alloc_lock(1);
++		else
++			new_fl = NULL;
++		new_fl2 = locks_alloc_lock(0);
+ 	}
+ 
+ 	lock_kernel();
+@@ -951,7 +982,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
+ 	 * bail out.
+ 	 */
+ 	error = -ENOLCK; /* "no luck" */
+-	if (right && left == right && !new_fl2)
++	if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2))
+ 		goto out;
+ 
+ 	error = 0;
+@@ -962,23 +993,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
+ 			goto out;
+ 		}
+ 
+-		if (!new_fl) {
+-			error = -ENOLCK;
++		error = -ENOLCK;
++		if (!new_fl)
++			goto out;
++		if (right && (left == right) && ub_flock_charge(new_fl, 1))
+ 			goto out;
+-		}
+ 		locks_copy_lock(new_fl, request);
+ 		locks_insert_lock(before, new_fl);
+ 		new_fl = NULL;
++		error = 0;
+ 	}
+ 	if (right) {
+ 		if (left == right) {
+ 			/* The new lock breaks the old one in two pieces,
+ 			 * so we have to use the second new lock.
+ 			 */
++			error = -ENOLCK;
++			if (added && ub_flock_charge(new_fl2,
++						request->fl_type != F_UNLCK))
++				goto out;
++			/* FIXME: move all fl_charged manipulations into the ub code */
++			set_flock_charged(new_fl2);
+ 			left = new_fl2;
+ 			new_fl2 = NULL;
+ 			locks_copy_lock(left, right);
+ 			locks_insert_lock(before, left);
++			error = 0;
+ 		}
+ 		right->fl_start = request->fl_end + 1;
+ 		locks_wake_up_blocks(right);
+@@ -1365,7 +1405,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+ 
+ 	if (arg != F_UNLCK) {
+ 		error = -ENOMEM;
+-		new_fl = locks_alloc_lock();
++		new_fl = locks_alloc_lock(1);
+ 		if (new_fl == NULL)
+ 			goto out;
+ 
+@@ -1608,6 +1648,7 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
+  out:
+ 	return error;
+ }
++EXPORT_SYMBOL_GPL(sys_flock);
+ 
+ /**
+  * vfs_test_lock - test file byte range lock
+@@ -1744,7 +1785,7 @@ EXPORT_SYMBOL_GPL(vfs_lock_file);
+ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ 		struct flock __user *l)
+ {
+-	struct file_lock *file_lock = locks_alloc_lock();
++	struct file_lock *file_lock = locks_alloc_lock(0);
+ 	struct flock flock;
+ 	struct inode *inode;
+ 	struct file *f;
+@@ -1881,7 +1922,7 @@ out:
+ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ 		struct flock64 __user *l)
+ {
+-	struct file_lock *file_lock = locks_alloc_lock();
++	struct file_lock *file_lock = locks_alloc_lock(0);
+ 	struct flock64 flock;
+ 	struct inode *inode;
+ 	struct file *f;
+@@ -2170,6 +2211,8 @@ static int locks_show(struct seq_file *f, void *v)
+ 	struct file_lock *fl, *bfl;
+ 
+ 	fl = list_entry(v, struct file_lock, fl_link);
++	if (!ve_accessible(fl->fl_file->owner_env, get_exec_env()))
++		goto out;
+ 
+ 	lock_get_status(f, fl, (long)f->private, "");
+ 
+@@ -2177,6 +2220,7 @@ static int locks_show(struct seq_file *f, void *v)
+ 		lock_get_status(f, bfl, (long)f->private, " ->");
+ 
+ 	f->private++;
++out:
+ 	return 0;
+ }
+ 
+@@ -2286,7 +2330,7 @@ EXPORT_SYMBOL(lock_may_write);
+ static int __init filelock_init(void)
+ {
+ 	filelock_cache = kmem_cache_create("file_lock_cache",
+-			sizeof(struct file_lock), 0, SLAB_PANIC,
++			sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC,
+ 			init_once);
+ 	return 0;
+ }
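+
+The locks.c changes thread a "charged" flag through every file_lock
+allocation so that only requests that can pin resources (anything but
+F_UNLCK) are billed to the beancounter. Judging from the call sites, the
+<bc/misc.h> interface is shaped roughly like this (a hedged
+reconstruction, not the verbatim header):
+
+	int  ub_flock_charge(struct file_lock *fl, int hard);	/* 0 on success */
+	void ub_flock_uncharge(struct file_lock *fl);
+	void set_flock_charged(struct file_lock *fl);	/* fl->fl_charged = 1 */
+	void unset_flock_charged(struct file_lock *fl);	/* fl->fl_charged = 0 */
+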
+diff --git a/fs/namei.c b/fs/namei.c
+index 01e67dd..4092158 100644
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -142,6 +142,7 @@ char * getname(const char __user * filename)
+ {
+ 	char *tmp, *result;
+ 
++	/*ub_dentry_checkup();*/
+ 	result = ERR_PTR(-ENOMEM);
+ 	tmp = __getname();
+ 	if (tmp)  {
+@@ -443,6 +444,21 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
+ 	if (!dentry)
+ 		dentry = d_lookup(parent, name);
+ 
++	/*
++	 * The revalidation rules are simple:
++	 * the d_revalidate operation is called when we are about to use a
++	 * cached dentry rather than call d_lookup.
++	 * The d_revalidate method may unhash the dentry itself or return
++	 * FALSE; in that case, if the dentry can be released, d_lookup is called.
++	 *
++	 * Additionally, by request of NFS people
++	 * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c)
++	 * d_revalidate is called when `/', `.' or `..' are looked up.
++	 * Since re-lookup is impossible on them, we introduce a hack and
++	 * return an error in this case.
++	 *
++	 *     2003/02/19  SAW
++	 */
+ 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+ 		dentry = do_revalidate(dentry, nd);
+ 
+@@ -502,6 +518,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
+ 	struct dentry * result;
+ 	struct inode *dir = parent->d_inode;
+ 
++repeat:
+ 	mutex_lock(&dir->i_mutex);
+ 	/*
+ 	 * First re-do the cached lookup just in case it was created
+@@ -540,7 +557,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
+ 	if (result->d_op && result->d_op->d_revalidate) {
+ 		result = do_revalidate(result, nd);
+ 		if (!result)
+-			result = ERR_PTR(-ENOENT);
++			goto repeat;
+ 	}
+ 	return result;
+ }
+@@ -794,6 +811,13 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
+                         read_unlock(&fs->lock);
+ 			break;
+ 		}
++#ifdef CONFIG_VE
++		if (nd->path.dentry == get_exec_env()->root_path.dentry &&
++		    nd->path.mnt == get_exec_env()->root_path.mnt) {
++			read_unlock(&current->fs->lock);
++			break;
++		}
++#endif
+                 read_unlock(&fs->lock);
+ 		spin_lock(&dcache_lock);
+ 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+@@ -835,6 +859,10 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
+ 	if (dentry->d_op && dentry->d_op->d_revalidate)
+ 		goto need_revalidate;
+ done:
++	if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) {
++		dput(dentry);
++		return -ENOENT;
++	}
+ 	path->mnt = mnt;
+ 	path->dentry = dentry;
+ 	__follow_mount(path);
+@@ -872,6 +900,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
+ 	struct inode *inode;
+ 	int err;
+ 	unsigned int lookup_flags = nd->flags;
++	int real_components = 0;
+ 	
+ 	while (*name=='/')
+ 		name++;
+@@ -942,6 +971,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
+ 				break;
+ 		}
+ 		/* This does the actual lookups.. */
++		real_components++;
+ 		err = do_lookup(nd, &this, &next);
+ 		if (err)
+ 			break;
+@@ -955,6 +985,9 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
+ 			goto out_dput;
+ 
+ 		if (inode->i_op->follow_link) {
++			err = -ENOENT;
++			if (lookup_flags & LOOKUP_STRICT)
++				goto out_dput;
+ 			err = do_follow_link(&next, nd);
+ 			if (err)
+ 				goto return_err;
+@@ -1003,6 +1036,7 @@ last_component:
+ 			break;
+ 		inode = next.dentry->d_inode;
+ 		if ((lookup_flags & LOOKUP_FOLLOW)
++		    && !(lookup_flags & LOOKUP_STRICT)
+ 		    && inode && inode->i_op && inode->i_op->follow_link) {
+ 			err = do_follow_link(&next, nd);
+ 			if (err)
+@@ -1024,27 +1058,41 @@ lookup_parent:
+ 		nd->last_type = LAST_NORM;
+ 		if (this.name[0] != '.')
+ 			goto return_base;
+-		if (this.len == 1)
++		if (this.len == 1) {
+ 			nd->last_type = LAST_DOT;
+-		else if (this.len == 2 && this.name[1] == '.')
++			goto return_reval;
++		} else if (this.len == 2 && this.name[1] == '.') {
+ 			nd->last_type = LAST_DOTDOT;
+-		else
+-			goto return_base;
++			goto return_reval;
++		}
++return_base:
++		if (!(nd->flags & LOOKUP_NOAREACHECK)) {
++			err = check_area_access_ve(&nd->path);
++			if (err)
++				break;
++		}
++		return 0;
+ return_reval:
+ 		/*
+ 		 * We bypassed the ordinary revalidation routines.
+ 		 * We may need to check the cached dentry for staleness.
+ 		 */
+-		if (nd->path.dentry && nd->path.dentry->d_sb &&
++		if (!real_components && nd->path.dentry && nd->path.dentry->d_sb &&
+ 		    (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+ 			err = -ESTALE;
+ 			/* Note: we do not d_invalidate() */
+ 			if (!nd->path.dentry->d_op->d_revalidate(
+ 					nd->path.dentry, nd))
++				/*
++				 * This lookup is for `/' or `.' or `..'.
++				 * The filesystem unhashed the dentry itself
++				 * inside d_revalidate (otherwise, d_invalidate
++				 * wouldn't succeed).  As a special courtesy to
++				 * NFS we return an error.   2003/02/19  SAW
++				 */
+ 				break;
+ 		}
+-return_base:
+-		return 0;
++		goto return_base;
+ out_dput:
+ 		path_put_conditional(&next, nd);
+ 		break;
+@@ -2126,6 +2174,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+ {
+ 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
+ }
++EXPORT_SYMBOL_GPL(sys_mknod);
+ 
+ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+ {
+@@ -2191,6 +2240,7 @@ asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+ {
+ 	return sys_mkdirat(AT_FDCWD, pathname, mode);
+ }
++EXPORT_SYMBOL_GPL(sys_mkdir);
+ 
+ /*
+  * We try to drop the dentry early: we should have
+@@ -2218,6 +2268,7 @@ void dentry_unhash(struct dentry *dentry)
+ 	spin_unlock(&dentry->d_lock);
+ 	spin_unlock(&dcache_lock);
+ }
++EXPORT_SYMBOL(sys_symlink);
+ 
+ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+ {
+@@ -2303,6 +2354,7 @@ asmlinkage long sys_rmdir(const char __user *pathname)
+ {
+ 	return do_rmdir(AT_FDCWD, pathname);
+ }
++EXPORT_SYMBOL_GPL(sys_rmdir);
+ 
+ int vfs_unlink(struct inode *dir, struct dentry *dentry)
+ {
+@@ -2407,6 +2459,7 @@ asmlinkage long sys_unlink(const char __user *pathname)
+ {
+ 	return do_unlinkat(AT_FDCWD, pathname);
+ }
++EXPORT_SYMBOL_GPL(sys_unlink);
+ 
+ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
+ {
+@@ -2577,6 +2630,7 @@ asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+ {
+ 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ }
++EXPORT_SYMBOL(sys_rename);
+ 
+ /*
+  * The worst of all namespace operations - renaming directory. "Perverted"
+@@ -2688,6 +2742,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+ 	const char *old_name;
+ 
++	if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir))
++		return -EXDEV;
++
+ 	if (old_dentry->d_inode == new_dentry->d_inode)
+  		return 0;
+  
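+
+The namei.c hunks introduce LOOKUP_STRICT: a strict walk fails with
+-ENOENT instead of following a symlink or crossing a mount point, so
+host-side code resolving paths inside a container cannot be redirected
+by a hostile container filesystem. A hedged usage sketch
+(open_ct_file() is hypothetical; on success dentry_open() consumes the
+references taken by path_lookup()):
+
+	static struct file *open_ct_file(const char *path)
+	{
+		struct nameidata nd;
+		int err;
+
+		err = path_lookup(path, LOOKUP_STRICT, &nd);
+		if (err)
+			return ERR_PTR(err);
+		return dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
+	}
+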
+diff --git a/fs/namespace.c b/fs/namespace.c
+index 4fc302c..6873efd 100644
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -37,6 +37,7 @@
+ 
+ /* spinlock for vfsmount related operations, inplace of dcache_lock */
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
++EXPORT_SYMBOL(vfsmount_lock);
+ 
+ static int event;
+ static DEFINE_IDA(mnt_id_ida);
+@@ -44,7 +45,8 @@ static DEFINE_IDA(mnt_group_ida);
+ 
+ static struct list_head *mount_hashtable __read_mostly;
+ static struct kmem_cache *mnt_cache __read_mostly;
+-static struct rw_semaphore namespace_sem;
++struct rw_semaphore namespace_sem;
++EXPORT_SYMBOL_GPL(namespace_sem);
+ 
+ /* /sys/fs */
+ struct kobject *fs_kobj;
+@@ -117,6 +119,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
+ 			return NULL;
+ 		}
+ 
++		mnt->owner = VEID(get_exec_env());
+ 		atomic_set(&mnt->mnt_count, 1);
+ 		INIT_LIST_HEAD(&mnt->mnt_hash);
+ 		INIT_LIST_HEAD(&mnt->mnt_child);
+@@ -129,7 +132,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
+ 		atomic_set(&mnt->__mnt_writers, 0);
+ 		if (name) {
+ 			int size = strlen(name) + 1;
+-			char *newname = kmalloc(size, GFP_KERNEL);
++			char *newname = kmalloc(size, GFP_KERNEL_UBC);
+ 			if (newname) {
+ 				memcpy(newname, name, size);
+ 				mnt->mnt_devname = newname;
+@@ -794,15 +797,48 @@ static void show_type(struct seq_file *m, struct super_block *sb)
+ 	}
+ }
+ 
++static int prepare_mnt_root_mangle(struct path *path,
++		char **path_buf, char **ret_path)
++{
++	/* skip FS_NOMOUNT mounts (rootfs) */
++	if (path->mnt->mnt_sb->s_flags & MS_NOUSER)
++		return -EACCES;
++
++	*path_buf = (char *)__get_free_page(GFP_KERNEL);
++	if (!*path_buf)
++		return -ENOMEM;
++
++	*ret_path = d_path(path, *path_buf, PAGE_SIZE);
++	if (IS_ERR(*ret_path)) {
++		free_page((unsigned long)*path_buf);
++		/*
++		 * This means that the file position will be incremented, i.e.
++		 * the total number of "invisible" vfsmnt will leak.
++		 * the total number of "invisible" vfsmnts will leak.
++		return -EACCES;
++	}
++	return 0;
++}
++
+ static int show_vfsmnt(struct seq_file *m, void *v)
+ {
+ 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+-	int err = 0;
++	int err;
+ 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
++	char *path_buf, *path;
+ 
+-	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
++	err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
++	if (err < 0)
++		return (err == -EACCES ? 0 : err);
++
++	if (ve_is_super(get_exec_env()) ||
++	    !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC))
++		mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
++	else
++		mangle(m, mnt->mnt_sb->s_type->name);
+ 	seq_putc(m, ' ');
+-	seq_path(m, &mnt_path, " \t\n\\");
++	mangle(m, path);
++	free_page((unsigned long) path_buf);
+ 	seq_putc(m, ' ');
+ 	show_type(m, mnt->mnt_sb);
+ 	seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
+@@ -883,18 +919,27 @@ static int show_vfsstat(struct seq_file *m, void *v)
+ {
+ 	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
+ 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+-	int err = 0;
++	char *path_buf, *path;
++	int err;
++
++	err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
++	if (err < 0)
++		return (err == -EACCES ? 0 : err);
+ 
+ 	/* device */
+ 	if (mnt->mnt_devname) {
+ 		seq_puts(m, "device ");
+-		mangle(m, mnt->mnt_devname);
++		if (ve_is_super(get_exec_env()))
++			mangle(m, mnt->mnt_devname);
++		else
++			mangle(m, mnt->mnt_sb->s_type->name);
+ 	} else
+ 		seq_puts(m, "no device");
+ 
+ 	/* mount point */
+ 	seq_puts(m, " mounted on ");
+-	seq_path(m, &mnt_path, " \t\n\\");
++	mangle(m, path);
++	free_page((unsigned long)path_buf);
+ 	seq_putc(m, ' ');
+ 
+ 	/* file system type */
+@@ -1111,6 +1156,34 @@ static int do_umount(struct vfsmount *mnt, int flags)
+ 	return retval;
+ }
+ 
++#ifdef CONFIG_VE
++void umount_ve_fs_type(struct file_system_type *local_fs_type)
++{
++	struct vfsmount *mnt;
++	struct list_head *p, *q;
++	LIST_HEAD(kill);
++	LIST_HEAD(umount_list);
++
++	down_write(&namespace_sem);
++	spin_lock(&vfsmount_lock);
++	list_for_each_safe(p, q, &current->nsproxy->mnt_ns->list) {
++		mnt = list_entry(p, struct vfsmount, mnt_list);
++		if (mnt->mnt_sb->s_type != local_fs_type)
++			continue;
++		list_del(p);
++		list_add(p, &kill);
++	}
++
++	while (!list_empty(&kill)) {
++		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
++		umount_tree(mnt, 1, &umount_list);
++	}
++	spin_unlock(&vfsmount_lock);
++	up_write(&namespace_sem);
++	release_mounts(&umount_list);
++}
++#endif
++
+ /*
+  * Now umount can handle mount points as well as block devices.
+  * This is important for filesystems which use unnamed block devices.
+@@ -1134,7 +1207,7 @@ asmlinkage long sys_umount(char __user * name, int flags)
+ 		goto dput_and_out;
+ 
+ 	retval = -EPERM;
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		goto dput_and_out;
+ 
+ 	retval = do_umount(nd.path.mnt, flags);
+@@ -1160,7 +1233,7 @@ asmlinkage long sys_oldumount(char __user * name)
+ 
+ static int mount_is_safe(struct nameidata *nd)
+ {
+-	if (capable(CAP_SYS_ADMIN))
++	if (capable(CAP_VE_SYS_ADMIN))
+ 		return 0;
+ 	return -EPERM;
+ #ifdef notyet
+@@ -1430,6 +1503,8 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
+ 
+ 	if (nd->path.dentry != nd->path.mnt->mnt_root)
+ 		return -EINVAL;
++	if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid))
++		return -EPERM;
+ 
+ 	down_write(&namespace_sem);
+ 	if (type == MS_SHARED) {
+@@ -1453,7 +1528,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
+  * noinline this do_mount helper to save do_mount stack space.
+  */
+ static noinline int do_loopback(struct nameidata *nd, char *old_name,
+-				int recurse)
++				int recurse, int mnt_flags)
+ {
+ 	struct nameidata old_nd;
+ 	struct vfsmount *mnt = NULL;
+@@ -1483,6 +1558,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
+ 	if (!mnt)
+ 		goto out;
+ 
++	mnt->mnt_flags |= mnt_flags;
+ 	err = graft_tree(mnt, &nd->path);
+ 	if (err) {
+ 		LIST_HEAD(umount_list);
+@@ -1527,7 +1603,7 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+ 	int err;
+ 	struct super_block *sb = nd->path.mnt->mnt_sb;
+ 
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 
+ 	if (!check_mnt(nd->path.mnt))
+@@ -1536,6 +1612,9 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+ 	if (nd->path.dentry != nd->path.mnt->mnt_root)
+ 		return -EINVAL;
+ 
++	if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid))
++		return -EPERM;
++
+ 	down_write(&sb->s_umount);
+ 	if (flags & MS_BIND)
+ 		err = change_mount_flags(nd->path.mnt, flags);
+@@ -1568,7 +1647,7 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name)
+ 	struct path parent_path;
+ 	struct vfsmount *p;
+ 	int err = 0;
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	if (!old_name || !*old_name)
+ 		return -EINVAL;
+@@ -1576,6 +1655,10 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name)
+ 	if (err)
+ 		return err;
+ 
++	err = -EPERM;
++	if (!ve_accessible_veid(old_nd.path.mnt->owner, get_exec_env()->veid))
++		goto out_nosem;
++
+ 	down_write(&namespace_sem);
+ 	while (d_mountpoint(nd->path.dentry) &&
+ 	       follow_down(&nd->path.mnt, &nd->path.dentry))
+@@ -1633,6 +1716,7 @@ out:
+ 	up_write(&namespace_sem);
+ 	if (!err)
+ 		path_put(&parent_path);
++out_nosem:
+ 	path_put(&old_nd.path);
+ 	return err;
+ }
+@@ -1651,7 +1735,7 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
+ 		return -EINVAL;
+ 
+ 	/* we need capabilities... */
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 
+ 	mnt = do_kern_mount(type, flags, name, data);
+@@ -1690,6 +1774,11 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+ 		goto unlock;
+ 
+ 	newmnt->mnt_flags = mnt_flags;
++
++	/* make this before graft_tree reveals mnt_root to the world... */
++	if (nd->path.dentry->d_flags & DCACHE_VIRTUAL)
++		newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL;
++
+ 	if ((err = graft_tree(newmnt, &nd->path)))
+ 		goto unlock;
+ 
+@@ -1944,7 +2033,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
+ 		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ 				    data_page);
+ 	else if (flags & MS_BIND)
+-		retval = do_loopback(&nd, dev_name, flags & MS_REC);
++		retval = do_loopback(&nd, dev_name, flags & MS_REC, mnt_flags);
+ 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+ 		retval = do_change_type(&nd, flags);
+ 	else if (flags & MS_MOVE)
+@@ -2086,6 +2175,7 @@ out1:
+ 	free_page(type_page);
+ 	return retval;
+ }
++EXPORT_SYMBOL_GPL(sys_mount);
+ 
+ /*
+  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+@@ -2128,7 +2218,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
+ 	struct fs_struct *fs;
+ 
+ 	read_lock(&tasklist_lock);
+-	do_each_thread(g, p) {
++	do_each_thread_ve(g, p) {
+ 		task_lock(p);
+ 		fs = p->fs;
+ 		if (fs) {
+@@ -2143,7 +2233,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
+ 			put_fs_struct(fs);
+ 		} else
+ 			task_unlock(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_ve(g, p);
+ 	read_unlock(&tasklist_lock);
+ }
+ 
+@@ -2314,7 +2404,7 @@ void __init mnt_init(void)
+ 	init_rwsem(&namespace_sem);
+ 
+ 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
+-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
++			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL);
+ 
+ 	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+ 
+@@ -2351,3 +2441,4 @@ void __put_mnt_ns(struct mnt_namespace *ns)
+ 	release_mounts(&umount_list);
+ 	kfree(ns);
+ }
++EXPORT_SYMBOL_GPL(__put_mnt_ns);
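
The fs/namespace.c changes above tag every vfsmount with the VEID of the
environment that created it (mnt->owner) and relax CAP_SYS_ADMIN to
CAP_VE_SYS_ADMIN, so a container may only remount, move or umount what it
owns. A minimal user-space sketch of that ownership gate, assuming VEID 0
stands for the host environment (the real ve_accessible_veid() is defined
elsewhere in this patch):

#include <stdio.h>

/* Sketch only: the host (veid 0) passes for any owner, a container
 * passes only for mounts it created itself. */
static int ve_accessible_veid_sketch(unsigned int owner, unsigned int veid)
{
	return veid == 0 || owner == veid;
}

int main(void)
{
	printf("%d\n", ve_accessible_veid_sketch(101, 0));   /* 1: host */
	printf("%d\n", ve_accessible_veid_sketch(101, 101)); /* 1: own mount */
	printf("%d\n", ve_accessible_veid_sketch(101, 102)); /* 0: foreign VE */
	return 0;
}
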
+diff --git a/fs/open.c b/fs/open.c
+index a99ad09..2165ec5 100644
+--- a/fs/open.c
++++ b/fs/open.c
+@@ -25,6 +25,7 @@
+ #include <linux/fs.h>
+ #include <linux/personality.h>
+ #include <linux/pagemap.h>
++#include <linux/faudit.h>
+ #include <linux/syscalls.h>
+ #include <linux/rcupdate.h>
+ #include <linux/audit.h>
+@@ -51,7 +52,21 @@ int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+ 
+ EXPORT_SYMBOL(vfs_statfs);
+ 
+-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
++int faudit_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++	struct faudit_statfs_arg arg;
++
++	arg.sb = sb;
++	arg.stat = buf;
++
++	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg)
++			!= NOTIFY_DONE)
++		return arg.err;
++	return 0;
++}
++
++static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt,
++		struct statfs *buf)
+ {
+ 	struct kstatfs st;
+ 	int retval;
+@@ -60,6 +75,10 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+ 	if (retval)
+ 		return retval;
+ 
++	retval = faudit_statfs(mnt->mnt_sb, &st);
++	if (retval)
++		return retval;
++
+ 	if (sizeof(*buf) == sizeof(st))
+ 		memcpy(buf, &st, sizeof(st));
+ 	else {
+@@ -94,7 +113,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+ 	return 0;
+ }
+ 
+-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
++static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt,
++		struct statfs64 *buf)
+ {
+ 	struct kstatfs st;
+ 	int retval;
+@@ -103,6 +123,10 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+ 	if (retval)
+ 		return retval;
+ 
++	retval = faudit_statfs(mnt->mnt_sb, &st);
++	if (retval)
++		return retval;
++
+ 	if (sizeof(*buf) == sizeof(st))
+ 		memcpy(buf, &st, sizeof(st));
+ 	else {
+@@ -129,7 +153,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
+ 	error = user_path_walk(path, &nd);
+ 	if (!error) {
+ 		struct statfs tmp;
+-		error = vfs_statfs_native(nd.path.dentry, &tmp);
++		error = vfs_statfs_native(nd.path.dentry, nd.path.mnt, &tmp);
+ 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ 			error = -EFAULT;
+ 		path_put(&nd.path);
+@@ -148,7 +172,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
+ 	error = user_path_walk(path, &nd);
+ 	if (!error) {
+ 		struct statfs64 tmp;
+-		error = vfs_statfs64(nd.path.dentry, &tmp);
++		error = vfs_statfs64(nd.path.dentry, nd.path.mnt, &tmp);
+ 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ 			error = -EFAULT;
+ 		path_put(&nd.path);
+@@ -167,7 +191,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
+ 	file = fget(fd);
+ 	if (!file)
+ 		goto out;
+-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
++	error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp);
+ 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ 		error = -EFAULT;
+ 	fput(file);
+@@ -188,7 +212,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
+ 	file = fget(fd);
+ 	if (!file)
+ 		goto out;
+-	error = vfs_statfs64(file->f_path.dentry, &tmp);
++	error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp);
+ 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ 		error = -EFAULT;
+ 	fput(file);
+@@ -701,6 +725,7 @@ out_release:
+ out:
+ 	return error;
+ }
++EXPORT_SYMBOL_GPL(sys_chown);
+ 
+ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
+ 			     gid_t group, int flag)
+@@ -939,6 +964,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
+ 	return filp;
+ }
+ 
++int odirect_enable = 0;
+ /*
+  * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an
+  * error.
+@@ -960,6 +986,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
+ 		return ERR_PTR(-EINVAL);
+ 	}
+ 
++	if (!capable(CAP_SYS_RAWIO) && !odirect_enable)
++		flags &= ~O_DIRECT;
++
+ 	error = -ENFILE;
+ 	f = get_empty_filp();
+ 	if (f == NULL) {
+@@ -1115,6 +1144,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+ 	asmlinkage_protect(3, ret, filename, flags, mode);
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(sys_open);
+ 
+ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
+ 			   int mode)
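
dentry_open() above silently drops O_DIRECT for callers without
CAP_SYS_RAWIO unless the new odirect_enable knob is set, presumably to keep
unprivileged containers away from uncached direct I/O. The flag filtering in
isolation, as a runnable sketch (filter_open_flags() is an illustrative
name, not from the patch):

#define _GNU_SOURCE	/* O_DIRECT needs this on glibc */
#include <stdio.h>
#include <fcntl.h>

static int odirect_enable;	/* stand-in for the global added above */

static int filter_open_flags(int flags, int has_rawio)
{
	if (!has_rawio && !odirect_enable)
		flags &= ~O_DIRECT;	/* silently drop direct I/O */
	return flags;
}

int main(void)
{
	int f = O_RDWR | O_DIRECT;

	printf("%#x -> %#x\n", f, filter_open_flags(f, 0)); /* O_DIRECT gone */
	printf("%#x -> %#x\n", f, filter_open_flags(f, 1)); /* kept */
	return 0;
}
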
+diff --git a/fs/partitions/check.c b/fs/partitions/check.c
+index 6149e4b..c904faa 100644
+--- a/fs/partitions/check.c
++++ b/fs/partitions/check.c
+@@ -131,6 +131,7 @@ char *disk_name(struct gendisk *hd, int part, char *buf)
+ 
+ 	return buf;
+ }
++EXPORT_SYMBOL(disk_name);
+ 
+ const char *bdevname(struct block_device *bdev, char *buf)
+ {
+diff --git a/fs/pipe.c b/fs/pipe.c
+index 700f4e0..77ca617 100644
+--- a/fs/pipe.c
++++ b/fs/pipe.c
+@@ -22,6 +22,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+ 
++#include <bc/kmem.h>
++
+ /*
+  * We use a start+len construction, which provides full use of the 
+  * allocated memory.
+@@ -478,7 +480,7 @@ redo1:
+ 			int error, atomic = 1;
+ 
+ 			if (!page) {
+-				page = alloc_page(GFP_HIGHUSER);
++				page = alloc_page(GFP_HIGHUSER | __GFP_UBC);
+ 				if (unlikely(!page)) {
+ 					ret = ret ? : -ENOMEM;
+ 					break;
+@@ -856,7 +858,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
+ {
+ 	struct pipe_inode_info *pipe;
+ 
+-	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
++	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC);
+ 	if (pipe) {
+ 		init_waitqueue_head(&pipe->wait);
+ 		pipe->r_counter = pipe->w_counter = 1;
+@@ -1073,6 +1075,7 @@ int do_pipe(int *fd)
+ 	free_write_pipe(fw);
+ 	return error;
+ }
++EXPORT_SYMBOL_GPL(do_pipe);
+ 
+ /*
+  * sys_pipe() is the normal C calling standard for creating
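
The pipe.c hunks replace GFP_KERNEL and GFP_HIGHUSER with their UBC-charged
variants so pipe buffers count against the owning beancounter. A toy
charge-or-fail allocator conveying the idea; struct beancounter and
ub_malloc() here are hypothetical stand-ins for the bc/ machinery this
patch adds:

#include <stdio.h>
#include <stdlib.h>

struct beancounter {
	size_t held;
	size_t limit;
};

/* Charge the allocation to ub first; refuse instead of overcommitting. */
static void *ub_malloc(struct beancounter *ub, size_t size)
{
	if (ub->held + size > ub->limit)
		return NULL;
	ub->held += size;
	return malloc(size);
}

int main(void)
{
	struct beancounter ub = { .held = 0, .limit = 4096 };
	void *a = ub_malloc(&ub, 1024);	/* fits: charged */
	void *b = ub_malloc(&ub, 8192);	/* denied: would exceed the limit */

	printf("a=%p b=%p held=%zu\n", a, b, ub.held);
	free(a);
	return 0;
}
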
+diff --git a/fs/proc/array.c b/fs/proc/array.c
+index 797d775..6fd6695 100644
+--- a/fs/proc/array.c
++++ b/fs/proc/array.c
+@@ -81,6 +81,8 @@
+ #include <linux/seq_file.h>
+ #include <linux/pid_namespace.h>
+ 
++#include <bc/beancounter.h>
++
+ #include <asm/pgtable.h>
+ #include <asm/processor.h>
+ #include "internal.h"
+@@ -203,6 +205,15 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
+ 	put_group_info(group_info);
+ 
+ 	seq_printf(m, "\n");
++
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env())) {
++		seq_printf(m, "envID:\t%d\nVPid:\t%d\n",
++			p->ve_task_info.owner_env->veid, task_pid_vnr(p));
++		seq_printf(m, "PNState:\t%u\nStopState:\t%u\n",
++			p->pn_state, p->stopped_state);
++	}
++#endif
+ }
+ 
+ static void render_sigset_t(struct seq_file *m, const char *header,
+@@ -242,10 +253,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
+ 	}
+ }
+ 
+-static inline void task_sig(struct seq_file *m, struct task_struct *p)
++void task_sig(struct seq_file *m, struct task_struct *p)
+ {
+ 	unsigned long flags;
+-	sigset_t pending, shpending, blocked, ignored, caught;
++	sigset_t pending, shpending, blocked, ignored, caught, saved;
+ 	int num_threads = 0;
+ 	unsigned long qsize = 0;
+ 	unsigned long qlim = 0;
+@@ -255,12 +266,14 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
+ 	sigemptyset(&blocked);
+ 	sigemptyset(&ignored);
+ 	sigemptyset(&caught);
++	sigemptyset(&saved);
+ 
+ 	rcu_read_lock();
+ 	if (lock_task_sighand(p, &flags)) {
+ 		pending = p->pending.signal;
+ 		shpending = p->signal->shared_pending.signal;
+ 		blocked = p->blocked;
++		saved = p->saved_sigmask;
+ 		collect_sigign_sigcatch(p, &ignored, &caught);
+ 		num_threads = atomic_read(&p->signal->count);
+ 		qsize = atomic_read(&p->user->sigpending);
+@@ -278,6 +291,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
+ 	render_sigset_t(m, "SigBlk:\t", &blocked);
+ 	render_sigset_t(m, "SigIgn:\t", &ignored);
+ 	render_sigset_t(m, "SigCgt:\t", &caught);
++	render_sigset_t(m, "SigSvd:\t", &saved);
+ }
+ 
+ static void render_cap_t(struct seq_file *m, const char *header,
+@@ -301,6 +315,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
+ 	render_cap_t(m, "CapBnd:\t", &p->cap_bset);
+ }
+ 
++#ifdef CONFIG_BEANCOUNTERS
++static inline void ub_dump_task_info(struct task_struct *tsk,
++		char *stsk, int ltsk, char *smm, int lmm)
++{
++	print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk);
++	task_lock(tsk);
++	if (tsk->mm)
++		print_ub_uid(tsk->mm->mm_ub, smm, lmm);
++	else
++		strncpy(smm, "N/A", lmm);
++	task_unlock(tsk);
++}
++#endif
++
+ static inline void task_context_switch_counts(struct seq_file *m,
+ 						struct task_struct *p)
+ {
+@@ -314,6 +342,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
+ 			struct pid *pid, struct task_struct *task)
+ {
+ 	struct mm_struct *mm = get_task_mm(task);
++#ifdef CONFIG_BEANCOUNTERS
++	char tsk_ub_info[64], mm_ub_info[64];
++#endif
+ 
+ 	task_name(m, task);
+ 	task_state(m, ns, pid, task);
+@@ -329,6 +360,14 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
+ 	task_show_regs(m, task);
+ #endif
+ 	task_context_switch_counts(m, task);
++#ifdef CONFIG_BEANCOUNTERS
++	ub_dump_task_info(task,
++			tsk_ub_info, sizeof(tsk_ub_info),
++			mm_ub_info, sizeof(mm_ub_info));
++
++	seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info);
++	seq_printf(m, "MMUB:\t%s\n", mm_ub_info);
++#endif
+ 	return 0;
+ }
+ 
+@@ -410,6 +449,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ 	unsigned long rsslim = 0;
+ 	char tcomm[sizeof(task->comm)];
+ 	unsigned long flags;
++#ifdef CONFIG_BEANCOUNTERS
++	char ub_task_info[64];
++	char ub_mm_info[64];
++#endif
+ 
+ 	state = *get_task_state(task);
+ 	vsize = eip = esp = 0;
+@@ -488,6 +531,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ 	priority = task_prio(task);
+ 	nice = task_nice(task);
+ 
++#ifndef CONFIG_VE
+ 	/* Temporary variable needed for gcc-2.96 */
+ 	/* convert timespec -> nsec*/
+ 	start_time =
+@@ -495,10 +539,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ 				+ task->real_start_time.tv_nsec;
+ 	/* convert nsec -> ticks */
+ 	start_time = nsec_to_clock_t(start_time);
++#else
++	start_time = ve_relative_clock(&task->start_time);
++#endif
++
++#ifdef CONFIG_BEANCOUNTERS
++	ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info),
++				ub_mm_info, sizeof(ub_mm_info));
++#endif
+ 
+ 	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
+ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
+-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld"
++#ifdef CONFIG_VE
++	" 0 0 0 0 0 0 0 %d %u"
++#endif
++#ifdef CONFIG_BEANCOUNTERS
++	" %s %s"
++#endif
++	"\n",
+ 		pid_nr_ns(pid, ns),
+ 		tcomm,
+ 		state,
+@@ -545,7 +604,16 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ 		task->policy,
+ 		(unsigned long long)delayacct_blkio_ticks(task),
+ 		cputime_to_clock_t(gtime),
+-		cputime_to_clock_t(cgtime));
++		cputime_to_clock_t(cgtime)
++#ifdef CONFIG_VE
++		, task_pid_vnr(task),
++		VEID(VE_TASK_INFO(task)->owner_env)
++#endif
++#ifdef CONFIG_BEANCOUNTERS
++		, ub_task_info,
++		ub_mm_info
++#endif
++		);
+ 	if (mm)
+ 		mmput(mm);
+ 	return 0;
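
do_task_stat() above splices the extra VE and beancounter columns into
/proc/<pid>/stat with preprocessor conditionals inside one string literal,
so a single seq_printf() still emits the whole line. The same
concatenation trick in miniature (CONFIG_VE_SKETCH stands in for
CONFIG_VE):

#include <stdio.h>

#define CONFIG_VE_SKETCH 1

int main(void)
{
	printf("%d (%s) %c"
#if CONFIG_VE_SKETCH
	       " %d %u"		/* the extra per-VE columns */
#endif
	       "\n",
	       1, "init", 'S'
#if CONFIG_VE_SKETCH
	       , 1, 0u
#endif
	      );
	return 0;
}
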
+diff --git a/fs/proc/base.c b/fs/proc/base.c
+index 3b45537..fb40acb 100644
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -185,10 +185,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
+ 	}
+ 	if (fs) {
+ 		read_lock(&fs->lock);
+-		*path = fs->pwd;
+-		path_get(&fs->pwd);
++		result = d_root_check(&fs->pwd);
++		if (result == 0) {
++			*path = fs->pwd;
++			path_get(&fs->pwd);
++		}
+ 		read_unlock(&fs->lock);
+-		result = 0;
+ 		put_fs_struct(fs);
+ 	}
+ 	return result;
+@@ -511,17 +513,31 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
+ static int proc_fd_access_allowed(struct inode *inode)
+ {
+ 	struct task_struct *task;
+-	int allowed = 0;
++	int err;
++
+ 	/* Allow access to a task's file descriptors if it is us or we
+ 	 * may use ptrace attach to the process and find out that
+ 	 * information.
+ 	 */
++	err = -ENOENT;
+ 	task = get_proc_task(inode);
+ 	if (task) {
+-		allowed = ptrace_may_attach(task);
++		if (ptrace_may_attach(task))
++			err = 0;
++		else
++			/*
++			 * This clever ptrace_may_attach() may play a trick
++			 * on us. If the task is a zombie, it is considered
++			 * not dumpable at all, so any ptracing in a VE is
++			 * denied. Not a big deal for ptrace() itself, but
++			 * following the link will then fail with -EACCES.
++			 * Some software is unable to stand such a swindle
++			 * and refuses to work :(
++			 */
++			err = (task->mm ? -EACCES : -ENOENT);
+ 		put_task_struct(task);
+ 	}
+-	return allowed;
++	return err;
+ }
+ 
+ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
+@@ -996,6 +1012,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
+ 	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
+ 	     oom_adjust != OOM_DISABLE)
+ 		return -EINVAL;
++	if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env()))
++		return -EPERM;
+ 	if (*end == '\n')
+ 		end++;
+ 	task = get_proc_task(file->f_path.dentry->d_inode);
+@@ -1288,10 +1306,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path)
+ 	exe_file = get_mm_exe_file(mm);
+ 	mmput(mm);
+ 	if (exe_file) {
+-		*exe_path = exe_file->f_path;
+-		path_get(&exe_file->f_path);
++		int result;
++
++		result = d_root_check(&exe_file->f_path);
++		if (result == 0) {
++			*exe_path = exe_file->f_path;
++			path_get(&exe_file->f_path);
++		}
+ 		fput(exe_file);
+-		return 0;
++		return result;
+ 	} else
+ 		return -ENOENT;
+ }
+@@ -1299,13 +1322,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path)
+ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+ 	struct inode *inode = dentry->d_inode;
+-	int error = -EACCES;
++	int error;
+ 
+ 	/* We don't need a base pointer in the /proc filesystem */
+ 	path_put(&nd->path);
+ 
+ 	/* Are we allowed to snoop on the tasks file descriptors? */
+-	if (!proc_fd_access_allowed(inode))
++	error = proc_fd_access_allowed(inode);
++	if (error < 0)
+ 		goto out;
+ 
+ 	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+@@ -1340,12 +1364,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+ 
+ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
+ {
+-	int error = -EACCES;
++	int error;
+ 	struct inode *inode = dentry->d_inode;
+ 	struct path path;
+ 
+ 	/* Are we allowed to snoop on the tasks file descriptors? */
+-	if (!proc_fd_access_allowed(inode))
++	error = proc_fd_access_allowed(inode);
++	if (error < 0)
+ 		goto out;
+ 
+ 	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+@@ -1586,6 +1611,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+ 	struct files_struct *files = NULL;
+ 	struct file *file;
+ 	int fd = proc_fd(inode);
++	int err = -ENOENT;
+ 
+ 	if (task) {
+ 		files = get_files_struct(task);
+@@ -1598,7 +1624,8 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+ 		 */
+ 		spin_lock(&files->file_lock);
+ 		file = fcheck_files(files, fd);
+-		if (file) {
++		err = -EACCES;
++		if (file && !d_root_check(&file->f_path)) {
+ 			if (path) {
+ 				*path = file->f_path;
+ 				path_get(&file->f_path);
+@@ -1616,7 +1643,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+ 		spin_unlock(&files->file_lock);
+ 		put_files_struct(files);
+ 	}
+-	return -ENOENT;
++	return err;
+ }
+ 
+ static int proc_fd_link(struct inode *inode, struct path *path)
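
The /proc/<pid> link resolvers above now run each target through
d_root_check() before revealing it, so paths lying outside the caller's
visible root stay hidden from containers. A rough user-space analogue of
such a containment test (the paths are made up; d_root_check() itself is
defined elsewhere in this patch):

#include <stdio.h>
#include <string.h>

/* Rough analogue: path is visible iff it lives under root. */
static int visible_from_root(const char *root, const char *path)
{
	size_t n = strlen(root);

	return strncmp(path, root, n) == 0 &&
	       (path[n] == '/' || path[n] == '\0');
}

int main(void)
{
	printf("%d\n", visible_from_root("/vz/root/101", "/vz/root/101/etc"));
	printf("%d\n", visible_from_root("/vz/root/101", "/etc/passwd"));
	return 0;
}
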
+diff --git a/fs/proc/generic.c b/fs/proc/generic.c
+index 43e54e8..76240e2 100644
+--- a/fs/proc/generic.c
++++ b/fs/proc/generic.c
+@@ -228,6 +228,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+ 	struct proc_dir_entry *de = PDE(inode);
+ 	int error;
+ 
++	if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) &&
++			LPDE(inode) == PDE(inode))
++		return -EPERM;
++
+ 	error = inode_change_ok(inode, iattr);
+ 	if (error)
+ 		goto out;
+@@ -236,9 +240,12 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+ 	if (error)
+ 		goto out;
+ 	
+-	de->uid = inode->i_uid;
+-	de->gid = inode->i_gid;
+-	de->mode = inode->i_mode;
++	if (iattr->ia_valid & ATTR_UID)
++		de->uid = inode->i_uid;
++	if (iattr->ia_valid & ATTR_GID)
++		de->gid = inode->i_gid;
++	if (iattr->ia_valid & ATTR_MODE)
++		de->mode = inode->i_mode;
+ out:
+ 	return error;
+ }
+@@ -371,29 +378,61 @@ static struct dentry_operations proc_dentry_operations =
+ 	.d_delete	= proc_delete_dentry,
+ };
+ 
++static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir,
++		const char *name, int namelen)
++{
++	struct proc_dir_entry *de;
++
++	for (de = dir->subdir; de ; de = de->next) {
++		if (de->namelen != namelen)
++			continue;
++		if (memcmp(de->name, name, namelen))
++			continue;
++		break;
++	}
++	return de;
++}
++
+ /*
+  * Don't create negative dentries here, return -ENOENT by hand
+  * instead.
+  */
+-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
+-		struct dentry *dentry)
++struct dentry *proc_lookup_de(struct proc_dir_entry *de,
++		struct proc_dir_entry *lde,
++		struct inode *dir, struct dentry *dentry)
+ {
+ 	struct inode *inode = NULL;
+ 	int error = -ENOENT;
+ 
+ 	lock_kernel();
+ 	spin_lock(&proc_subdir_lock);
+-	for (de = de->subdir; de ; de = de->next) {
+-		if (de->namelen != dentry->d_name.len)
+-			continue;
+-		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
++	de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len);
++	if (lde != NULL)
++		lde = __proc_lookup(lde, dentry->d_name.name,
++				dentry->d_name.len);
++
++	if (de == NULL)
++		de = lde;
++
++	if (de != NULL) {
++		/*
++		 * de     lde    meaning   inode(g,l)
++		 * ------------------------------------
++		 * NULL   NULL   -ENOENT   *
++		 * X      NULL   global    X NULL
++		 * NULL   X      local     X X
++		 * X      Y      both      X Y
++		 */
++		{
+ 			unsigned int ino;
+ 
+ 			ino = de->low_ino;
+ 			de_get(de);
++			if (lde != NULL)
++				de_get(lde);
+ 			spin_unlock(&proc_subdir_lock);
+ 			error = -EINVAL;
+-			inode = proc_get_inode(dir->i_sb, ino, de);
++			inode = proc_get_inode(dir->i_sb, ino, de, lde);
+ 			goto out_unlock;
+ 		}
+ 	}
+@@ -408,13 +447,15 @@ out_unlock:
+ 	}
+ 	if (de)
+ 		de_put(de);
++	if (lde)
++		de_put(lde);
+ 	return ERR_PTR(error);
+ }
+ 
+ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+ 		struct nameidata *nd)
+ {
+-	return proc_lookup_de(PDE(dir), dir, dentry);
++	return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry);
+ }
+ 
+ /*
+@@ -426,13 +467,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+  * value of the readdir() call, as long as it's non-negative
+  * for success..
+  */
+-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+-		filldir_t filldir)
++int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde,
++		struct file *filp, void *dirent, filldir_t filldir)
+ {
+ 	unsigned int ino;
+ 	int i;
+ 	struct inode *inode = filp->f_path.dentry->d_inode;
+ 	int ret = 0;
++	struct proc_dir_entry *ode = de, *fde = NULL;
+ 
+ 	lock_kernel();
+ 
+@@ -455,25 +497,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+ 			/* fall through */
+ 		default:
+ 			spin_lock(&proc_subdir_lock);
+-			de = de->subdir;
+ 			i -= 2;
+-			for (;;) {
+-				if (!de) {
+-					ret = 1;
+-					spin_unlock(&proc_subdir_lock);
+-					goto out;
+-				}
+-				if (!i)
+-					break;
+-				de = de->next;
+-				i--;
+-			}
+-
+-			do {
++repeat:
++			de = de->subdir;
++			while (de != NULL) {
+ 				struct proc_dir_entry *next;
+ 
+-				/* filldir passes info to user space */
+ 				de_get(de);
++				if (i-- > 0 || (fde != NULL &&
++							__proc_lookup(fde,
++							de->name, de->namelen)))
++					goto skip;
++
++				/* filldir passes info to user space */
+ 				spin_unlock(&proc_subdir_lock);
+ 				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
+ 					    de->low_ino, de->mode >> 12) < 0) {
+@@ -482,10 +518,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+ 				}
+ 				spin_lock(&proc_subdir_lock);
+ 				filp->f_pos++;
++skip:
+ 				next = de->next;
+ 				de_put(de);
+ 				de = next;
+-			} while (de);
++			}
++
++			if (fde == NULL && lde != NULL && lde != ode) {
++				de = lde;
++				fde = ode;
++				goto repeat;
++			}
+ 			spin_unlock(&proc_subdir_lock);
+ 	}
+ 	ret = 1;
+@@ -497,7 +540,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+ {
+ 	struct inode *inode = filp->f_path.dentry->d_inode;
+ 
+-	return proc_readdir_de(PDE(inode), filp, dirent, filldir);
++	return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, filldir);
+ }
+ 
+ /*
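
proc_lookup_de() above merges two trees: the global entry (de) takes
precedence and the per-VE local entry (lde) fills in the rest, exactly as
the truth table in the hunk spells out. The precedence rule reduces to a
one-liner; struct pde here is a stripped-down stand-in for
proc_dir_entry:

#include <stdio.h>

struct pde { const char *name; };

/* Global entry first, local as fallback; NULL maps to -ENOENT. */
static const struct pde *resolve(const struct pde *de, const struct pde *lde)
{
	return de ? de : lde;
}

int main(void)
{
	struct pde g = { "global" }, l = { "local" };

	printf("%s\n", resolve(&g, &l)->name);	 /* both: global wins */
	printf("%s\n", resolve(NULL, &l)->name); /* local only */
	printf("%s\n", resolve(&g, NULL)->name); /* global only */
	return 0;
}
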
+diff --git a/fs/proc/inode.c b/fs/proc/inode.c
+index b08d100..4076902 100644
+--- a/fs/proc/inode.c
++++ b/fs/proc/inode.c
+@@ -385,7 +385,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
+ #endif
+ 
+ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
+-				struct proc_dir_entry *de)
++		struct proc_dir_entry *de, struct proc_dir_entry *lde)
+ {
+ 	struct inode * inode;
+ 
+@@ -399,6 +399,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
+ 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ 		PROC_I(inode)->fd = 0;
+ 		PROC_I(inode)->pde = de;
++#ifdef CONFIG_VE
++		PROC_I(inode)->lpde = lde;
++#endif
+ 
+ 		if (de->mode) {
+ 			inode->i_mode = de->mode;
+@@ -445,9 +448,11 @@ int proc_fill_super(struct super_block *s)
+ 	s->s_magic = PROC_SUPER_MAGIC;
+ 	s->s_op = &proc_sops;
+ 	s->s_time_gran = 1;
+-	
+-	de_get(&proc_root);
+-	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
++
++	de_get(get_exec_env()->proc_root);
++	de_get(&glob_proc_root);
++	root_inode = proc_get_inode(s, PROC_ROOT_INO,
++			&glob_proc_root, get_exec_env()->proc_root);
+ 	if (!root_inode)
+ 		goto out_no_root;
+ 	root_inode->i_uid = 0;
+diff --git a/fs/proc/internal.h b/fs/proc/internal.h
+index 28cbca8..e7825f8 100644
+--- a/fs/proc/internal.h
++++ b/fs/proc/internal.h
+@@ -12,6 +12,12 @@
+ #include <linux/proc_fs.h>
+ 
+ extern struct proc_dir_entry proc_root;
++#ifdef CONFIG_VE
++extern struct proc_dir_entry glob_proc_root;
++#else
++#define glob_proc_root	proc_root
++#endif
++
+ #ifdef CONFIG_PROC_SYSCTL
+ extern int proc_sys_init(void);
+ #else
+@@ -84,7 +90,8 @@ static inline int proc_fd(struct inode *inode)
+ 	return PROC_I(inode)->fd;
+ }
+ 
+-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
++struct dentry *proc_lookup_de(struct proc_dir_entry *de,
++		struct proc_dir_entry *lpde, struct inode *ino,
+ 		struct dentry *dentry);
+-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+-		filldir_t filldir);
++int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde,
++		struct file *filp, void *dirent, filldir_t filldir);
+diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
+index ff3b90b..8e70bca 100644
+--- a/fs/proc/kmsg.c
++++ b/fs/proc/kmsg.c
+@@ -11,6 +11,8 @@
+ #include <linux/kernel.h>
+ #include <linux/poll.h>
+ #include <linux/fs.h>
++#include <linux/veprintk.h>
++#include <linux/module.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/io.h>
+@@ -40,7 +42,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
+ 
+ static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+ {
+-	poll_wait(file, &log_wait, wait);
++	poll_wait(file, &ve_log_wait, wait);
+ 	if (do_syslog(9, NULL, 0))
+ 		return POLLIN | POLLRDNORM;
+ 	return 0;
+@@ -53,3 +55,4 @@ const struct file_operations proc_kmsg_operations = {
+ 	.open		= kmsg_open,
+ 	.release	= kmsg_release,
+ };
++EXPORT_SYMBOL_GPL(proc_kmsg_operations);
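
kmsg_poll() above waits on ve_log_wait instead of the global log_wait; the
pattern (from the linux/veprintk.h header this patch adds) appears to be
turning the old global name into a macro that dereferences the current
execution environment. A sketch of that indirection with hypothetical
names:

#include <stdio.h>

struct ve {
	int log_level;	/* stand-in for per-VE printk state */
};

static struct ve ve0 = { 0 };
static struct ve *cur_env = &ve0;	/* sketch of get_exec_env() */

/* The old global name becomes a per-environment lvalue. */
#define ve_log_level (cur_env->log_level)

int main(void)
{
	struct ve container = { 42 };

	printf("%d\n", ve_log_level);	/* host view: 0 */
	cur_env = &container;
	printf("%d\n", ve_log_level);	/* container view: 42 */
	return 0;
}
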
+diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
+index 7e277f2..3a7f65e 100644
+--- a/fs/proc/proc_misc.c
++++ b/fs/proc/proc_misc.c
+@@ -32,6 +32,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/swap.h>
+ #include <linux/slab.h>
++#include <linux/virtinfo.h>
+ #include <linux/genhd.h>
+ #include <linux/smp.h>
+ #include <linux/signal.h>
+@@ -48,6 +49,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/crash_dump.h>
+ #include <linux/pid_namespace.h>
++#include <linux/vmstat.h>
+ #include <linux/bootmem.h>
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -86,19 +88,39 @@ static int loadavg_read_proc(char *page, char **start, off_t off,
+ 	int a, b, c;
+ 	int len;
+ 	unsigned long seq;
++	long running, threads;
++	struct ve_struct *ve;
+ 
++	ve = get_exec_env();
+ 	do {
+ 		seq = read_seqbegin(&xtime_lock);
+-		a = avenrun[0] + (FIXED_1/200);
+-		b = avenrun[1] + (FIXED_1/200);
+-		c = avenrun[2] + (FIXED_1/200);
++		if (ve_is_super(ve)) {
++			a = avenrun[0] + (FIXED_1/200);
++			b = avenrun[1] + (FIXED_1/200);
++			c = avenrun[2] + (FIXED_1/200);
++#ifdef CONFIG_VE
++		} else {
++			a = ve->avenrun[0] + (FIXED_1/200);
++			b = ve->avenrun[1] + (FIXED_1/200);
++			c = ve->avenrun[2] + (FIXED_1/200);
++#endif
++		}
+ 	} while (read_seqretry(&xtime_lock, seq));
++	if (ve_is_super(ve)) {
++		running = nr_running();
++		threads = nr_threads;
++#ifdef CONFIG_VE
++	} else {
++		running = nr_running_ve(ve);
++		threads = atomic_read(&ve->pcounter);
++#endif
++	}
+ 
+-	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
++	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%ld %d\n",
+ 		LOAD_INT(a), LOAD_FRAC(a),
+ 		LOAD_INT(b), LOAD_FRAC(b),
+ 		LOAD_INT(c), LOAD_FRAC(c),
+-		nr_running(), nr_threads,
++		running, threads,
+ 		task_active_pid_ns(current)->last_pid);
+ 	return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+@@ -113,6 +135,13 @@ static int uptime_read_proc(char *page, char **start, off_t off,
+ 
+ 	do_posix_clock_monotonic_gettime(&uptime);
+ 	monotonic_to_bootbased(&uptime);
++#ifdef CONFIG_VE
++	if (!ve_is_super(get_exec_env())) {
++		set_normalized_timespec(&uptime,
++		      uptime.tv_sec - get_exec_env()->start_timespec.tv_sec,
++		      uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
++	}
++#endif
+ 	cputime_to_timespec(idletime, &idle);
+ 	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+ 			(unsigned long) uptime.tv_sec,
+@@ -126,29 +155,50 @@ static int uptime_read_proc(char *page, char **start, off_t off,
+ static int meminfo_read_proc(char *page, char **start, off_t off,
+ 				 int count, int *eof, void *data)
+ {
+-	struct sysinfo i;
++	struct meminfo mi;
+ 	int len;
+-	unsigned long committed;
+-	unsigned long allowed;
++	unsigned long dummy;
+ 	struct vmalloc_info vmi;
+-	long cached;
++
++	get_zone_counts(&mi.active, &mi.inactive, &dummy);
+ 
+ /*
+  * display in kilobytes.
+  */
+ #define K(x) ((x) << (PAGE_SHIFT - 10))
+-	si_meminfo(&i);
+-	si_swapinfo(&i);
+-	committed = atomic_long_read(&vm_committed_space);
+-	allowed = ((totalram_pages - hugetlb_total_pages())
++	si_meminfo(&mi.si);
++	si_swapinfo(&mi.si);
++	mi.committed_space = atomic_read(&vm_committed_space);
++	mi.swapcache = total_swapcache_pages;
++	mi.allowed = ((totalram_pages - hugetlb_total_pages())
+ 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+ 
+-	cached = global_page_state(NR_FILE_PAGES) -
+-			total_swapcache_pages - i.bufferram;
+-	if (cached < 0)
+-		cached = 0;
++	mi.cache = global_page_state(NR_FILE_PAGES) -
++			total_swapcache_pages - mi.si.bufferram;
++	if (mi.cache < 0)
++		mi.cache = 0;
+ 
+ 	get_vmalloc_info(&vmi);
++	mi.vmalloc_used = vmi.used >> PAGE_SHIFT;
++	mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT;
++	mi.vmalloc_total = VMALLOC_TOTAL >> PAGE_SHIFT;
++
++	mi.pi.nr_file_dirty = global_page_state(NR_FILE_DIRTY);
++	mi.pi.nr_writeback = global_page_state(NR_WRITEBACK);
++	mi.pi.nr_anon_pages = global_page_state(NR_ANON_PAGES);
++	mi.pi.nr_file_mapped = global_page_state(NR_FILE_MAPPED);
++	mi.pi.nr_slab_rec = global_page_state(NR_SLAB_RECLAIMABLE);
++	mi.pi.nr_slab_unrec = global_page_state(NR_SLAB_UNRECLAIMABLE);
++	mi.pi.nr_pagetable = global_page_state(NR_PAGETABLE);
++	mi.pi.nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS);
++	mi.pi.nr_bounce = global_page_state(NR_BOUNCE);
++	mi.pi.nr_writeback_temp = global_page_state(NR_WRITEBACK_TEMP);
++
++#ifdef CONFIG_BEANCOUNTERS
++	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi)
++			& NOTIFY_FAIL)
++		return -ENOMSG;
++#endif
+ 
+ 	/*
+ 	 * Tagged format, for easy grepping and expansion.
+@@ -185,38 +235,38 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
+ 		"VmallocTotal: %8lu kB\n"
+ 		"VmallocUsed:  %8lu kB\n"
+ 		"VmallocChunk: %8lu kB\n",
+-		K(i.totalram),
+-		K(i.freeram),
+-		K(i.bufferram),
+-		K(cached),
+-		K(total_swapcache_pages),
+-		K(global_page_state(NR_ACTIVE)),
+-		K(global_page_state(NR_INACTIVE)),
++		K(mi.si.totalram),
++		K(mi.si.freeram),
++		K(mi.si.bufferram),
++		K(mi.cache),
++		K(mi.swapcache),
++		K(mi.active),
++		K(mi.inactive),
+ #ifdef CONFIG_HIGHMEM
+-		K(i.totalhigh),
+-		K(i.freehigh),
+-		K(i.totalram-i.totalhigh),
+-		K(i.freeram-i.freehigh),
++		K(mi.si.totalhigh),
++		K(mi.si.freehigh),
++		K(mi.si.totalram-mi.si.totalhigh),
++		K(mi.si.freeram-mi.si.freehigh),
+ #endif
+-		K(i.totalswap),
+-		K(i.freeswap),
+-		K(global_page_state(NR_FILE_DIRTY)),
+-		K(global_page_state(NR_WRITEBACK)),
+-		K(global_page_state(NR_ANON_PAGES)),
+-		K(global_page_state(NR_FILE_MAPPED)),
+-		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+-				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+-		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+-		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+-		K(global_page_state(NR_PAGETABLE)),
+-		K(global_page_state(NR_UNSTABLE_NFS)),
+-		K(global_page_state(NR_BOUNCE)),
+-		K(global_page_state(NR_WRITEBACK_TEMP)),
+-		K(allowed),
+-		K(committed),
+-		(unsigned long)VMALLOC_TOTAL >> 10,
+-		vmi.used >> 10,
+-		vmi.largest_chunk >> 10
++		K(mi.si.totalswap),
++		K(mi.si.freeswap),
++		K(mi.pi.nr_file_dirty),
++		K(mi.pi.nr_writeback),
++		K(mi.pi.nr_anon_pages),
++		K(mi.pi.nr_file_mapped),
++		K(mi.pi.nr_slab_rec +
++				mi.pi.nr_slab_unrec),
++		K(mi.pi.nr_slab_rec),
++		K(mi.pi.nr_slab_unrec),
++		K(mi.pi.nr_pagetable),
++		K(mi.pi.nr_unstable_nfs),
++		K(mi.pi.nr_bounce),
++		K(mi.pi.nr_writeback_temp),
++		K(mi.allowed),
++		K(mi.committed_space),
++		K(mi.vmalloc_total),
++		K(mi.vmalloc_used),
++		K(mi.vmalloc_largest)
+ 		);
+ 
+ 		len += hugetlb_report_meminfo(page + len);
+@@ -472,25 +522,21 @@ static const struct file_operations proc_vmalloc_operations = {
+ };
+ #endif
+ 
+-static int show_stat(struct seq_file *p, void *v)
++static void show_stat_ve0(struct seq_file *p)
+ {
+ 	int i;
+-	unsigned long jif;
+ 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+ 	cputime64_t guest;
+ 	u64 sum = 0;
+-	struct timespec boottime;
+ 	unsigned int *per_irq_sum;
+ 
+ 	per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
+ 	if (!per_irq_sum)
+-		return -ENOMEM;
++		return;
+ 
+ 	user = nice = system = idle = iowait =
+ 		irq = softirq = steal = cputime64_zero;
+ 	guest = cputime64_zero;
+-	getboottime(&boottime);
+-	jif = boottime.tv_sec;
+ 
+ 	for_each_possible_cpu(i) {
+ 		int j;
+@@ -550,9 +596,85 @@ static int show_stat(struct seq_file *p, void *v)
+ 
+ 	for (i = 0; i < NR_IRQS; i++)
+ 		seq_printf(p, " %u", per_irq_sum[i]);
++	kfree(per_irq_sum);
++	seq_printf(p, "\nswap %lu %lu\n",
++			vm_events(PSWPIN), vm_events(PSWPOUT));
++}
++
++#ifdef CONFIG_VE
++static void show_stat_ve(struct seq_file *p, struct ve_struct *ve)
++{
++	int i;
++	u64 user, nice, system;
++	cycles_t idle, iowait;
++	cpumask_t ve_cpus;
++
++	ve_cpu_online_map(ve, &ve_cpus);
++
++	user = nice = system = idle = iowait = 0;
++	for_each_cpu_mask(i, ve_cpus) {
++		user += VE_CPU_STATS(ve, i)->user;
++		nice += VE_CPU_STATS(ve, i)->nice;
++		system += VE_CPU_STATS(ve, i)->system;
++		idle += ve_sched_get_idle_time(ve, i);
++		iowait += ve_sched_get_iowait_time(ve, i);
++	}
++
++	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 0\n",
++		(unsigned long long)cputime64_to_clock_t(user),
++		(unsigned long long)cputime64_to_clock_t(nice),
++		(unsigned long long)cputime64_to_clock_t(system),
++		(unsigned long long)cycles_to_clocks(idle),
++		(unsigned long long)cycles_to_clocks(iowait));
++
++	for_each_cpu_mask(i, ve_cpus) {
++		user = VE_CPU_STATS(ve, i)->user;
++		nice = VE_CPU_STATS(ve, i)->nice;
++		system = VE_CPU_STATS(ve, i)->system;
++		idle = ve_sched_get_idle_time(ve, i);
++		iowait = ve_sched_get_iowait_time(ve, i);
++		seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n",
++			i,
++			(unsigned long long)cputime64_to_clock_t(user),
++			(unsigned long long)cputime64_to_clock_t(nice),
++			(unsigned long long)cputime64_to_clock_t(system),
++			(unsigned long long)cycles_to_clocks(idle),
++			(unsigned long long)cycles_to_clocks(iowait));
++	}
++	seq_printf(p, "intr 0\nswap 0 0\n");
++}
++#endif
++
++int show_stat(struct seq_file *p, void *v)
++{
++	extern unsigned long total_forks;
++	unsigned long seq, jif;
++	struct ve_struct *env;
++	unsigned long __nr_running, __nr_iowait;
++
++	do {
++		seq = read_seqbegin(&xtime_lock);
++		jif = - wall_to_monotonic.tv_sec;
++		if (wall_to_monotonic.tv_nsec)
++			--jif;
++	} while (read_seqretry(&xtime_lock, seq));
++
++	env = get_exec_env();
++	if (ve_is_super(env)) {
++		show_stat_ve0(p);
++		__nr_running = nr_running();
++		__nr_iowait = nr_iowait();
++	}
++#ifdef CONFIG_VE
++	else {
++		show_stat_ve(p, env);
++		__nr_running = nr_running_ve(env);
++		__nr_iowait = nr_iowait_ve(env);
++	}
++#endif
+ 
+ 	seq_printf(p,
+-		"\nctxt %llu\n"
++		"ctxt %llu\n"
+ 		"btime %lu\n"
+ 		"processes %lu\n"
+ 		"procs_running %lu\n"
+@@ -560,10 +682,9 @@ static int show_stat(struct seq_file *p, void *v)
+ 		nr_context_switches(),
+ 		(unsigned long)jif,
+ 		total_forks,
+-		nr_running(),
+-		nr_iowait());
++		__nr_running,
++		__nr_iowait);
+ 
+-	kfree(per_irq_sum);
+ 	return 0;
+ }
+ 
+@@ -650,7 +771,8 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
+ {
+ 	int len;
+ 
+-	len = sprintf(page, "%s\n", saved_command_line);
++	len = sprintf(page, "%s\n",
++		ve_is_super(get_exec_env()) ? saved_command_line : "quiet");
+ 	return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+ 
+@@ -681,11 +803,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
+ 				   size_t count, loff_t *ppos)
+ {
+ 	if (count) {
+-		char c;
++		int i, cnt;
++		char c[32];
+ 
+-		if (get_user(c, buf))
++		cnt = min(count, sizeof(c));
++		if (copy_from_user(c, buf, cnt))
+ 			return -EFAULT;
+-		__handle_sysrq(c, NULL, 0);
++
++
++		for (i = 0; i < cnt && c[i] != '\n'; i++)
++			__handle_sysrq(c[i], NULL, 0);
+ 	}
+ 	return count;
+ }
+@@ -833,38 +960,39 @@ void __init proc_misc_init(void)
+ 	static struct {
+ 		char *name;
+ 		int (*read_proc)(char*,char**,off_t,int,int*,void*);
++		struct proc_dir_entry *parent;
+ 	} *p, simple_ones[] = {
+-		{"loadavg",     loadavg_read_proc},
+-		{"uptime",	uptime_read_proc},
+-		{"meminfo",	meminfo_read_proc},
+-		{"version",	version_read_proc},
++		{"loadavg",     loadavg_read_proc, &glob_proc_root},
++		{"uptime",	uptime_read_proc, &glob_proc_root},
++		{"meminfo",	meminfo_read_proc, &glob_proc_root},
++		{"version",	version_read_proc, &glob_proc_root},
+ #ifdef CONFIG_PROC_HARDWARE
+ 		{"hardware",	hardware_read_proc},
+ #endif
+ #ifdef CONFIG_STRAM_PROC
+ 		{"stram",	stram_read_proc},
+ #endif
+-		{"filesystems",	filesystems_read_proc},
+-		{"cmdline",	cmdline_read_proc},
++		{"filesystems",	filesystems_read_proc, &glob_proc_root},
++		{"cmdline",	cmdline_read_proc, &glob_proc_root},
+ 		{"execdomains",	execdomains_read_proc},
+ 		{NULL,}
+ 	};
+ 	for (p = simple_ones; p->name; p++)
+-		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);
++		create_proc_read_entry(p->name, 0, p->parent, p->read_proc, NULL);
+ 
+-	proc_symlink("mounts", NULL, "self/mounts");
++	proc_symlink("mounts", &glob_proc_root, "self/mounts");
+ 
+ 	/* And now for trickier ones */
+ #ifdef CONFIG_PRINTK
+ 	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+ #endif
+-	proc_create("locks", 0, NULL, &proc_locks_operations);
++	proc_create("locks", 0, &glob_proc_root, &proc_locks_operations);
+ 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
++	proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations);
+ #ifdef CONFIG_BLOCK
+ 	proc_create("partitions", 0, NULL, &proc_partitions_operations);
+ #endif
+-	proc_create("stat", 0, NULL, &proc_stat_operations);
++	proc_create("stat", 0, &glob_proc_root, &proc_stat_operations);
+ 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+ #ifdef CONFIG_SLABINFO
+ 	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
+@@ -877,13 +1005,13 @@ void __init proc_misc_init(void)
+ #endif
+ 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+ 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
+-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
++	proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations);
+ 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
+ #ifdef CONFIG_BLOCK
+ 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
+ #endif
+ #ifdef CONFIG_MODULES
+-	proc_create("modules", 0, NULL, &proc_modules_operations);
++	proc_create("modules", 0, &glob_proc_root, &proc_modules_operations);
+ #endif
+ #ifdef CONFIG_SCHEDSTATS
+ 	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
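
uptime_read_proc() above virtualizes uptime by subtracting the VE's start
timestamp from the host clock, and loadavg and /proc/stat switch to per-VE
counters in the same spirit. The timespec arithmetic standalone, with the
nanosecond borrow normalized the way set_normalized_timespec() would:

#include <stdio.h>

struct ts { long sec; long nsec; };

/* Container uptime = host uptime - VE start time, nsec kept in range. */
static struct ts ve_uptime(struct ts host, struct ts ve_start)
{
	struct ts up = { host.sec - ve_start.sec, host.nsec - ve_start.nsec };

	if (up.nsec < 0) {
		up.nsec += 1000000000L;
		up.sec -= 1;
	}
	return up;
}

int main(void)
{
	struct ts host = { 5000, 100 }, start = { 4200, 500 };
	struct ts up = ve_uptime(host, start);

	printf("%ld.%09ld\n", up.sec, up.nsec);	/* 799.999999600 */
	return 0;
}
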
+diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
+index 83f357b..108d432 100644
+--- a/fs/proc/proc_net.c
++++ b/fs/proc/proc_net.c
+@@ -90,7 +90,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
+ 	de = ERR_PTR(-ENOENT);
+ 	net = get_proc_task_net(dir);
+ 	if (net != NULL) {
+-		de = proc_lookup_de(net->proc_net, dir, dentry);
++		de = proc_lookup_de(net->proc_net, NULL, dir, dentry);
+ 		put_net(net);
+ 	}
+ 	return de;
+@@ -128,7 +128,8 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
+ 	ret = -EINVAL;
+ 	net = get_proc_task_net(filp->f_path.dentry->d_inode);
+ 	if (net != NULL) {
+-		ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
++		ret = proc_readdir_de(net->proc_net, NULL,
++				filp, dirent, filldir);
+ 		put_net(net);
+ 	}
+ 	return ret;
+@@ -203,7 +204,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
+ 
+ int __init proc_net_init(void)
+ {
+-	proc_symlink("net", NULL, "self/net");
++	proc_symlink("net", &glob_proc_root, "self/net");
+ 
+ 	return register_pernet_subsys(&proc_net_ns_ops);
+ }
+diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
+index 5acc001..37a80ee 100644
+--- a/fs/proc/proc_sysctl.c
++++ b/fs/proc/proc_sysctl.c
+@@ -442,7 +442,7 @@ static struct proc_dir_entry *proc_sys_root;
+ 
+ int proc_sys_init(void)
+ {
+-	proc_sys_root = proc_mkdir("sys", NULL);
++	proc_sys_root = proc_mkdir("sys", &glob_proc_root);
+ 	proc_sys_root->proc_iops = &proc_sys_inode_operations;
+ 	proc_sys_root->proc_fops = &proc_sys_file_operations;
+ 	proc_sys_root->nlink = 0;
+diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
+index 21f490f..dba8a27 100644
+--- a/fs/proc/proc_tty.c
++++ b/fs/proc/proc_tty.c
+@@ -13,6 +13,7 @@
+ #include <linux/stat.h>
+ #include <linux/tty.h>
+ #include <linux/seq_file.h>
++#include <linux/sched.h>
+ #include <linux/bitops.h>
+ 
+ /*
+@@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_file *m, void *v)
+ 	dev_t from = MKDEV(p->major, p->minor_start);
+ 	dev_t to = from + p->num;
+ 
++	if (!ve_accessible_strict(p->owner_env, get_exec_env()))
++		goto out;
++
+ 	if (&p->tty_drivers == tty_drivers.next) {
+ 		/* pseudo-drivers first */
+ 		seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
+@@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_file *m, void *v)
+ 	}
+ 	if (from != to)
+ 		show_tty_range(m, p, from, to - from);
++out:
+ 	return 0;
+ }
+ 
+diff --git a/fs/proc/root.c b/fs/proc/root.c
+index 9511753..e2390df 100644
+--- a/fs/proc/root.c
++++ b/fs/proc/root.c
+@@ -43,6 +43,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
+ 	struct super_block *sb;
+ 	struct pid_namespace *ns;
+ 	struct proc_inode *ei;
++#ifdef CONFIG_VE
++	struct vfsmount *proc_mnt = fs_type->owner_env->proc_mnt;
++#endif
+ 
+ 	if (proc_mnt) {
+ 		/* Seed the root directory with a pid so it doesn't need
+@@ -96,11 +99,12 @@ static void proc_kill_sb(struct super_block *sb)
+ 	put_pid_ns(ns);
+ }
+ 
+-static struct file_system_type proc_fs_type = {
++struct file_system_type proc_fs_type = {
+ 	.name		= "proc",
+ 	.get_sb		= proc_get_sb,
+ 	.kill_sb	= proc_kill_sb,
+ };
++EXPORT_SYMBOL(proc_fs_type);
+ 
+ void __init proc_root_init(void)
+ {
+@@ -110,6 +114,11 @@ void __init proc_root_init(void)
+ 	err = register_filesystem(&proc_fs_type);
+ 	if (err)
+ 		return;
++
++#ifdef CONFIG_VE
++	get_ve0()->proc_root = &proc_root;
++#endif
++
+ 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+ 	err = PTR_ERR(proc_mnt);
+ 	if (IS_ERR(proc_mnt)) {
+@@ -117,16 +126,20 @@ void __init proc_root_init(void)
+ 		return;
+ 	}
+ 
++#ifdef CONFIG_VE
++	get_ve0()->proc_mnt = proc_mnt;
++#endif
++
+ 	proc_misc_init();
+ 
+ 	proc_net_init();
+ 
+ #ifdef CONFIG_SYSVIPC
+-	proc_mkdir("sysvipc", NULL);
++	proc_mkdir("sysvipc", &glob_proc_root);
+ #endif
+-	proc_mkdir("fs", NULL);
++	proc_mkdir("fs", &glob_proc_root);
+ 	proc_mkdir("driver", NULL);
+-	proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
++	proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */
+ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
+ 	/* just give it a mountpoint */
+ 	proc_mkdir("openprom", NULL);
+@@ -211,6 +224,22 @@ struct proc_dir_entry proc_root = {
+ 	.parent		= &proc_root,
+ };
+ 
++#ifdef CONFIG_VE
++struct proc_dir_entry glob_proc_root = {
++	.low_ino	= PROC_ROOT_INO, 
++	.namelen	= 5, 
++	.name		= "/proc",
++	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
++	.nlink		= 2, 
++	.count		= ATOMIC_INIT(1),
++	.proc_iops	= &proc_root_inode_operations, 
++	.proc_fops	= &proc_root_operations,
++	.parent		= &glob_proc_root,
++};
++
++EXPORT_SYMBOL(glob_proc_root);
++#endif
++
+ int pid_ns_prepare_proc(struct pid_namespace *ns)
+ {
+ 	struct vfsmount *mnt;
+diff --git a/fs/quota.c b/fs/quota.c
+index db1cc9f..e4fe2a6 100644
+--- a/fs/quota.c
++++ b/fs/quota.c
+@@ -18,6 +18,7 @@
+ #include <linux/capability.h>
+ #include <linux/quotaops.h>
+ #include <linux/types.h>
++#include <linux/device_cgroup.h>
+ 
+ /* Check validity of generic quotactl commands */
+ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+@@ -81,11 +82,11 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
+ 	if (cmd == Q_GETQUOTA) {
+ 		if (((type == USRQUOTA && current->euid != id) ||
+ 		     (type == GRPQUOTA && !in_egroup_p(id))) &&
+-		    !capable(CAP_SYS_ADMIN))
++		    !capable(CAP_VE_SYS_ADMIN))
+ 			return -EPERM;
+ 	}
+ 	else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
+-		if (!capable(CAP_SYS_ADMIN))
++		if (!capable(CAP_VE_SYS_ADMIN))
+ 			return -EPERM;
+ 
+ 	return 0;
+@@ -132,10 +133,10 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
+ 	if (cmd == Q_XGETQUOTA) {
+ 		if (((type == XQM_USRQUOTA && current->euid != id) ||
+ 		     (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
+-		     !capable(CAP_SYS_ADMIN))
++		     !capable(CAP_VE_SYS_ADMIN))
+ 			return -EPERM;
+ 	} else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
+-		if (!capable(CAP_SYS_ADMIN))
++		if (!capable(CAP_VE_SYS_ADMIN))
+ 			return -EPERM;
+ 	}
+ 
+@@ -177,6 +178,8 @@ static void quota_sync_sb(struct super_block *sb, int type)
+ 			continue;
+ 		if (!sb_has_quota_enabled(sb, cnt))
+ 			continue;
++		if (!sb_dqopt(sb)->files[cnt])
++			continue;
+ 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
+ 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
+ 		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
+@@ -207,7 +210,7 @@ restart:
+ 		sb->s_count++;
+ 		spin_unlock(&sb_lock);
+ 		down_read(&sb->s_umount);
+-		if (sb->s_root && sb->s_qcop->quota_sync)
++		if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync)
+ 			quota_sync_sb(sb, type);
+ 		up_read(&sb->s_umount);
+ 		spin_lock(&sb_lock);
+@@ -338,6 +341,7 @@ static inline struct super_block *quotactl_block(const char __user *special)
+ 	struct block_device *bdev;
+ 	struct super_block *sb;
+ 	char *tmp = getname(special);
++	int error;
+ 
+ 	if (IS_ERR(tmp))
+ 		return ERR_CAST(tmp);
+@@ -345,6 +349,13 @@ static inline struct super_block *quotactl_block(const char __user *special)
+ 	putname(tmp);
+ 	if (IS_ERR(bdev))
+ 		return ERR_CAST(bdev);
++
++	error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL);
++	if (error) {
++		bdput(bdev);
++		return ERR_PTR(error);
++	}
++
+ 	sb = get_super(bdev);
+ 	bdput(bdev);
+ 	if (!sb)
+@@ -356,6 +367,215 @@ static inline struct super_block *quotactl_block(const char __user *special)
+ #endif
+ }
+ 
++#ifdef CONFIG_QUOTA_COMPAT
++
++#define QC_QUOTAON  0x0100	/* enable quotas */
++#define QC_QUOTAOFF 0x0200	/* disable quotas */
++/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now have other parameters */
++#define QC_SYNC     0x0600	/* sync disk copy of a filesystems quotas */
++#define QC_SETQLIM  0x0700	/* set limits */
++/* GETSTATS at 0x0800 is now longer... */
++#define QC_GETINFO  0x0900	/* get info about quotas - graces, flags... */
++#define QC_SETINFO  0x0A00	/* set info about quotas */
++#define QC_SETGRACE 0x0B00	/* set inode and block grace */
++#define QC_SETFLAGS 0x0C00	/* set flags for quota */
++#define QC_GETQUOTA 0x0D00	/* get limits and usage */
++#define QC_SETQUOTA 0x0E00	/* set limits and usage */
++#define QC_SETUSE   0x0F00	/* set usage */
++/* 0x1000 used by old RSQUASH */
++#define QC_GETSTATS 0x1100	/* get collected stats */
++
++struct compat_dqblk {
++	unsigned int dqb_ihardlimit;
++	unsigned int dqb_isoftlimit;
++	unsigned int dqb_curinodes;
++	unsigned int dqb_bhardlimit;
++	unsigned int dqb_bsoftlimit;
++	qsize_t dqb_curspace;
++	__kernel_time_t dqb_btime;
++	__kernel_time_t dqb_itime;
++};
++
++struct compat_dqinfo {
++	unsigned int dqi_bgrace;
++	unsigned int dqi_igrace;
++	unsigned int dqi_flags;
++	unsigned int dqi_blocks;
++	unsigned int dqi_free_blk;
++	unsigned int dqi_free_entry;
++};
++
++struct compat_dqstats {
++	__u32 lookups;
++	__u32 drops;
++	__u32 reads;
++	__u32 writes;
++	__u32 cache_hits;
++	__u32 allocated_dquots;
++	__u32 free_dquots;
++	__u32 syncs;
++	__u32 version;
++};
++
++asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
++static long compat_quotactl(unsigned int cmds, unsigned int type,
++		const char __user *special, qid_t id,
++		void __user *addr)
++{
++	struct super_block *sb;
++	long ret;
++
++	sb = NULL;
++	switch (cmds) {
++		case QC_QUOTAON:
++			return sys_quotactl(QCMD(Q_QUOTAON, type),
++					special, id, addr);
++
++		case QC_QUOTAOFF:
++			return sys_quotactl(QCMD(Q_QUOTAOFF, type),
++					special, id, addr);
++
++		case QC_SYNC:
++			return sys_quotactl(QCMD(Q_SYNC, type),
++					special, id, addr);
++
++		case QC_GETQUOTA: {
++			struct if_dqblk idq;
++			struct compat_dqblk cdq;
++
++			sb = quotactl_block(special);
++			ret = PTR_ERR(sb);
++			if (IS_ERR(sb))
++				break;
++			ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
++			if (ret)
++				break;
++			ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
++			if (ret)
++				break;
++			cdq.dqb_ihardlimit = idq.dqb_ihardlimit;
++			cdq.dqb_isoftlimit = idq.dqb_isoftlimit;
++			cdq.dqb_curinodes = idq.dqb_curinodes;
++			cdq.dqb_bhardlimit = idq.dqb_bhardlimit;
++			cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit;
++			cdq.dqb_curspace = idq.dqb_curspace;
++			cdq.dqb_btime = idq.dqb_btime;
++			cdq.dqb_itime = idq.dqb_itime;
++			ret = 0;
++			if (copy_to_user(addr, &cdq, sizeof(cdq)))
++				ret = -EFAULT;
++			break;
++		}
++
++		case QC_SETQUOTA:
++		case QC_SETUSE:
++		case QC_SETQLIM: {
++			struct if_dqblk idq;
++			struct compat_dqblk cdq;
++
++			sb = quotactl_block(special);
++			ret = PTR_ERR(sb);
++			if (IS_ERR(sb))
++				break;
++			ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id);
++			if (ret)
++				break;
++			ret = -EFAULT;
++			if (copy_from_user(&cdq, addr, sizeof(cdq)))
++				break;
++			idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
++			idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
++			idq.dqb_curinodes = cdq.dqb_curinodes;
++			idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
++			idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
++			idq.dqb_curspace = cdq.dqb_curspace;
++			idq.dqb_valid = 0;
++			if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
++				idq.dqb_valid |= QIF_LIMITS;
++			if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
++				idq.dqb_valid |= QIF_USAGE;
++			ret = sb->s_qcop->set_dqblk(sb, type, id, &idq);
++			break;
++		}
++
++		case QC_GETINFO: {
++			struct if_dqinfo iinf;
++			struct compat_dqinfo cinf;
++
++			sb = quotactl_block(special);
++			ret = PTR_ERR(sb);
++			if (IS_ERR(sb))
++				break;
++			ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
++			if (ret)
++				break;
++			ret = sb->s_qcop->get_info(sb, type, &iinf);
++			if (ret)
++				break;
++			cinf.dqi_bgrace = iinf.dqi_bgrace;
++			cinf.dqi_igrace = iinf.dqi_igrace;
++			cinf.dqi_flags = 0;
++			if (iinf.dqi_flags & DQF_INFO_DIRTY)
++				cinf.dqi_flags |= 0x0010;
++			cinf.dqi_blocks = 0;
++			cinf.dqi_free_blk = 0;
++			cinf.dqi_free_entry = 0;
++			ret = 0;
++			if (copy_to_user(addr, &cinf, sizeof(cinf)))
++				ret = -EFAULT;
++			break;
++		}
++
++		case QC_SETINFO:
++		case QC_SETGRACE:
++		case QC_SETFLAGS: {
++			struct if_dqinfo iinf;
++			struct compat_dqinfo cinf;
++
++			sb = quotactl_block(special);
++			ret = PTR_ERR(sb);
++			if (IS_ERR(sb))
++				break;
++			ret = check_quotactl_valid(sb, type, Q_SETINFO, id);
++			if (ret)
++				break;
++			ret = -EFAULT;
++			if (copy_from_user(&cinf, addr, sizeof(cinf)))
++				break;
++			iinf.dqi_bgrace = cinf.dqi_bgrace;
++			iinf.dqi_igrace = cinf.dqi_igrace;
++			iinf.dqi_flags = cinf.dqi_flags;
++			iinf.dqi_valid = 0;
++			if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
++				iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
++			if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
++				iinf.dqi_valid |= IIF_FLAGS;
++			ret = sb->s_qcop->set_info(sb, type, &iinf);
++			break;
++		}
++
++		case QC_GETSTATS: {
++			struct compat_dqstats stat;
++
++			memset(&stat, 0, sizeof(stat));
++			stat.version = 6*10000+5*100+0;
++			ret = 0;
++			if (copy_to_user(addr, &stat, sizeof(stat)))
++				ret = -EFAULT;
++			break;
++		}
++
++		default:
++			ret = -ENOSYS;
++			break;
++	}
++	if (sb && !IS_ERR(sb))
++		drop_super(sb);
++	return ret;
++}
++
++#endif
++
+ /*
+  * This is the system call interface. This communicates with
+  * the user-level programs. Currently this only supports diskquota
+@@ -371,6 +591,11 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
+ 	cmds = cmd >> SUBCMDSHIFT;
+ 	type = cmd & SUBCMDMASK;
+ 
++#ifdef CONFIG_QUOTA_COMPAT
++	if (cmds >= 0x0100 && cmds < 0x3000)
++		return compat_quotactl(cmds, type, special, id, addr);
++#endif
++
+ 	if (cmds != Q_SYNC || special) {
+ 		sb = quotactl_block(special);
+ 		if (IS_ERR(sb))
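
/*
 * A minimal sketch (editor's illustration, not part of the patch) of how the
 * legacy Virtuozzo command range is routed: sys_quotactl() splits the command
 * word exactly as QCMD() composes it, and any opcode in 0x0100..0x2fff is
 * diverted to compat_quotactl() above.  SUBCMDSHIFT, SUBCMDMASK and QCMD are
 * the stock <linux/quota.h> definitions; is_vz_compat_cmd() is hypothetical.
 */
#define SUBCMDSHIFT	8
#define SUBCMDMASK	0x00ff
#define QCMD(cmd, type)	(((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK))

static inline int is_vz_compat_cmd(unsigned int cmd)
{
	unsigned int cmds = cmd >> SUBCMDSHIFT;	/* opcode half of the word */

	return cmds >= 0x0100 && cmds < 0x3000;	/* legacy VZ opcode range */
}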
+diff --git a/fs/read_write.c b/fs/read_write.c
+index f0d1240..b9fbf10 100644
+--- a/fs/read_write.c
++++ b/fs/read_write.c
+@@ -21,6 +21,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+ 
++#include <bc/beancounter.h>
++
+ const struct file_operations generic_ro_fops = {
+ 	.llseek		= generic_file_llseek,
+ 	.read		= do_sync_read,
+@@ -350,6 +352,29 @@ static inline void file_pos_write(struct file *file, loff_t pos)
+ 	file->f_pos = pos;
+ }
+ 
++static inline void bc_acct_write(size_t bytes)
++{
++	struct user_beancounter *ub;
++
++	if (bytes > 0) {
++		ub = get_exec_ub();
++		ub_percpu_inc(ub, write);
++		ub_percpu_add(ub, wchar, bytes);
++	}
++}
++
++static inline void bc_acct_read(size_t bytes)
++{
++	struct user_beancounter *ub;
++
++	if (bytes > 0) {
++		ub = get_exec_ub();
++		ub_percpu_inc(ub, read);
++		ub_percpu_add(ub, rchar, bytes);
++	}
++}
++
++
+ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
+ {
+ 	struct file *file;
+@@ -362,6 +387,8 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
+ 		ret = vfs_read(file, buf, count, &pos);
+ 		file_pos_write(file, pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_read(ret);
+ 	}
+ 
+ 	return ret;
+@@ -379,6 +406,8 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co
+ 		ret = vfs_write(file, buf, count, &pos);
+ 		file_pos_write(file, pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_write(ret);
+ 	}
+ 
+ 	return ret;
+@@ -400,6 +429,8 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
+ 		if (file->f_mode & FMODE_PREAD)
+ 			ret = vfs_read(file, buf, count, &pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_read(ret);
+ 	}
+ 
+ 	return ret;
+@@ -421,6 +452,8 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
+ 		if (file->f_mode & FMODE_PWRITE)  
+ 			ret = vfs_write(file, buf, count, &pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_write(ret);
+ 	}
+ 
+ 	return ret;
+@@ -666,6 +699,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+ 		ret = vfs_readv(file, vec, vlen, &pos);
+ 		file_pos_write(file, pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_read(ret);
+ 	}
+ 
+ 	if (ret > 0)
+@@ -687,6 +722,8 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+ 		ret = vfs_writev(file, vec, vlen, &pos);
+ 		file_pos_write(file, pos);
+ 		fput_light(file, fput_needed);
++
++		bc_acct_write(ret);
+ 	}
+ 
+ 	if (ret > 0)
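
/*
 * Editor's illustration (not patch content): every read/write-family syscall
 * above charges the *execution* beancounter on success, so I/O issued from a
 * container is accounted to that container; one sys_read() returning 4096
 * bumps the per-cpu "read" counter by 1 and "rchar" by 4096.  A sketch of
 * summing such a counter over CPUs -- the ub_percpu field name below is an
 * assumption:
 */
static unsigned long ub_sum_rchar(struct user_beancounter *ub)
{
	unsigned long total = 0;
	int cpu;

	for_each_possible_cpu(cpu)	/* add up every CPU's private slot */
		total += per_cpu_ptr(ub->ub_percpu, cpu)->rchar;
	return total;
}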
+diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
+index c1add28..3ca5049 100644
+--- a/fs/reiserfs/namei.c
++++ b/fs/reiserfs/namei.c
+@@ -859,6 +859,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
+ 	INITIALIZE_PATH(path);
+ 	struct reiserfs_dir_entry de;
+ 
++	inode = dentry->d_inode;
++	DQUOT_INIT(inode);
++
+ 	/* we will be doing 2 balancings and update 2 stat data, we change quotas
+ 	 * of the owner of the directory and of the owner of the parent directory.
+ 	 * The quota structure is possibly deleted only on last iput => outside
+@@ -883,8 +886,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
+ 		goto end_rmdir;
+ 	}
+ 
+-	inode = dentry->d_inode;
+-
+ 	reiserfs_update_inode_transaction(inode);
+ 	reiserfs_update_inode_transaction(dir);
+ 
+@@ -947,6 +948,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
+ 	unsigned long savelink;
+ 
+ 	inode = dentry->d_inode;
++	DQUOT_INIT(inode);
+ 
+ 	/* in this transaction we can be doing at max two balancings and update
+ 	 * two stat datas, we change quotas of the owner of the directory and of
+@@ -1254,6 +1256,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ 
+ 	old_inode = old_dentry->d_inode;
+ 	new_dentry_inode = new_dentry->d_inode;
++	if (new_dentry_inode)
++		DQUOT_INIT(new_dentry_inode);
+ 
+ 	// make sure, that oldname still exists and points to an object we
+ 	// are going to rename
+diff --git a/fs/select.c b/fs/select.c
+index da0e882..e0eb1cd 100644
+--- a/fs/select.c
++++ b/fs/select.c
+@@ -27,6 +27,8 @@
+ 
+ #include <asm/uaccess.h>
+ 
++#include <bc/kmem.h>
++
+ struct poll_table_page {
+ 	struct poll_table_page * next;
+ 	struct poll_table_entry * entry;
+@@ -332,7 +334,8 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+ 	if (size > sizeof(stack_fds) / 6) {
+ 		/* Not enough space in on-stack array; must use kmalloc */
+ 		ret = -ENOMEM;
+-		bits = kmalloc(6 * size, GFP_KERNEL);
++		bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ?
++				GFP_KERNEL_UBC : GFP_KERNEL);
+ 		if (!bits)
+ 			goto out_nofds;
+ 	}
+@@ -678,7 +681,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
+ 
+ 		len = min(todo, POLLFD_PER_PAGE);
+ 		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
+-		walk = walk->next = kmalloc(size, GFP_KERNEL);
++		walk = walk->next = kmalloc(size, GFP_KERNEL_UBC);
+ 		if (!walk) {
+ 			err = -ENOMEM;
+ 			goto out_fds;
+@@ -710,7 +713,7 @@ out_fds:
+ 	return err;
+ }
+ 
+-static long do_restart_poll(struct restart_block *restart_block)
++long do_restart_poll(struct restart_block *restart_block)
+ {
+ 	struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
+ 	int nfds = restart_block->arg1;
+@@ -726,6 +729,7 @@ static long do_restart_poll(struct restart_block *restart_block)
+ 	}
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(do_restart_poll);
+ 
+ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+ 			long timeout_msecs)
+diff --git a/fs/seq_file.c b/fs/seq_file.c
+index 3f54dbd..4d8b86a 100644
+--- a/fs/seq_file.c
++++ b/fs/seq_file.c
+@@ -32,7 +32,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
+ 	struct seq_file *p = file->private_data;
+ 
+ 	if (!p) {
+-		p = kmalloc(sizeof(*p), GFP_KERNEL);
++		p = kmalloc(sizeof(*p), GFP_KERNEL_UBC);
+ 		if (!p)
+ 			return -ENOMEM;
+ 		file->private_data = p;
+@@ -87,7 +87,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+ 	m->version = file->f_version;
+ 	/* grab buffer if we didn't have one */
+ 	if (!m->buf) {
+-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
++		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
+ 		if (!m->buf)
+ 			goto Enomem;
+ 	}
+@@ -123,7 +123,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+ 			goto Fill;
+ 		m->op->stop(m, p);
+ 		kfree(m->buf);
+-		m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
++		m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
+ 		if (!m->buf)
+ 			goto Enomem;
+ 		m->count = 0;
+@@ -193,7 +193,7 @@ static int traverse(struct seq_file *m, loff_t offset)
+ 		return 0;
+ 	}
+ 	if (!m->buf) {
+-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
++		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
+ 		if (!m->buf)
+ 			return -ENOMEM;
+ 	}
+@@ -232,7 +232,7 @@ static int traverse(struct seq_file *m, loff_t offset)
+ Eoverflow:
+ 	m->op->stop(m, p);
+ 	kfree(m->buf);
+-	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
++	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
+ 	return !m->buf ? -ENOMEM : -EAGAIN;
+ }
+ 
+@@ -378,6 +378,8 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
+ 	if (m->count < m->size) {
+ 		char *s = m->buf + m->count;
+ 		char *p = d_path(path, s, m->size - m->count);
++		if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG)
++			return 0;
+ 		if (!IS_ERR(p)) {
+ 			s = mangle_path(s, p, esc);
+ 			if (s) {
+@@ -461,7 +463,7 @@ static void single_stop(struct seq_file *p, void *v)
+ int single_open(struct file *file, int (*show)(struct seq_file *, void *),
+ 		void *data)
+ {
+-	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
++	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC);
+ 	int res = -ENOMEM;
+ 
+ 	if (op) {
+@@ -505,7 +507,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
+ 	void *private;
+ 	struct seq_file *seq;
+ 
+-	private = kzalloc(psize, GFP_KERNEL);
++	private = kzalloc(psize, GFP_KERNEL_UBC);
+ 	if (private == NULL)
+ 		goto out;
+ 
+diff --git a/fs/simfs.c b/fs/simfs.c
+new file mode 100644
+index 0000000..366a3ed
+--- /dev/null
++++ b/fs/simfs.c
+@@ -0,0 +1,332 @@
++/*
++ *  fs/simfs.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/init.h>
++#include <linux/namei.h>
++#include <linux/err.h>
++#include <linux/module.h>
++#include <linux/mount.h>
++#include <linux/vzquota.h>
++#include <linux/statfs.h>
++#include <linux/virtinfo.h>
++#include <linux/faudit.h>
++#include <linux/genhd.h>
++#include <linux/reiserfs_fs.h>
++
++#include <asm/unistd.h>
++#include <asm/uaccess.h>
++
++#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb
++
++static struct super_operations sim_super_ops;
++
++static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry,
++		struct kstat *stat)
++{
++	struct super_block *sb;
++	struct inode *inode;
++
++	inode = dentry->d_inode;
++	if (!inode->i_op->getattr) {
++		generic_fillattr(inode, stat);
++		if (!stat->blksize) {
++			unsigned blocks;
++
++			sb = inode->i_sb;
++			blocks = (stat->size + sb->s_blocksize-1) >>
++				sb->s_blocksize_bits;
++			stat->blocks = (sb->s_blocksize / 512) * blocks;
++			stat->blksize = sb->s_blocksize;
++		}
++	} else {
++		int err;
++
++		err = inode->i_op->getattr(mnt, dentry, stat);
++		if (err)
++			return err;
++	}
++
++	sb = mnt->mnt_sb;
++	if (sb->s_op == &sim_super_ops)
++		stat->dev = sb->s_dev;
++	return 0;
++}
++
++static void quota_get_stat(struct super_block *sb, struct kstatfs *buf)
++{
++	int err;
++	struct dq_stat qstat;
++	struct virt_info_quota q;
++	long free_file, adj_file;
++	s64 blk, free_blk, adj_blk;
++	int bsize_bits;
++
++	q.super = sb;
++	q.qstat = &qstat;
++	err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q);
++	if (err != NOTIFY_OK)
++		return;
++
++	bsize_bits = ffs(buf->f_bsize) - 1;
++	
++	if (qstat.bsoftlimit > qstat.bcurrent)
++		free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits;
++	else
++		free_blk = 0;
++	/*
++	 * In the regular case, we always set buf->f_bfree and buf->f_blocks to
++	 * the values reported by quota.  In case of real disk space shortage,
++	 * we adjust the values.  We want this adjustment to look as if the
++	 * total disk space were reduced, not as if the usage were increased.
++	 *    -- SAW
++	 */
++	adj_blk = 0;
++	if (buf->f_bfree < free_blk)
++		adj_blk = free_blk - buf->f_bfree;
++	buf->f_bfree = free_blk - adj_blk;
++
++	if (free_blk < buf->f_bavail)
++		buf->f_bavail = free_blk;
++
++	blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk;
++	buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk;
++
++	free_file = qstat.isoftlimit - qstat.icurrent;
++	if (free_file < 0)
++		free_file = 0;
++	if (buf->f_type == REISERFS_SUPER_MAGIC)
++		/*
++		 * reiserfs doesn't initialize f_ffree and f_files values of
++		 * kstatfs because it doesn't have an inode limit.
++		 */
++		buf->f_ffree = free_file;
++	adj_file = 0;
++	if (buf->f_ffree < free_file)
++		adj_file = free_file - buf->f_ffree;
++	buf->f_ffree = free_file - adj_file;
++	buf->f_files = qstat.isoftlimit - adj_file;
++}
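
/*
 * Editor's worked example for the adjustment above (numbers invented): if
 * quota leaves free_blk = 1000 blocks but the host fs reports f_bfree = 400,
 * then adj_blk = 600, f_bfree stays at 400, and f_blocks shrinks by the same
 * 600 -- the shortage is presented as a smaller disk, not as higher usage,
 * so "used" still matches what quota actually accounted.
 */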
++
++static int sim_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++	int err;
++	struct super_block *lsb;
++	struct kstatfs statbuf;
++
++	err = 0;
++	if (sb->s_op != &sim_super_ops)
++		return 0;
++
++	memset(&statbuf, 0, sizeof(statbuf));
++	lsb = SIMFS_GET_LOWER_FS_SB(sb);
++
++	err = -ENOSYS;
++	if (lsb && lsb->s_op && lsb->s_op->statfs)
++		err = lsb->s_op->statfs(lsb->s_root, &statbuf);
++	if (err)
++		return err;
++
++	quota_get_stat(sb, &statbuf);
++
++	buf->f_files    = statbuf.f_files;
++	buf->f_ffree    = statbuf.f_ffree;
++	buf->f_blocks   = statbuf.f_blocks;
++	buf->f_bfree    = statbuf.f_bfree;
++	buf->f_bavail   = statbuf.f_bavail;
++	return 0;
++}
++
++static int sim_systemcall(struct vnotifier_block *me, unsigned long n,
++		void *d, int old_ret)
++{
++	int err;
++
++	switch (n) {
++	case VIRTINFO_FAUDIT_STAT: {
++		struct faudit_stat_arg *arg;
++
++		arg = (struct faudit_stat_arg *)d;
++		err = sim_getattr(arg->mnt, arg->dentry, arg->stat);
++		arg->err = err;
++		}
++		break;
++	case VIRTINFO_FAUDIT_STATFS: {
++		struct faudit_statfs_arg *arg;
++
++		arg = (struct faudit_statfs_arg *)d;
++		err = sim_statfs(arg->sb, arg->stat);
++		arg->err = err;
++		}
++		break;
++	default:
++		return old_ret;
++	}
++	return (err ? NOTIFY_BAD : NOTIFY_OK);
++}
++
++static struct inode *sim_quota_root(struct super_block *sb)
++{
++	return sb->s_root->d_inode;
++}
++
++/*
++ * NOTE: We need to set up the s_bdev field on the super block, since
++ * sys_quotactl() does lookup_bdev() and get_super(), which compare
++ * sb->s_bdev.  So this is a must if we want an unmodified sys_quotactl
++ * to work correctly on /dev/simfs inside a VE.
++ */
++static int sim_init_blkdev(struct super_block *sb)
++{
++	static struct hd_struct fake_hd;
++	struct block_device *blkdev;
++
++	blkdev = bdget(sb->s_dev);
++	if (blkdev == NULL)
++		return -ENOMEM;
++
++	blkdev->bd_part = &fake_hd;	/* required for bdev_read_only() */
++	sb->s_bdev = blkdev;
++
++	return 0;
++}
++
++static void sim_free_blkdev(struct super_block *sb)
++{
++	/* set bd_part back to NULL */
++	sb->s_bdev->bd_part = NULL;
++	bdput(sb->s_bdev);
++}
++
++static void sim_quota_init(struct super_block *sb)
++{
++	struct virt_info_quota viq;
++
++	viq.super = sb;
++	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq);
++}
++
++static void sim_quota_free(struct super_block *sb)
++{
++	struct virt_info_quota viq;
++
++	viq.super = sb;
++	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq);
++}
++
++static struct super_operations sim_super_ops = {
++	.get_quota_root	= sim_quota_root,
++};
++
++static int sim_fill_super(struct super_block *s, void *data)
++{
++	int err;
++	struct nameidata *nd;
++
++	err = set_anon_super(s, NULL);
++	if (err)
++		goto out;
++
++	err = 0;
++	nd = (struct nameidata *)data;
++	s->s_fs_info = mntget(nd->path.mnt);
++	s->s_root = dget(nd->path.dentry);
++	s->s_op = &sim_super_ops;
++out:
++	return err;
++}
++
++static int sim_get_sb(struct file_system_type *type, int flags,
++		const char *dev_name, void *opt, struct vfsmount *mnt)
++{
++	int err;
++	struct nameidata nd;
++	struct super_block *sb;
++
++	err = -EINVAL;
++	if (opt == NULL)
++		goto out;
++
++	err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++	if (err)
++		goto out;
++
++	sb = sget(type, NULL, sim_fill_super, &nd);
++	err = PTR_ERR(sb);
++	if (IS_ERR(sb))
++		goto out_path;
++
++	err = sim_init_blkdev(sb);
++	if (err)
++		goto out_killsb;
++
++	sim_quota_init(sb);
++
++	path_put(&nd.path);
++	return simple_set_mnt(mnt, sb);
++
++out_killsb:
++	up_write(&sb->s_umount);
++	deactivate_super(sb);
++out_path:
++	path_put(&nd.path);
++out:
++	return err;
++}
++
++static void sim_kill_sb(struct super_block *sb)
++{
++	dput(sb->s_root);
++	sb->s_root = NULL;
++	mntput((struct vfsmount *)(sb->s_fs_info));
++
++	sim_quota_free(sb);
++	sim_free_blkdev(sb);
++
++	kill_anon_super(sb);
++}
++
++static struct file_system_type sim_fs_type = {
++	.owner		= THIS_MODULE,
++	.name		= "simfs",
++	.get_sb		= sim_get_sb,
++	.kill_sb	= sim_kill_sb,
++	.fs_flags	= FS_MANGLE_PROC,
++};
++
++static struct vnotifier_block sim_syscalls = {
++	.notifier_call = sim_systemcall,
++};
++
++static int __init init_simfs(void)
++{
++	int err;
++
++	err = register_filesystem(&sim_fs_type);
++	if (err)
++		return err;
++
++	virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls);
++	return 0;
++}
++
++static void __exit exit_simfs(void)
++{
++	virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls);
++	unregister_filesystem(&sim_fs_type);
++}
++
++MODULE_AUTHOR("SWsoft <info at sw-soft.com>");
++MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System");
++MODULE_LICENSE("GPL v2");
++
++module_init(init_simfs);
++module_exit(exit_simfs);
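
/*
 * Editor's usage sketch (paths hypothetical): simfs ignores the device name
 * and takes the lower directory from the mount data, which sim_get_sb()
 * resolves with path_lookup() and pins with mntget()/dget():
 */
#include <sys/mount.h>

static int mount_ve_root(void)
{
	/* "/dev/simfs" is only a label; data carries the real lower path */
	return mount("/dev/simfs", "/vz/root/101", "simfs", 0,
		     "/vz/private/101/fs");
}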
+diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
+index e37fe4d..1992fc0 100644
+--- a/fs/smbfs/sock.c
++++ b/fs/smbfs/sock.c
+@@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *server)
+ 
+ 		VERBOSE("closing socket %p\n", sock);
+ 		sock->sk->sk_data_ready = server->data_ready;
++		sock->sk->sk_user_data = NULL;
+ 		server->sock_file = NULL;
+ 		fput(file);
+ 	}
+diff --git a/fs/stat.c b/fs/stat.c
+index 9cf41f7..4d53945 100644
+--- a/fs/stat.c
++++ b/fs/stat.c
+@@ -14,6 +14,7 @@
+ #include <linux/security.h>
+ #include <linux/syscalls.h>
+ #include <linux/pagemap.h>
++#include <linux/faudit.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	int retval;
++	struct faudit_stat_arg arg;
+ 
+ 	retval = security_inode_getattr(mnt, dentry);
+ 	if (retval)
+ 		return retval;
+ 
++	arg.mnt = mnt;
++	arg.dentry = dentry;
++	arg.stat = stat;
++	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg)
++			!= NOTIFY_DONE)
++		return arg.err;
++
+ 	if (inode->i_op->getattr)
+ 		return inode->i_op->getattr(mnt, dentry, stat);
+ 
+diff --git a/fs/super.c b/fs/super.c
+index 453877c..55ce500 100644
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -38,6 +38,7 @@
+ #include <linux/kobject.h>
+ #include <linux/mutex.h>
+ #include <linux/file.h>
++#include <linux/ve_proto.h>
+ #include <asm/uaccess.h>
+ #include "internal.h"
+ 
+@@ -72,13 +73,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
+ 		INIT_LIST_HEAD(&s->s_inodes);
+ 		init_rwsem(&s->s_umount);
+ 		mutex_init(&s->s_lock);
+-		lockdep_set_class(&s->s_umount, &type->s_umount_key);
++		lockdep_set_class(&s->s_umount,
++				&type->proto->s_umount_key);
+ 		/*
+ 		 * The locking rules for s_lock are up to the
+ 		 * filesystem. For example ext3fs has different
+ 		 * lock ordering than usbfs:
+ 		 */
+-		lockdep_set_class(&s->s_lock, &type->s_lock_key);
++		lockdep_set_class(&s->s_lock,
++				&type->proto->s_lock_key);
+ 		down_write(&s->s_umount);
+ 		s->s_count = S_BIAS;
+ 		atomic_set(&s->s_active, 1);
+@@ -303,7 +306,7 @@ void generic_shutdown_super(struct super_block *sb)
+ 			sop->put_super(sb);
+ 
+ 		/* Forget any remaining inodes */
+-		if (invalidate_inodes(sb)) {
++		if (invalidate_inodes_check(sb, 1)) {
+ 			printk("VFS: Busy inodes after unmount of %s. "
+ 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
+ 			   sb->s_id);
+@@ -532,17 +535,26 @@ rescan:
+ 	spin_unlock(&sb_lock);
+ 	return NULL;
+ }
++EXPORT_SYMBOL(user_get_super);
+ 
+ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
+ {
++	dev_t kdev;
+         struct super_block *s;
+         struct ustat tmp;
+         struct kstatfs sbuf;
+-	int err = -EINVAL;
++	int err;
++
++	kdev = new_decode_dev(dev);
++	err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ);
++	if (err)
++		goto out;
++
++	err = -EINVAL;
++	s = user_get_super(kdev);
++	if (s == NULL)
++		goto out;
+ 
+-        s = user_get_super(new_decode_dev(dev));
+-        if (s == NULL)
+-                goto out;
+ 	err = vfs_statfs(s->s_root, &sbuf);
+ 	drop_super(s);
+ 	if (err)
+@@ -684,6 +696,13 @@ void emergency_remount(void)
+ static struct idr unnamed_dev_idr;
+ static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+ 
++/* for compatibility with coreutils still unaware of new minor sizes */
++int unnamed_dev_majors[] = {
++	0, 144, 145, 146, 242, 243, 244, 245,
++	246, 247, 248, 249, 250, 251, 252, 253
++};
++EXPORT_SYMBOL(unnamed_dev_majors);
++
+ int set_anon_super(struct super_block *s, void *data)
+ {
+ 	int dev;
+@@ -701,13 +720,13 @@ int set_anon_super(struct super_block *s, void *data)
+ 	else if (error)
+ 		return -EAGAIN;
+ 
+-	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
++	if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) {
+ 		spin_lock(&unnamed_dev_lock);
+ 		idr_remove(&unnamed_dev_idr, dev);
+ 		spin_unlock(&unnamed_dev_lock);
+ 		return -EMFILE;
+ 	}
+-	s->s_dev = MKDEV(0, dev & MINORMASK);
++	s->s_dev = make_unnamed_dev(dev);
+ 	return 0;
+ }
+ 
+@@ -715,8 +734,9 @@ EXPORT_SYMBOL(set_anon_super);
+ 
+ void kill_anon_super(struct super_block *sb)
+ {
+-	int slot = MINOR(sb->s_dev);
++	int slot;
+ 
++	slot = unnamed_dev_idx(sb->s_dev);
+ 	generic_shutdown_super(sb);
+ 	spin_lock(&unnamed_dev_lock);
+ 	idr_remove(&unnamed_dev_idr, slot);
+diff --git a/fs/sync.c b/fs/sync.c
+index 228e17b..32ad4fc 100644
+--- a/fs/sync.c
++++ b/fs/sync.c
+@@ -14,6 +14,8 @@
+ #include <linux/quotaops.h>
+ #include <linux/buffer_head.h>
+ 
++#include <bc/beancounter.h>
++
+ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
+ 			SYNC_FILE_RANGE_WAIT_AFTER)
+ 
+@@ -38,7 +40,14 @@ static void do_sync(unsigned long wait)
+ 
+ asmlinkage long sys_sync(void)
+ {
++	struct user_beancounter *ub;
++
++	ub = get_exec_ub();
++	ub_percpu_inc(ub, sync);
++
+ 	do_sync(1);
++
++	ub_percpu_inc(ub, sync_done);
+ 	return 0;
+ }
+ 
+@@ -80,6 +89,7 @@ long do_fsync(struct file *file, int datasync)
+ 	int ret;
+ 	int err;
+ 	struct address_space *mapping = file->f_mapping;
++	struct user_beancounter *ub;
+ 
+ 	if (!file->f_op || !file->f_op->fsync) {
+ 		/* Why?  We can still call filemap_fdatawrite */
+@@ -87,6 +97,12 @@ long do_fsync(struct file *file, int datasync)
+ 		goto out;
+ 	}
+ 
++	ub = get_exec_ub();
++	if (datasync)
++		ub_percpu_inc(ub, fdsync);
++	else
++		ub_percpu_inc(ub, fsync);
++
+ 	ret = filemap_fdatawrite(mapping);
+ 
+ 	/*
+@@ -101,6 +117,11 @@ long do_fsync(struct file *file, int datasync)
+ 	err = filemap_fdatawait(mapping);
+ 	if (!ret)
+ 		ret = err;
++
++	if (datasync)
++		ub_percpu_inc(ub, fdsync_done);
++	else
++		ub_percpu_inc(ub, fsync_done);
+ out:
+ 	return ret;
+ }
+@@ -251,12 +272,16 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+ 			  loff_t endbyte, unsigned int flags)
+ {
+ 	int ret;
++	struct user_beancounter *ub;
+ 
+ 	if (!mapping) {
+ 		ret = -EINVAL;
+-		goto out;
++		goto out_noacct;
+ 	}
+ 
++	ub = get_exec_ub();
++	ub_percpu_inc(ub, frsync);
++
+ 	ret = 0;
+ 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
+ 		ret = wait_on_page_writeback_range(mapping,
+@@ -279,6 +304,8 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+ 					endbyte >> PAGE_CACHE_SHIFT);
+ 	}
+ out:
++	ub_percpu_inc(ub, frsync_done);
++out_noacct:
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(do_sync_mapping_range);
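
/*
 * Editor's illustration: the counters above come in started/completed pairs
 * (sync/sync_done, fsync/fsync_done, fdsync/fdsync_done, frsync/frsync_done),
 * so the number of operations currently in flight for a beancounter is just
 * the difference of a pair, e.g.
 *
 *	in_flight = ub_stat_sum(ub, fsync) - ub_stat_sum(ub, fsync_done);
 *
 * where ub_stat_sum() stands for whatever per-cpu summing helper the tree
 * provides (the name is assumed).
 */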
+diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
+index 006fc64..9aec999 100644
+--- a/fs/sysfs/bin.c
++++ b/fs/sysfs/bin.c
+@@ -177,6 +177,9 @@ static int open(struct inode * inode, struct file * file)
+ 	struct bin_buffer *bb = NULL;
+ 	int error;
+ 
++	if (!ve_sysfs_alowed())
++		return 0;
++
+ 	/* binary file operations requires both @sd and its parent */
+ 	if (!sysfs_get_active_two(attr_sd))
+ 		return -ENODEV;
+@@ -238,6 +241,9 @@ const struct file_operations bin_fops = {
+ 
+ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+ {
++	if (!ve_sysfs_alowed())
++		return 0;
++
+ 	BUG_ON(!kobj || !kobj->sd || !attr);
+ 
+ 	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+@@ -252,6 +258,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+ 
+ void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+ {
++	if (!ve_sysfs_alowed())
++		return;
+ 	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+ }
+ 
+diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
+index 8c0e4b9..38c93f2 100644
+--- a/fs/sysfs/dir.c
++++ b/fs/sysfs/dir.c
+@@ -478,6 +478,9 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
+ 	struct inode *inode;
+ 	struct dentry *dentry;
+ 
++	if (!ve_sysfs_alowed())
++		return;
++
+ 	inode = ilookup(sysfs_sb, sd->s_ino);
+ 	if (!inode)
+ 		return;
+@@ -649,12 +652,15 @@ int sysfs_create_dir(struct kobject * kobj)
+ 	struct sysfs_dirent *parent_sd, *sd;
+ 	int error = 0;
+ 
++	if (!ve_sysfs_alowed())
++		return 0;
++
+ 	BUG_ON(!kobj);
+ 
+ 	if (kobj->parent)
+ 		parent_sd = kobj->parent->sd;
+ 	else
+-		parent_sd = &sysfs_root;
++		parent_sd = ve_sysfs_root;
+ 
+ 	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+ 	if (!error)
+@@ -755,6 +761,9 @@ void sysfs_remove_dir(struct kobject * kobj)
+ {
+ 	struct sysfs_dirent *sd = kobj->sd;
+ 
++	if (!ve_sysfs_alowed())
++		return;
++
+ 	spin_lock(&sysfs_assoc_lock);
+ 	kobj->sd = NULL;
+ 	spin_unlock(&sysfs_assoc_lock);
+@@ -770,6 +779,9 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
+ 	const char *dup_name = NULL;
+ 	int error;
+ 
++	if (!ve_sysfs_alowed())
++		return 0;
++
+ 	mutex_lock(&sysfs_rename_mutex);
+ 
+ 	error = 0;
+@@ -838,7 +850,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
+ 
+ 	mutex_lock(&sysfs_rename_mutex);
+ 	BUG_ON(!sd->s_parent);
+-	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
++	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : ve_sysfs_root;
+ 
+ 	error = 0;
+ 	if (sd->s_parent == new_parent_sd)
+diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
+index e7735f6..fed6ceb 100644
+--- a/fs/sysfs/file.c
++++ b/fs/sysfs/file.c
+@@ -516,6 +516,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
+ 
+ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
+ {
++	if (!ve_sysfs_alowed())
++		return 0;
+ 	BUG_ON(!kobj || !kobj->sd || !attr);
+ 
+ 	return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
+@@ -612,6 +614,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
+ 
+ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
+ {
++	if (!ve_sysfs_alowed())
++		return;
+ 	sysfs_hash_and_remove(kobj->sd, attr->name);
+ }
+ 
+diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
+index eeba384..c302cbe 100644
+--- a/fs/sysfs/group.c
++++ b/fs/sysfs/group.c
+@@ -62,6 +62,8 @@ static int internal_create_group(struct kobject *kobj, int update,
+ 	struct sysfs_dirent *sd;
+ 	int error;
+ 
++	if (!ve_sysfs_alowed())
++		return 0;
+ 	BUG_ON(!kobj || (!update && !kobj->sd));
+ 
+ 	/* Updates may happen before the object has been instantiated */
+@@ -131,6 +133,9 @@ void sysfs_remove_group(struct kobject * kobj,
+ 	struct sysfs_dirent *dir_sd = kobj->sd;
+ 	struct sysfs_dirent *sd;
+ 
++	if (!ve_sysfs_alowed())
++		return;
++
+ 	if (grp->name) {
+ 		sd = sysfs_get_dirent(dir_sd, grp->name);
+ 		if (!sd) {
+diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
+index eb53c63..a09bfa5 100644
+--- a/fs/sysfs/inode.c
++++ b/fs/sysfs/inode.c
+@@ -20,8 +20,6 @@
+ #include <linux/sched.h>
+ #include "sysfs.h"
+ 
+-extern struct super_block * sysfs_sb;
+-
+ static const struct address_space_operations sysfs_aops = {
+ 	.readpage	= simple_readpage,
+ 	.write_begin	= simple_write_begin,
+diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
+index 14f0023..974bf82 100644
+--- a/fs/sysfs/mount.c
++++ b/fs/sysfs/mount.c
+@@ -15,6 +15,7 @@
+ #include <linux/fs.h>
+ #include <linux/mount.h>
+ #include <linux/pagemap.h>
++#include <linux/module.h>
+ #include <linux/init.h>
+ 
+ #include "sysfs.h"
+@@ -22,8 +23,11 @@
+ /* Random magic number */
+ #define SYSFS_MAGIC 0x62656572
+ 
+-static struct vfsmount *sysfs_mount;
++#ifndef CONFIG_VE
++struct vfsmount *sysfs_mount;
+ struct super_block * sysfs_sb = NULL;
++#endif
++
+ struct kmem_cache *sysfs_dir_cachep;
+ 
+ static const struct super_operations sysfs_ops = {
+@@ -39,6 +43,13 @@ struct sysfs_dirent sysfs_root = {
+ 	.s_ino		= 1,
+ };
+ 
++static void init_ve0_sysfs_root(void)
++{
++#ifdef CONFIG_VE
++	get_ve0()->_sysfs_root = &sysfs_root;
++#endif
++}
++
+ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
+ {
+ 	struct inode *inode;
+@@ -52,7 +63,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
+ 	sysfs_sb = sb;
+ 
+ 	/* get root inode, initialize and unlock it */
+-	inode = sysfs_get_inode(&sysfs_root);
++	inode = sysfs_get_inode(ve_sysfs_root);
+ 	if (!inode) {
+ 		pr_debug("sysfs: could not get root inode\n");
+ 		return -ENOMEM;
+@@ -65,7 +76,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
+ 		iput(inode);
+ 		return -ENOMEM;
+ 	}
+-	root->d_fsdata = &sysfs_root;
++	root->d_fsdata = ve_sysfs_root;
+ 	sb->s_root = root;
+ 	return 0;
+ }
+@@ -76,16 +87,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
+ 	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
+ }
+ 
+-static struct file_system_type sysfs_fs_type = {
++struct file_system_type sysfs_fs_type = {
+ 	.name		= "sysfs",
+ 	.get_sb		= sysfs_get_sb,
+ 	.kill_sb	= kill_anon_super,
+ };
+ 
++EXPORT_SYMBOL(sysfs_fs_type);
++
+ int __init sysfs_init(void)
+ {
+ 	int err = -ENOMEM;
+ 
++	init_ve0_sysfs_root();
+ 	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
+ 					      sizeof(struct sysfs_dirent),
+ 					      0, 0, NULL);
+diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
+index 817f596..1e3e7e7 100644
+--- a/fs/sysfs/symlink.c
++++ b/fs/sysfs/symlink.c
+@@ -33,10 +33,13 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
+ 	struct sysfs_addrm_cxt acxt;
+ 	int error;
+ 
++	if (!ve_sysfs_alowed())
++		return 0;
++
+ 	BUG_ON(!name);
+ 
+ 	if (!kobj)
+-		parent_sd = &sysfs_root;
++		parent_sd = ve_sysfs_root;
+ 	else
+ 		parent_sd = kobj->sd;
+ 
+@@ -89,8 +92,11 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
+ {
+ 	struct sysfs_dirent *parent_sd = NULL;
+ 
++	if (!ve_sysfs_alowed())
++		return;
++
+ 	if (!kobj)
+-		parent_sd = &sysfs_root;
++		parent_sd = ve_sysfs_root;
+ 	else
+ 		parent_sd = kobj->sd;
+ 
+diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
+index ce4e15f..11e8464 100644
+--- a/fs/sysfs/sysfs.h
++++ b/fs/sysfs/sysfs.h
+@@ -8,67 +8,17 @@
+  * This file is released under the GPLv2.
+  */
+ 
+-struct sysfs_open_dirent;
+-
+-/* type-specific structures for sysfs_dirent->s_* union members */
+-struct sysfs_elem_dir {
+-	struct kobject		*kobj;
+-	/* children list starts here and goes through sd->s_sibling */
+-	struct sysfs_dirent	*children;
+-};
+-
+-struct sysfs_elem_symlink {
+-	struct sysfs_dirent	*target_sd;
+-};
+-
+-struct sysfs_elem_attr {
+-	struct attribute	*attr;
+-	struct sysfs_open_dirent *open;
+-};
+-
+-struct sysfs_elem_bin_attr {
+-	struct bin_attribute	*bin_attr;
+-};
+-
+-/*
+- * sysfs_dirent - the building block of sysfs hierarchy.  Each and
+- * every sysfs node is represented by single sysfs_dirent.
+- *
+- * As long as s_count reference is held, the sysfs_dirent itself is
+- * accessible.  Dereferencing s_elem or any other outer entity
+- * requires s_active reference.
+- */
+-struct sysfs_dirent {
+-	atomic_t		s_count;
+-	atomic_t		s_active;
+-	struct sysfs_dirent	*s_parent;
+-	struct sysfs_dirent	*s_sibling;
+-	const char		*s_name;
+-
+-	union {
+-		struct sysfs_elem_dir		s_dir;
+-		struct sysfs_elem_symlink	s_symlink;
+-		struct sysfs_elem_attr		s_attr;
+-		struct sysfs_elem_bin_attr	s_bin_attr;
+-	};
+-
+-	unsigned int		s_flags;
+-	ino_t			s_ino;
+-	umode_t			s_mode;
+-	struct iattr		*s_iattr;
+-};
+-
+-#define SD_DEACTIVATED_BIAS		INT_MIN
+-
+-#define SYSFS_TYPE_MASK			0x00ff
+-#define SYSFS_DIR			0x0001
+-#define SYSFS_KOBJ_ATTR			0x0002
+-#define SYSFS_KOBJ_BIN_ATTR		0x0004
+-#define SYSFS_KOBJ_LINK			0x0008
+-#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
+-
+-#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
+-#define SYSFS_FLAG_REMOVED		0x0200
++#ifndef CONFIG_VE
++extern struct vfsmount *sysfs_mount;
++extern struct super_block *sysfs_sb;
++#define ve_sysfs_alowed()	1
++#else
++#include <linux/sched.h>
++#include <linux/ve.h>
++#define sysfs_mount		(get_exec_env()->sysfs_mnt)
++#define sysfs_sb		(get_exec_env()->sysfs_sb)
++#define ve_sysfs_alowed()	(sysfs_sb != NULL)
++#endif
+ 
+ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
+ {
+@@ -88,8 +38,12 @@ struct sysfs_addrm_cxt {
+ /*
+  * mount.c
+  */
++#ifdef CONFIG_VE
++#define ve_sysfs_root	(get_exec_env()->_sysfs_root)
++#else
+ extern struct sysfs_dirent sysfs_root;
+-extern struct super_block *sysfs_sb;
++#define ve_sysfs_root	(&sysfs_root)
++#endif
+ extern struct kmem_cache *sysfs_dir_cachep;
+ 
+ /*
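
/*
 * Editor's sketch of the per-VE indirection declared above (struct layout
 * assumed, field names taken from the macros): under CONFIG_VE the former
 * globals become members of the execution environment, so a kobject
 * registered inside a container lands in that container's sysfs tree -- or,
 * while sysfs_sb is still NULL, ve_sysfs_alowed() is 0 and registration is a
 * silent no-op.
 */
struct ve_struct_excerpt {			/* hypothetical excerpt */
	struct vfsmount		*sysfs_mnt;	/* per-VE sysfs mount */
	struct super_block	*sysfs_sb;	/* NULL until VE mounts sysfs */
	struct sysfs_dirent	*_sysfs_root;	/* per-VE root dirent */
};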
+diff --git a/fs/vzdq_file.c b/fs/vzdq_file.c
+new file mode 100644
+index 0000000..4d814d9
+--- /dev/null
++++ b/fs/vzdq_file.c
+@@ -0,0 +1,923 @@
++/*
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file implements Virtuozzo quota files as proc entries.
++ * It is required for the standard quota tools to work correctly, as they
++ * expect to find aquota.user and aquota.group files.
++ */
++
++#include <linux/ctype.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++#include <linux/sysctl.h>
++#include <linux/mount.h>
++#include <linux/mnt_namespace.h>
++#include <linux/quotaio_v2.h>
++#include <asm/uaccess.h>
++
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/vzdq_tree.h>
++#include <linux/vzquota.h>
++
++/* ----------------------------------------------------------------------
++ *
++ * File read operation
++ *
++ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c,
++ * perhaps) abuse vz_quota_sem.
++ * Taking a global semaphore for lengthy and user-controlled operations inside
++ * VPSs is not a good idea in general.
++ * In this case, the reasons for taking this semaphore are completely unclear,
++ * especially taking into account that the only function that has comments
++ * about the necessity to be called under this semaphore
++ * (create_proc_quotafile) is actually called OUTSIDE it.
++ *
++ * --------------------------------------------------------------------- */
++
++#define DQBLOCK_SIZE		1024
++#define DQUOTBLKNUM		21U
++#define DQTREE_DEPTH		4
++#define TREENUM_2_BLKNUM(num)	(((num) + 1) << 1)
++#define ISINDBLOCK(num)		((num)%2 != 0)
++#define FIRST_DATABLK	  	2  /* first even number */
++#define LAST_IND_LEVEL		(DQTREE_DEPTH - 1)
++#define CONVERT_LEVEL(level)	((level) * (QUOTAID_EBITS/QUOTAID_BBITS))
++#define GETLEVINDX(ind, lev)	(((ind) >> QUOTAID_BBITS*(lev)) \
++					& QUOTATREE_BMASK)
++
++#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH)
++#error xBITS and DQTREE_DEPTH do not correspond
++#endif
++
++#define BLOCK_NOT_FOUND	1
++
++/* data for quota file -- one per proc entry */
++struct quotatree_data {
++	struct list_head	list;
++	struct vz_quota_master	*qmblk;
++	int			type;	/* type of the tree */
++};
++
++/* serialized by vz_quota_sem */
++static LIST_HEAD(qf_data_head);
++
++static const u_int32_t vzquota_magics[] = V2_INITQMAGICS;
++static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS;
++static const char aquota_user[] = "aquota.user";
++static const char aquota_group[] = "aquota.group";
++
++
++static inline loff_t get_depoff(int depth)
++{
++	loff_t res = 1;
++	while (depth) {
++		res += (1 << ((depth - 1)*QUOTAID_EBITS + 1));
++		depth--;
++	}
++	return res;
++}
++
++static inline loff_t get_blknum(loff_t num, int depth)
++{
++	loff_t res;
++	res = (num << 1) + get_depoff(depth);
++	return res;
++}
++
++static int get_depth(loff_t num)
++{
++	int i;
++	for (i = 0; i < DQTREE_DEPTH; i++) {
++		if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1
++				|| num < get_depoff(i + 1)))
++			return i;
++	}
++	return -1;
++}
++
++static inline loff_t get_offset(loff_t num)
++{
++	loff_t res, tmp;
++
++	tmp = get_depth(num);
++	if (tmp < 0)
++		return -1;
++	num -= get_depoff(tmp);
++	BUG_ON(num < 0);
++	res = num >> 1;
++
++	return res;
++}
++
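/*
 * Editor's worked example for the block numbering above, assuming
 * QUOTAID_EBITS == 8 (the real value may differ).  get_depoff() yields the
 * first block number of each index level:
 *
 *	get_depoff(0) = 1
 *	get_depoff(1) = 1 + 2^1    = 3
 *	get_depoff(2) = 3 + 2^9    = 515
 *	get_depoff(3) = 515 + 2^17 = 131587
 *
 * so get_blknum(5, 1) = (5 << 1) + 3 = 13, and going back, get_depth(13)
 * recovers level 1 and get_offset(13) = (13 - 3) >> 1 = 5.
 */
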
++static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level)
++{
++	/* return maximum available block num */
++	return tree->levels[level].freenum;
++}
++
++static inline loff_t get_block_num(struct quotatree_tree *tree)
++{
++	loff_t ind_blk_num, quot_blk_num, max_ind, max_quot;
++
++	quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1);
++	max_quot = TREENUM_2_BLKNUM(quot_blk_num);
++	ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1));
++	max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL)
++		: get_blknum(ind_blk_num, 0);
++
++	return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1;
++}
++
++/* Fill in the quota file header (served to readers of the emulated file) */
++static int read_header(void *buf, struct quotatree_tree *tree,
++	struct dq_info *dq_ugid_info, int type)
++{
++	struct v2_disk_dqheader *dqh;
++	struct v2_disk_dqinfo *dq_disk_info;
++
++	dqh = buf;
++	dq_disk_info = buf + sizeof(struct v2_disk_dqheader);
++
++	dqh->dqh_magic = vzquota_magics[type];
++	dqh->dqh_version = vzquota_versions[type];
++
++	dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire;
++	dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire;
++	dq_disk_info->dqi_flags = 0;	/* no flags */
++	dq_disk_info->dqi_blocks = get_block_num(tree);
++	dq_disk_info->dqi_free_blk = 0;	/* first block in the file */
++	dq_disk_info->dqi_free_entry = FIRST_DATABLK;
++
++	return 0;
++}
++
++static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf)
++{
++	int i, j, lev_num;
++
++	lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1;
++	for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) {
++		struct quotatree_node *next, *parent;
++
++		parent = p;
++		next = p;
++		for (j = lev_num; j >= 0; j--) {
++			if (!next->blocks[GETLEVINDX(i,j)]) {
++				buf[i] = 0;
++				goto bad_branch;
++			}
++			parent = next;
++			next = next->blocks[GETLEVINDX(i,j)];
++		}
++		buf[i] = (depth == DQTREE_DEPTH - 1) ?
++			TREENUM_2_BLKNUM(parent->num)
++			: get_blknum(next->num, depth + 1);
++
++	bad_branch:
++		;
++	}
++
++	return 0;
++}
++
++/*
++ * Fill in an index block (into the supplied buffer)
++ * @buf has length 256*sizeof(u_int32_t) bytes
++ */
++static int read_index_block(int num, u_int32_t *buf,
++		struct quotatree_tree *tree)
++{
++	struct quotatree_node *p;
++	u_int32_t index;
++	loff_t off;
++	int depth, res;
++
++	res = BLOCK_NOT_FOUND; 
++	index = 0;
++	depth = get_depth(num);
++	off = get_offset(num);
++	if (depth < 0 || off < 0)
++		return -EINVAL;
++
++	list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh,
++			list) {
++		if (p->num >= off)
++			res = 0;
++		if (p->num != off)
++			continue;
++		get_block_child(depth, p, buf);
++		break;
++	}
++
++	return res;
++}
++
++static inline void convert_quot_format(struct v2_disk_dqblk *dq,
++		struct vz_quota_ugid *vzq)
++{
++	dq->dqb_id = vzq->qugid_id;
++	dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit;
++	dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit;
++	dq->dqb_curinodes = vzq->qugid_stat.icurrent;
++	dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE;
++	dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE;
++	dq->dqb_curspace = vzq->qugid_stat.bcurrent;
++	dq->dqb_btime = vzq->qugid_stat.btime;
++	dq->dqb_itime = vzq->qugid_stat.itime;
++}
++
++static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree)
++{
++	int res, i, entries = 0;
++	struct v2_disk_dqdbheader *dq_header;
++	struct quotatree_node *p;
++	struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader);
++
++	res = BLOCK_NOT_FOUND;
++	dq_header = buf;
++	memset(dq_header, 0, sizeof(*dq_header));
++
++	list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh),
++			list) {
++		if (TREENUM_2_BLKNUM(p->num) >= num)
++			res = 0;
++		if (TREENUM_2_BLKNUM(p->num) != num)
++			continue;
++
++		for (i = 0; i < QUOTATREE_BSIZE; i++) {
++			if (!p->blocks[i])
++				continue;
++			convert_quot_format(blk + entries,
++					(struct vz_quota_ugid *)p->blocks[i]);
++			entries++;
++			res = 0;
++		}
++		break;
++	}
++	dq_header->dqdh_entries = entries;
++
++	return res;
++}
++
++static int read_block(int num, void *buf, struct quotatree_tree *tree,
++	struct dq_info *dq_ugid_info, int magic)
++{
++	int res;
++
++	memset(buf, 0, DQBLOCK_SIZE);
++	if (!num)
++		res = read_header(buf, tree, dq_ugid_info, magic);
++	else if (ISINDBLOCK(num))
++		res = read_index_block(num, (u_int32_t*)buf, tree);
++	else
++		res = read_dquot(num, buf, tree);
++
++	return res;
++}
++
++/*
++ * FIXME: this function can handle quota files up to 2GB only.
++ */
++static int read_proc_quotafile(char *page, char **start, off_t off, int count,
++		int *eof, void *data)
++{
++	off_t blk_num, blk_off, buf_off;
++	char *tmp;
++	size_t buf_size;
++	struct quotatree_data *qtd;
++	struct quotatree_tree *tree;
++	struct dq_info *dqi;
++	int res;
++
++	*start = NULL;
++	tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL);
++	if (!tmp)
++		return -ENOMEM;
++
++	qtd = data;
++	down(&vz_quota_sem);
++	down(&qtd->qmblk->dq_sem);
++
++	res = 0;
++	tree = QUGID_TREE(qtd->qmblk, qtd->type);
++	if (!tree) {
++		*eof = 1;
++		goto out_dq;
++	}
++
++	dqi = &qtd->qmblk->dq_ugid_info[qtd->type];
++
++	buf_off = 0;
++	buf_size = count;
++	blk_num = off / DQBLOCK_SIZE;
++	blk_off = off % DQBLOCK_SIZE;
++
++	while (buf_size > 0) {
++		off_t len;
++
++		len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size);
++		res = read_block(blk_num, tmp, tree, dqi, qtd->type);
++		if (res < 0)
++			goto out_err;
++		if (res == BLOCK_NOT_FOUND) {
++			*eof = 1;
++			break;
++		} 
++		memcpy(page + buf_off, tmp + blk_off, len);
++
++		blk_num++;
++		buf_size -= len;
++		blk_off = 0;
++		buf_off += len;
++	}
++	res = buf_off;
++
++out_err:
++	*start += count;
++out_dq:
++	up(&qtd->qmblk->dq_sem);
++	up(&vz_quota_sem);
++	kfree(tmp);
++
++	return res;
++}
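
/*
 * Editor's worked example of the copy loop above (values invented): for
 * off = 1500 and count = 2048 the code computes blk_num = 1, blk_off = 476,
 * then copies 548 bytes (the tail of block 1), 1024 bytes (all of block 2)
 * and 476 bytes (the head of block 3) -- 2048 bytes in total, regenerating
 * each 1 KB block from the in-kernel quota tree as it goes.
 */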
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota/QID/aquota.* files
++ *
++ * FIXME: this code lacks serialization of read/readdir/lseek.
++ * However, this problem should only be fixed after the mainstream issue of
++ * what appears to be a non-atomic read and update of the file position in
++ * sys_read is resolved.
++ *
++ * --------------------------------------------------------------------- */
++
++static inline unsigned long vzdq_aquot_getino(dev_t dev)
++{
++	return 0xec000000UL + dev;
++}
++
++static inline dev_t vzdq_aquot_getidev(struct inode *inode)
++{
++	return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link;
++}
++
++static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev)
++{
++	PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev;
++}
++
++static ssize_t vzdq_aquotf_read(struct file *file,
++		char __user *buf, size_t size, loff_t *ppos)
++{
++	char *page;
++	size_t bufsize;
++	ssize_t l, l2, copied;
++	char *start;
++	struct inode *inode;
++	struct block_device *bdev;
++	struct super_block *sb;
++	struct quotatree_data data;
++	int eof, err;
++
++	err = -ENOMEM;
++	page = (char *)__get_free_page(GFP_KERNEL);
++	if (page == NULL)
++		goto out_err;
++
++	err = -ENODEV;
++	inode = file->f_dentry->d_inode;
++	bdev = bdget(vzdq_aquot_getidev(inode));
++	if (bdev == NULL)
++		goto out_err;
++	sb = get_super(bdev);
++	bdput(bdev);
++	if (sb == NULL)
++		goto out_err;
++	data.qmblk = vzquota_find_qmblk(sb);
++	data.type = PROC_I(inode)->fd - 1;
++	drop_super(sb);
++	if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD)
++		goto out_err;
++
++	copied = 0;
++	l = l2 = 0;
++	while (1) {
++		bufsize = min(size, (size_t)PAGE_SIZE);
++		if (bufsize <= 0)
++			break;
++
++		l = read_proc_quotafile(page, &start, *ppos, bufsize,
++				&eof, &data);
++		if (l <= 0)
++			break;
++
++		l2 = copy_to_user(buf, page, l);
++		copied += l - l2;
++		if (l2)
++			break;
++
++		buf += l;
++		size -= l;
++		*ppos += (unsigned long)start;
++		l = l2 = 0;
++	}
++
++	qmblk_put(data.qmblk);
++	free_page((unsigned long)page);
++	if (copied)
++		return copied;
++	else if (l2)		/* last copy_to_user failed */
++		return -EFAULT;
++	else			/* read error or EOF */
++		return l;
++
++out_err:
++	if (page != NULL)
++		free_page((unsigned long)page);
++	return err;
++}
++
++static struct file_operations vzdq_aquotf_file_operations = {
++	.read		= &vzdq_aquotf_read,
++};
++
++static struct inode_operations vzdq_aquotf_inode_operations = {
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota/QID directory
++ *
++ * --------------------------------------------------------------------- */
++
++static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler)
++{
++	loff_t n;
++	int err;
++
++	n = file->f_pos;
++	for (err = 0; !err; n++) {
++		/* ppc32 can't compare two long longs in a switch; it calls __cmpdi2() */
++		switch ((unsigned long)n) {
++		case 0:
++			err = (*filler)(data, ".", 1, n,
++					file->f_dentry->d_inode->i_ino,
++					DT_DIR);
++			break;
++		case 1:
++			err = (*filler)(data, "..", 2, n,
++					parent_ino(file->f_dentry), DT_DIR);
++			break;
++		case 2:
++			err = (*filler)(data, aquota_user,
++					sizeof(aquota_user)-1, n,
++					file->f_dentry->d_inode->i_ino
++								+ USRQUOTA + 1,
++					DT_REG);
++			break;
++		case 3:
++			err = (*filler)(data, aquota_group,
++					sizeof(aquota_group)-1, n,
++					file->f_dentry->d_inode->i_ino 
++								+ GRPQUOTA + 1,
++					DT_REG);
++			break;
++		default:
++			goto out;
++		}
++	}
++out:
++	file->f_pos = n;
++	return err;
++}
++
++struct vzdq_aquotq_lookdata {
++	dev_t dev;
++	int type;
++	struct vz_quota_master *qmblk;
++};
++
++static int vzdq_aquotq_looktest(struct inode *inode, void *data)
++{
++	struct vzdq_aquotq_lookdata *d;
++
++	d = data;
++	return inode->i_op == &vzdq_aquotf_inode_operations &&
++	       vzdq_aquot_getidev(inode) == d->dev &&
++	       PROC_I(inode)->fd == d->type + 1;
++}
++
++static int vzdq_aquotq_lookset(struct inode *inode, void *data)
++{
++	struct vzdq_aquotq_lookdata *d;
++	struct super_block *sb;
++	struct quotatree_data qtd;
++	struct quotatree_tree *tree;
++
++	d = data;
++	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++	inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1;
++	inode->i_mode = S_IFREG | S_IRUSR;
++	inode->i_uid = 0;
++	inode->i_gid = 0;
++	inode->i_nlink = 1;
++	inode->i_op = &vzdq_aquotf_inode_operations;
++	inode->i_fop = &vzdq_aquotf_file_operations;
++	PROC_I(inode)->fd = d->type + 1;
++	vzdq_aquot_setidev(inode, d->dev);
++
++	/* Setting size */
++	sb = user_get_super(d->dev);
++	if (sb == NULL)
++		return -ENODEV;
++	qtd.qmblk = vzquota_find_qmblk(sb);
++	drop_super(sb);
++
++	if (qtd.qmblk == NULL)
++		return -ESRCH;
++	if (qtd.qmblk == VZ_QUOTA_BAD)
++		return -EIO;
++
++	qtd.type = PROC_I(inode)->fd - 1;
++	tree = QUGID_TREE(qtd.qmblk, qtd.type);
++	inode->i_size = get_block_num(tree) * 1024;
++	return 0;
++}
++
++static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd)
++{
++	return 0;
++}
++
++static struct dentry_operations vzdq_aquotq_dentry_operations = {
++	.d_revalidate	= &vzdq_aquotq_revalidate,
++};
++
++static struct vz_quota_master *find_qmblk_by_dev(dev_t dev)
++{
++	struct super_block *sb;
++	struct vz_quota_master *qmblk;
++
++	qmblk = NULL;
++	sb = user_get_super(dev);
++	if (sb != NULL) {
++		qmblk = vzquota_find_qmblk(sb);
++		drop_super(sb);
++
++		if (qmblk == VZ_QUOTA_BAD)
++			qmblk = NULL;
++	}
++
++	return qmblk;
++}
++
++static struct dentry *vzdq_aquotq_lookup(struct inode *dir,
++		struct dentry *dentry,
++		struct nameidata *nd)
++{
++	struct inode *inode;
++	struct vzdq_aquotq_lookdata d;
++	int k;
++
++	if (dentry->d_name.len == sizeof(aquota_user)-1) {
++		if (memcmp(dentry->d_name.name, aquota_user,
++					sizeof(aquota_user)-1))
++			goto out;
++		k = USRQUOTA;
++	} else if (dentry->d_name.len == sizeof(aquota_group)-1) {
++		if (memcmp(dentry->d_name.name, aquota_group,
++					sizeof(aquota_group)-1))
++			goto out;
++		k = GRPQUOTA;
++	} else
++		goto out;
++	d.dev = vzdq_aquot_getidev(dir);
++	d.type = k;
++	d.qmblk = find_qmblk_by_dev(d.dev);
++	if (d.qmblk == NULL)
++		goto out;
++
++	inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1,
++			vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d);
++	if (inode == NULL)
++		goto out;
++	unlock_new_inode(inode);
++	dentry->d_op = &vzdq_aquotq_dentry_operations;
++	d_add(dentry, inode);
++	return NULL;
++
++out:
++	return ERR_PTR(-ENOENT);
++}
++
++static struct file_operations vzdq_aquotq_file_operations = {
++	.read		= &generic_read_dir,
++	.readdir	= &vzdq_aquotq_readdir,
++};
++
++static struct inode_operations vzdq_aquotq_inode_operations = {
++	.lookup		= &vzdq_aquotq_lookup,
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota directory
++ *
++ * --------------------------------------------------------------------- */
++
++struct vzdq_aquot_de {
++	struct list_head list;
++	struct vfsmount *mnt;
++};
++
++static int vzdq_aquot_buildmntlist(struct ve_struct *ve,
++		struct list_head *head)
++{
++	struct vfsmount *rmnt, *mnt;
++	struct vzdq_aquot_de *p;
++	int err;
++
++#ifdef CONFIG_VE
++	rmnt = mntget(ve->root_path.mnt);
++#else
++	read_lock(&current->fs->lock);
++	rmnt = mntget(current->fs->rootmnt);
++	read_unlock(&current->fs->lock);
++#endif
++	mnt = rmnt;
++	spin_lock(&vfsmount_lock);
++	while (1) {
++		list_for_each_entry(p, head, list) {
++			if (p->mnt->mnt_sb == mnt->mnt_sb)
++				goto skip;
++		}
++
++		err = -ENOMEM;
++		p = kmalloc(sizeof(*p), GFP_ATOMIC);
++		if (p == NULL)
++			goto out;
++		p->mnt = mntget(mnt);
++		list_add_tail(&p->list, head);
++
++skip:
++		err = 0;
++		if (list_empty(&mnt->mnt_mounts)) {
++			while (1) {
++				if (mnt == rmnt)
++					goto out;
++				if (mnt->mnt_child.next !=
++						&mnt->mnt_parent->mnt_mounts)
++					break;
++				mnt = mnt->mnt_parent;
++			}
++			mnt = list_entry(mnt->mnt_child.next,
++					struct vfsmount, mnt_child);
++		} else
++			mnt = list_entry(mnt->mnt_mounts.next,
++					struct vfsmount, mnt_child);
++	}
++out:
++	spin_unlock(&vfsmount_lock);
++	mntput(rmnt);
++	return err;
++}
++
++static void vzdq_aquot_releasemntlist(struct ve_struct *ve,
++		struct list_head *head)
++{
++	struct vzdq_aquot_de *p;
++
++	while (!list_empty(head)) {
++		p = list_entry(head->next, typeof(*p), list);
++		mntput(p->mnt);
++		list_del(&p->list);
++		kfree(p);
++	}
++}
++
++static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler)
++{
++	struct ve_struct *ve, *old_ve;
++	struct list_head mntlist;
++	struct vzdq_aquot_de *de;
++	struct super_block *sb;
++	struct vz_quota_master *qmblk;
++	loff_t i, n;
++	char buf[24];
++	int l, err;
++
++	i = 0;
++	n = file->f_pos;
++	ve = file->f_dentry->d_sb->s_type->owner_env;
++	old_ve = set_exec_env(ve);
++
++	INIT_LIST_HEAD(&mntlist);
++#ifdef CONFIG_VE
++	/*
++	 * The only reason for disabling readdir for the host system is that
++	 * this readdir can be slow and CPU-consuming with a large number of
++	 * VPSs (or just mount points).
++	 */
++	err = ve_is_super(ve);
++#else
++	err = 0;
++#endif
++	if (!err) {
++		err = vzdq_aquot_buildmntlist(ve, &mntlist);
++		if (err)
++			goto out_err;
++	}
++
++	if (i >= n) {
++		if ((*filler)(data, ".", 1, i,
++					file->f_dentry->d_inode->i_ino, DT_DIR))
++			goto out_fill;
++	}
++	i++;
++
++	if (i >= n) {
++		if ((*filler)(data, "..", 2, i,
++					parent_ino(file->f_dentry), DT_DIR))
++			goto out_fill;
++	}
++	i++;
++
++	list_for_each_entry (de, &mntlist, list) {
++		sb = de->mnt->mnt_sb;
++		if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL))
++			continue;
++
++		qmblk = vzquota_find_qmblk(sb);
++		if (qmblk == NULL || qmblk == VZ_QUOTA_BAD)
++			continue;
++
++		qmblk_put(qmblk);
++		i++;
++		if (i <= n)
++			continue;
++
++		l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev));
++		if ((*filler)(data, buf, l, i - 1,
++					vzdq_aquot_getino(sb->s_dev), DT_DIR))
++			break;
++	}
++
++out_fill:
++	err = 0;
++	file->f_pos = i;
++out_err:
++	vzdq_aquot_releasemntlist(ve, &mntlist);
++	(void)set_exec_env(old_ve);
++	return err;
++}
++
++static int vzdq_aquotd_looktest(struct inode *inode, void *data)
++{
++	return inode->i_op == &vzdq_aquotq_inode_operations &&
++	       vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data;
++}
++
++static int vzdq_aquotd_lookset(struct inode *inode, void *data)
++{
++	dev_t dev;
++
++	dev = (dev_t)(unsigned long)data;
++	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++	inode->i_ino = vzdq_aquot_getino(dev);
++	inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++	inode->i_uid = 0;
++	inode->i_gid = 0;
++	inode->i_nlink = 2;
++	inode->i_op = &vzdq_aquotq_inode_operations;
++	inode->i_fop = &vzdq_aquotq_file_operations;
++	vzdq_aquot_setidev(inode, dev);
++	return 0;
++}
++
++static struct dentry *vzdq_aquotd_lookup(struct inode *dir,
++		struct dentry *dentry,
++		struct nameidata *nd)
++{
++	struct ve_struct *ve, *old_ve;
++	const unsigned char *s;
++	int l;
++	dev_t dev;
++	struct inode *inode;
++
++	ve = dir->i_sb->s_type->owner_env;
++	old_ve = set_exec_env(ve);
++#ifdef CONFIG_VE
++	/*
++	 * Lookup is much lighter than readdir, so it can be allowed for the
++	 * host system.  But it would be strange to be able to do lookup only
++	 * without readdir...
++	 */
++	if (ve_is_super(ve))
++		goto out;
++#endif
++
++	dev = 0;
++	l = dentry->d_name.len;
++	if (l <= 0)
++		goto out;
++	for (s = dentry->d_name.name; l > 0; s++, l--) {
++		if (!isxdigit(*s))
++			goto out;
++		if (dev & ~(~0UL >> 4))
++			goto out;
++		dev <<= 4;
++		if (isdigit(*s))
++			dev += *s - '0';
++		else if (islower(*s))
++			dev += *s - 'a' + 10;
++		else
++			dev += *s - 'A' + 10;
++	}
++	dev = new_decode_dev(dev);
++
++	if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL))
++		goto out;
++
++	inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev),
++			vzdq_aquotd_looktest, vzdq_aquotd_lookset,
++			(void *)(unsigned long)dev);
++	if (inode == NULL)
++		goto out;
++	unlock_new_inode(inode);
++
++	d_add(dentry, inode);
++	(void)set_exec_env(old_ve);
++	return NULL;
++
++out:
++	(void)set_exec_env(old_ve);
++	return ERR_PTR(-ENOENT);
++}
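
/*
 * Editor's worked example: a directory entry named "00000802" parses to
 * 0x802; new_decode_dev(0x802) yields major 8, minor 2 (sda2 on a
 * conventional layout), so /proc/vz/vzaquota/00000802/ serves the emulated
 * aquota.user and aquota.group for that device, provided the VE has
 * quotactl permission on it.
 */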
++
++static struct file_operations vzdq_aquotd_file_operations = {
++	.read		= &generic_read_dir,
++	.readdir	= &vzdq_aquotd_readdir,
++};
++
++static struct inode_operations vzdq_aquotd_inode_operations = {
++	.lookup		= &vzdq_aquotd_lookup,
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Initialization and deinitialization
++ *
++ * --------------------------------------------------------------------- */
++static int fake_data;
++static struct ctl_table fake_table[] = {
++	{
++		.ctl_name	= CTL_UNNUMBERED,
++		.procname	= ".fake",
++		.mode		= 0600,
++		.proc_handler	= proc_dointvec,
++		.data		= &fake_data,
++		.maxlen		= sizeof(int),
++	},
++	{ }
++};
++
++static struct ctl_path fake_path[] = {
++	{ .ctl_name = CTL_FS, .procname = "fs", },
++	{ .ctl_name = FS_DQSTATS, .procname = "quota", },
++	{ }
++};
++
++/*
++ * FIXME: creation of proc entries here is unsafe with respect to module
++ * unloading.
++ */
++void vzaquota_init(void)
++{
++	struct proc_dir_entry *de;
++
++	de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR,
++			glob_proc_vz_dir, &vzdq_aquotd_file_operations);
++	if (de != NULL)
++		de->proc_iops = &vzdq_aquotd_inode_operations;
++	else
++		printk(KERN_WARNING "VZDQ: vz/vzaquota creation failed\n");
++
++	register_sysctl_glob_paths(fake_path, fake_table, 1);
++}
++
++void vzaquota_fini(void)
++{
++	remove_proc_entry("vz/vzaquota", NULL);
++}
+diff --git a/fs/vzdq_mgmt.c b/fs/vzdq_mgmt.c
+new file mode 100644
+index 0000000..a1e92e2
+--- /dev/null
++++ b/fs/vzdq_mgmt.c
+@@ -0,0 +1,754 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ */
++
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <asm/semaphore.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/dcache.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/writeback.h>
++#include <linux/gfp.h>
++#include <linux/module.h>
++#include <asm/uaccess.h>
++#include <linux/proc_fs.h>
++#include <linux/quota.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++
++
++/* ----------------------------------------------------------------------
++ * Switching quota on.
++ * --------------------------------------------------------------------- */
++
++/*
++ * check limits copied from user
++ */
++int vzquota_check_sane_limits(struct dq_stat *qstat)
++{
++	int err;
++
++	err = -EINVAL;
++
++	/* the soft limit must not exceed the hard limit */
++	if (qstat->bsoftlimit > qstat->bhardlimit)
++		goto out;
++
++	if (qstat->isoftlimit > qstat->ihardlimit)
++		goto out;
++
++	err = 0;
++out:
++	return err;
++}
++
++/*
++ * check usage values copied from user
++ */
++int vzquota_check_sane_values(struct dq_stat *qstat)
++{
++	int err;
++
++	err = -EINVAL;
++
++	/* expiration time must not be set if softlimit was not exceeded */
++	if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0)
++		goto out;
++
++	if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0)
++		goto out;
++
++	err = vzquota_check_sane_limits(qstat);
++out:
++	return err;
++}
++
++/*
++ * create new quota master block
++ * this function should:
++ *  - copy limits and usage parameters from the user buffer;
++ *  - allocate and initialize the quota block and insert it into the hash;
++ */
++static int vzquota_create(unsigned int quota_id,
++		struct vz_quota_stat __user *u_qstat, int compat)
++{
++	int err;
++	struct vz_quota_stat qstat;
++	struct vz_quota_master *qmblk;
++
++	down(&vz_quota_sem);
++
++	err = -EFAULT;
++	if (!compat) {
++		if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
++			goto out;
++	} else {
++#ifdef CONFIG_COMPAT
++		struct compat_vz_quota_stat cqstat;
++		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
++			goto out;
++		compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
++		compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
++#endif
++	}
++
++	err = -EINVAL;
++	if (quota_id == 0)
++		goto out;
++
++	if (vzquota_check_sane_values(&qstat.dq_stat))
++		goto out;
++	err = 0;
++	qmblk = vzquota_alloc_master(quota_id, &qstat);
++
++	if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */
++		err = PTR_ERR(qmblk);
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++/**
++ * vzquota_on - turn quota on
++ *
++ * This function should:
++ *  - find the directory entry and mount point for the quota root and
++ *    take a reference on each;
++ *  - find corresponding quota block and mark it with given path;
++ *  - check quota tree;
++ *  - initialize quota for the tree root.
++ */
++static int vzquota_on(unsigned int quota_id, const char __user *quota_root,
++					char __user *buf)
++{
++	int err;
++	struct nameidata nd;
++	struct vz_quota_master *qmblk;
++	struct super_block *dqsb;
++
++	dqsb = NULL;
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EBUSY;
++	if (qmblk->dq_state != VZDQ_STARTING)
++		goto out;
++
++	err = user_path_walk(quota_root, &nd);
++	if (err)
++		goto out;
++	/* init path must be a directory */
++	err = -ENOTDIR;
++	if (!S_ISDIR(nd.path.dentry->d_inode->i_mode))
++		goto out_path;
++
++	qmblk->dq_root_path = nd.path;
++	qmblk->dq_sb = nd.path.dentry->d_inode->i_sb;
++	err = vzquota_get_super(qmblk->dq_sb);
++	if (err)
++		goto out_super;
++
++	/*
++	 * Serialization with quota initialization and operations is performed
++	 * through generation check: generation is memorized before qmblk is
++	 * found and compared under inode_qmblk_lock with assignment.
++	 *
++	 * Note that the dentry tree is shrunk only for high-level logical
++	 * serialization, purely as a courtesy to the user: to have consistent
++	 * quota statistics, files should be closed etc. when quota is turned on.
++	 */
++	err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode,
++			qmblk, buf);
++	if (err)
++		goto out_init;
++	qmblk->dq_state = VZDQ_WORKING;
++
++	up(&vz_quota_sem);
++	return 0;
++
++out_init:
++	dqsb = qmblk->dq_sb;
++out_super:
++	/* clear for qmblk_put/quota_free_master */
++	qmblk->dq_sb = NULL;
++	qmblk->dq_root_path.dentry = NULL;
++	qmblk->dq_root_path.mnt = NULL;
++out_path:
++	path_put(&nd.path);
++out:
++	if (dqsb)
++		vzquota_put_super(dqsb);
++	up(&vz_quota_sem);
++	return err;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Switching quota off.
++ * --------------------------------------------------------------------- */
++
++/*
++ * destroy quota block by ID
++ */
++static int vzquota_destroy(unsigned int quota_id)
++{
++	int err;
++	struct vz_quota_master *qmblk;
++	struct path root;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EBUSY;
++	if (qmblk->dq_state == VZDQ_WORKING)
++		goto out; /* quota_off first */
++
++	list_del_init(&qmblk->dq_hash);
++	root = qmblk->dq_root_path;
++	qmblk->dq_root_path.dentry = NULL;
++	qmblk->dq_root_path.mnt = NULL;
++
++	if (qmblk->dq_sb)
++		vzquota_put_super(qmblk->dq_sb);
++	up(&vz_quota_sem);
++
++	qmblk_put(qmblk);
++	path_put(&root);
++	return 0;
++
++out:
++	up(&vz_quota_sem);
++	return err;
++}
++
++/**
++ * vzquota_off - turn quota off
++ */
++
++static int __vzquota_sync_list(struct list_head *lh,
++		struct vz_quota_master *qmblk,
++		enum writeback_sync_modes sync_mode)
++{
++	struct writeback_control wbc;
++	LIST_HEAD(list);
++	struct vz_quota_ilink *qlnk;
++	struct inode *inode;
++	int err, ret;
++
++	memset(&wbc, 0, sizeof(wbc));
++	wbc.sync_mode = sync_mode;
++
++	err = ret = 0;
++	while (!list_empty(lh)) {
++		if (need_resched()) {
++			inode_qmblk_unlock(qmblk->dq_sb);
++			schedule();
++			inode_qmblk_lock(qmblk->dq_sb);
++			continue;
++		}
++
++		qlnk = list_first_entry(lh, struct vz_quota_ilink, list);
++		list_move(&qlnk->list, &list);
++
++		inode = igrab(QLNK_INODE(qlnk));
++		if (!inode)
++			continue;
++
++		inode_qmblk_unlock(qmblk->dq_sb);
++
++		wbc.nr_to_write = LONG_MAX;
++		ret = sync_inode(inode, &wbc);
++		if (ret)
++			err = ret;
++		iput(inode);
++
++		inode_qmblk_lock(qmblk->dq_sb);
++	}
++
++	list_splice(&list, lh);
++	return err;
++}
++
++static int vzquota_sync_list(struct list_head *lh,
++		struct vz_quota_master *qmblk)
++{
++	(void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE);
++	return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL);
++}
++
++static int vzquota_sync_inodes(struct vz_quota_master *qmblk)
++{
++	int err;
++	LIST_HEAD(qlnk_list);
++
++	list_splice_init(&qmblk->dq_ilink_list, &qlnk_list);
++	err = vzquota_sync_list(&qlnk_list, qmblk);
++	if (!err && !list_empty(&qmblk->dq_ilink_list))
++		err = -EBUSY;
++	list_splice(&qlnk_list, &qmblk->dq_ilink_list);
++
++	return err;
++}
++
++static int vzquota_off(unsigned int quota_id, char __user *buf, int force)
++{
++	int err, ret;
++	struct vz_quota_master *qmblk;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EALREADY;
++	if (qmblk->dq_state != VZDQ_WORKING)
++		goto out;
++
++	inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */
++	ret = vzquota_sync_inodes(qmblk);
++	inode_qmblk_unlock(qmblk->dq_sb);
++
++	err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force);
++	if (err)
++		goto out;
++
++	err = ret;
++	/* vzquota_destroy will free resources */
++	qmblk->dq_state = VZDQ_STOPING;
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Other VZQUOTA ioctl's.
++ * --------------------------------------------------------------------- */
++
++/*
++ * this function should:
++ * - set new limits/buffer under quota master block lock
++ * - if the new soft limit is less than the usage, set the expiration time
++ * - no need to alloc ugid hash table - we'll do that on demand
++ */
++int vzquota_update_limit(struct dq_stat *_qstat,
++		struct dq_stat *qstat)
++{
++	int err;
++
++	err = -EINVAL;
++	if (vzquota_check_sane_limits(qstat))
++		goto out;
++
++	err = 0;
++
++	/* limits */
++	_qstat->bsoftlimit = qstat->bsoftlimit;
++	_qstat->bhardlimit = qstat->bhardlimit;
++	/*
++	 * If the soft limit is exceeded, administrator can override the moment
++	 * when the grace period for limit exceeding ends.
++	 * Specifying the moment may be useful if the soft limit is set to be
++	 * lower than the current usage.  In that case, if the grace period
++	 * end isn't specified, the grace period will start from the moment
++	 * of the first write operation.
++	 * There is a race with the user level.  The soft limit may already be
++	 * exceeded before the limit change, and the grace period end calculated
++	 * by the kernel will be overridden.  User level may check whether the
++	 * limit is already exceeded, but the check and set calls are not atomic.
++	 * This race isn't dangerous.  Under normal circumstances, the
++	 * difference between the grace period ends calculated by the kernel and
++	 * by the user level should be no greater than the difference between
++	 * the moments of the check and set calls, i.e. no bigger than the quota
++	 * timer resolution - 1 sec.
++	 */
++	if (qstat->btime != (time_t)0 &&
++			_qstat->bcurrent >= _qstat->bsoftlimit)
++		_qstat->btime = qstat->btime;
++
++	_qstat->isoftlimit = qstat->isoftlimit;
++	_qstat->ihardlimit = qstat->ihardlimit;
++	if (qstat->itime != (time_t)0 &&
++			_qstat->icurrent >= _qstat->isoftlimit)
++		_qstat->itime = qstat->itime;
++
++out:
++	return err;
++}
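++
++/*
++ * The btime/itime handling above in a nutshell: the caller-supplied
++ * grace period end is honoured only when it is non-zero and the soft
++ * limit is already exceeded; otherwise the kernel-computed value is
++ * kept.  A minimal sketch of that decision, with field meanings
++ * borrowed from struct dq_stat ("grace_end" is a hypothetical helper
++ * for illustration):
++ *
++ *	static time_t grace_end(time_t cur_btime, time_t new_btime,
++ *			__u64 bcurrent, __u64 bsoftlimit)
++ *	{
++ *		if (new_btime != (time_t)0 && bcurrent >= bsoftlimit)
++ *			return new_btime;	// administrator override
++ *		return cur_btime;		// kernel-computed value wins
++ *	}
++ */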
++
++/*
++ * set new quota limits.
++ * this function should:
++ *  copy new limits from user level
++ *  - find quota block
++ *  - set new limits and flags.
++ */
++static int vzquota_setlimit(unsigned int quota_id,
++		struct vz_quota_stat __user *u_qstat, int compat)
++{
++	int err;
++	struct vz_quota_stat qstat;
++	struct vz_quota_master *qmblk;
++
++	down(&vz_quota_sem); /* for hash list protection */
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EFAULT;
++	if (!compat) {
++		if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
++			goto out;
++	} else {
++#ifdef CONFIG_COMPAT
++		struct compat_vz_quota_stat cqstat;
++		if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
++			goto out;
++		compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
++		compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
++#endif
++	}
++
++	qmblk_data_write_lock(qmblk);
++	err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat);
++	if (err == 0)
++		qmblk->dq_info = qstat.dq_info;
++	qmblk_data_write_unlock(qmblk);
++
++out:
++	up(&vz_quota_sem);
++	return err;
++}
++
++/*
++ * get quota limits.
++ * very simple - just return stat buffer to user
++ */
++static int vzquota_getstat(unsigned int quota_id,
++		struct vz_quota_stat __user *u_qstat, int compat)
++{
++	int err;
++	struct vz_quota_stat qstat;
++	struct vz_quota_master *qmblk;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	qmblk_data_read_lock(qmblk);
++	/* copy whole buffer under lock */
++	memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat));
++	memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info));
++	qmblk_data_read_unlock(qmblk);
++
++	if (!compat)
++		err = copy_to_user(u_qstat, &qstat, sizeof(qstat));
++	else {
++#ifdef CONFIG_COMPAT
++		struct compat_vz_quota_stat cqstat;
++		dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat);
++		dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info);
++		err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat));
++#endif
++	}
++	if (err)
++		err = -EFAULT;
++
++out:
++	up(&vz_quota_sem);
++	return err;
++}
++
++/*
++ * This is a system call to turn per-VE disk quota on.
++ * Note this call is allowed to run ONLY from VE0
++ */
++long do_vzquotactl(int cmd, unsigned int quota_id,
++		struct vz_quota_stat __user *qstat, const char __user *ve_root,
++		int compat)
++{
++	int ret;
++	int force = 0;
++
++	ret = -EPERM;
++	/* access allowed only from root of VE0 */
++	if (!capable(CAP_SYS_RESOURCE) ||
++	    !capable(CAP_SYS_ADMIN))
++		goto out;
++
++	switch (cmd) {
++		case VZ_DQ_CREATE:
++			ret = vzquota_create(quota_id, qstat, compat);
++			break;
++		case VZ_DQ_DESTROY:
++			ret = vzquota_destroy(quota_id);
++			break;
++		case VZ_DQ_ON:
++			/*
++			 * qstat is just a pointer to a userspace buffer used
++			 * to store the busy files' path if vzquota_on fails
++			 */
++			ret = vzquota_on(quota_id, ve_root, (char *)qstat);
++			break;
++		case VZ_DQ_OFF_FORCED:
++			force = 1;
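++			/* fall through to VZ_DQ_OFF */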
++		case VZ_DQ_OFF:
++			/*
++			 * ve_root is just a pointer to a userspace buffer used
++			 * to store the busy files' path if vzquota_off fails
++			 */
++			ret = vzquota_off(quota_id, (char *)ve_root, force);
++			break;
++		case VZ_DQ_SETLIMIT:
++			ret = vzquota_setlimit(quota_id, qstat, compat);
++			break;
++		case VZ_DQ_GETSTAT:
++			ret = vzquota_getstat(quota_id, qstat, compat);
++			break;
++
++		default:
++			ret = -EINVAL;
++			goto out;
++	}
++
++out:
++	return ret;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Proc filesystem routines
++ * ---------------------------------------------------------------------*/
++
++#if defined(CONFIG_PROC_FS)
++
++#define QUOTA_UINT_LEN		15
++#define QUOTA_TIME_LEN_FMT_UINT	"%11u"
++#define QUOTA_NUM_LEN_FMT_UINT	"%15u"
++#define QUOTA_NUM_LEN_FMT_ULL	"%15Lu"
++#define QUOTA_TIME_LEN_FMT_STR	"%11s"
++#define QUOTA_NUM_LEN_FMT_STR	"%15s"
++#define QUOTA_PROC_MAX_LINE_LEN 2048
++
++/*
++ * prints /proc/ve_dq header line
++ */
++static int print_proc_header(char * buffer)
++{
++	return sprintf(buffer,
++		       "%-11s"
++		       QUOTA_NUM_LEN_FMT_STR
++		       QUOTA_NUM_LEN_FMT_STR
++		       QUOTA_NUM_LEN_FMT_STR
++		       QUOTA_TIME_LEN_FMT_STR
++		       QUOTA_TIME_LEN_FMT_STR
++		       "\n",
++		       "qid: path", 
++		       "usage", "softlimit", "hardlimit", "time", "expire");
++}
++
++/*
++ * prints proc master record id, dentry path
++ */
++static int print_proc_master_id(char * buffer, char * path_buf,
++		struct vz_quota_master * qp)
++{
++	char *path;
++	int over;
++
++	path = NULL;
++	switch (qp->dq_state) {
++		case VZDQ_WORKING:
++			if (!path_buf) {
++				path = "";
++				break;
++			}
++			path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE);
++			if (IS_ERR(path)) {
++				path = "";
++				break;
++			}
++			/* do not print large path, truncate it */
++			over = strlen(path) -
++				(QUOTA_PROC_MAX_LINE_LEN - 3 - 3 -
++				 	QUOTA_UINT_LEN);
++			if (over > 0) {
++				path += over - 3;
++				path[0] = path[1] = path[2] = '.';
++			}
++			break;
++		case VZDQ_STARTING:
++			path = "-- started --";
++			break;
++		case VZDQ_STOPING:
++			path = "-- stopped --";
++			break;
++	}
++
++	return sprintf(buffer, "%u: %s\n", qp->dq_id, path);
++}
++
++/*
++ * prints struct vz_quota_stat data
++ */
++static int print_proc_stat(char * buffer, struct dq_stat *qs,
++		struct dq_info *qi)
++{
++	return sprintf(buffer,
++		       "%11s"
++		       QUOTA_NUM_LEN_FMT_ULL
++		       QUOTA_NUM_LEN_FMT_ULL
++		       QUOTA_NUM_LEN_FMT_ULL
++		       QUOTA_TIME_LEN_FMT_UINT
++		       QUOTA_TIME_LEN_FMT_UINT
++		       "\n"
++		       "%11s"
++		       QUOTA_NUM_LEN_FMT_UINT
++		       QUOTA_NUM_LEN_FMT_UINT
++		       QUOTA_NUM_LEN_FMT_UINT
++		       QUOTA_TIME_LEN_FMT_UINT
++		       QUOTA_TIME_LEN_FMT_UINT
++		       "\n",
++		       "1k-blocks",
++		       (unsigned long long)qs->bcurrent >> 10,
++		       (unsigned long long)qs->bsoftlimit >> 10,
++		       (unsigned long long)qs->bhardlimit >> 10,
++		       (unsigned int)qs->btime,
++		       (unsigned int)qi->bexpire,
++		       "inodes",
++		       qs->icurrent,
++		       qs->isoftlimit,
++		       qs->ihardlimit,
++		       (unsigned int)qs->itime,
++		       (unsigned int)qi->iexpire);
++}
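++
++/*
++ * With the format macros above, the two helpers render /proc output
++ * that looks roughly like this (the id, path and numbers below are
++ * made up for illustration; columns are right-aligned to 15 and 11
++ * characters as per QUOTA_NUM_LEN_FMT_* and QUOTA_TIME_LEN_FMT_*):
++ *
++ *	qid: path            usage      softlimit      hardlimit       time     expire
++ *	1001: /vz/private/1001
++ *	  1k-blocks         524288        1048576        1153024          0     259200
++ *	     inodes          20000         200000         220000          0     259200
++ */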
++
++
++/*
++ * for /proc filesystem output
++ */
++static int vzquota_read_proc(char *page, char **start, off_t off, int count,
++			   int *eof, void *data)
++{
++	int len, i;
++	off_t printed = 0;
++	char *p = page;
++	struct vz_quota_master *qp;
++	struct vz_quota_ilink *ql2;
++	struct list_head *listp;
++	char *path_buf;
++
++	path_buf = (char*)__get_free_page(GFP_KERNEL);
++	if (path_buf == NULL)
++		return -ENOMEM;
++
++	len = print_proc_header(p);
++	printed += len;
++	if (off < printed) /* keep header in output */ {
++		*start = p + off;
++		p += len;
++	}
++
++	down(&vz_quota_sem);
++
++	/* traverse master hash table for all records */
++	for (i = 0; i < vzquota_hash_size; i++) {
++		list_for_each(listp, &vzquota_hash_table[i]) {
++			qp = list_entry(listp,
++					struct vz_quota_master, dq_hash);
++
++			/* Skip other VE's information if not root of VE0 */
++			if ((!capable(CAP_SYS_ADMIN) ||
++			     !capable(CAP_SYS_RESOURCE))) {
++				ql2 = INODE_QLNK(current->fs->root.dentry->d_inode);
++				if (ql2 == NULL || qp != ql2->qmblk)
++					continue;
++			}
++			/*
++			 * Now print the next record
++			 */
++			len = 0;
++			/* we print quotaid and path only in VE0 */
++			if (capable(CAP_SYS_ADMIN))
++				len += print_proc_master_id(p+len,path_buf, qp);
++			len += print_proc_stat(p+len, &qp->dq_stat,
++					&qp->dq_info);
++			printed += len;
++			/* skip unnecessary lines */
++			if (printed <= off)
++				continue;
++			p += len;
++			/* provide start offset */
++			if (*start == NULL)
++				*start = p + (off - printed);
++			/* have we printed all requested size? */
++			if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN ||
++			    (p - *start) >= count)
++				goto out;
++		}
++	}
++
++	*eof = 1; /* checked all hash */
++out:
++	up(&vz_quota_sem);
++
++	len = 0;
++	if (*start != NULL) {
++		len = (p - *start);
++		if (len > count)
++			len = count;
++	}
++
++	if (path_buf)
++		free_page((unsigned long) path_buf);
++
++	return len;
++}
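++
++/*
++ * vzquota_read_proc() above follows the classic read_proc contract:
++ * the callback is re-invoked with an offset and a byte count, fills the
++ * page, reports via *start where the requested window begins, and sets
++ * *eof once everything has been produced.  A minimal sketch of the same
++ * contract for a single short line ("my_read_proc" is a hypothetical
++ * name used only for illustration):
++ *
++ *	static int my_read_proc(char *page, char **start, off_t off,
++ *			int count, int *eof, void *data)
++ *	{
++ *		int len = sprintf(page, "hello\n");
++ *
++ *		if (off >= len) {
++ *			*eof = 1;
++ *			return 0;
++ *		}
++ *		*start = page + off;	// window into this buffer
++ *		if (len - off <= count)
++ *			*eof = 1;
++ *		return min_t(int, len - off, count);
++ *	}
++ */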
++
++/*
++ * Register procfs read callback
++ */
++int vzquota_proc_init(void)
++{
++	struct proc_dir_entry *de;
++
++	de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL);
++	if (de == NULL)
++		return -EBUSY;
++
++	de->read_proc = vzquota_read_proc;
++	de->data = NULL;
++	return 0;
++}
++
++void vzquota_proc_release(void)
++{
++	/* Unregister procfs read callback */
++	remove_proc_entry("vzquota", proc_vz_dir);
++}
++
++#endif
+diff --git a/fs/vzdq_ops.c b/fs/vzdq_ops.c
+new file mode 100644
+index 0000000..5eb7d84
+--- /dev/null
++++ b/fs/vzdq_ops.c
+@@ -0,0 +1,633 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ */
++
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <asm/semaphore.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/quota.h>
++#include <linux/vzquota.h>
++
++
++/* ----------------------------------------------------------------------
++ * Quota superblock operations - helper functions.
++ * --------------------------------------------------------------------- */
++
++static inline void vzquota_incr_inodes(struct dq_stat *dqstat,
++		unsigned long number)
++{
++	dqstat->icurrent += number;
++}
++
++static inline void vzquota_incr_space(struct dq_stat *dqstat,
++		__u64 number)
++{
++	dqstat->bcurrent += number;
++}
++
++static inline void vzquota_decr_inodes(struct dq_stat *dqstat,
++		unsigned long number)
++{
++	if (dqstat->icurrent > number)
++		dqstat->icurrent -= number;
++	else
++		dqstat->icurrent = 0;
++	if (dqstat->icurrent < dqstat->isoftlimit)
++		dqstat->itime = (time_t) 0;
++}
++
++static inline void vzquota_decr_space(struct dq_stat *dqstat,
++		__u64 number)
++{
++	if (dqstat->bcurrent > number)
++		dqstat->bcurrent -= number;
++	else
++		dqstat->bcurrent = 0;
++	if (dqstat->bcurrent < dqstat->bsoftlimit)
++		dqstat->btime = (time_t) 0;
++}
++
++/*
++ * TODO: print a better message via printk(), or use a /proc/vzquotamsg
++ * interface similar to /proc/kmsg
++ */
++static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag,
++		const char *fmt)
++{
++	if (dq_info->flags & flag) /* warning already printed for this
++				       masterblock */
++		return;
++	printk(fmt, dq_id);
++	dq_info->flags |= flag;
++}
++
++/*
++ * ignore_hardlimit -
++ *
++ * Intended to allow superuser of VE0 to overwrite hardlimits.
++ *
++ * ignore_hardlimit() has a very bad feature:
++ *
++ *	a writepage() operation for a writable mapping of a file with holes
++ *	may trigger get_block() with a wrong current and, as a consequence,
++ *	opens a possibility to overcommit hardlimits
++ */
++/* for the reason above, it is disabled now */
++static inline int ignore_hardlimit(struct dq_info *dqstat)
++{
++#if 0
++	return	ve_is_super(get_exec_env()) &&
++		capable(CAP_SYS_RESOURCE) &&
++		(dqstat->options & VZ_QUOTA_OPT_RSQUASH);
++#else
++	return 0;
++#endif
++}
++
++static int vzquota_check_inodes(struct dq_info *dq_info,
++		struct dq_stat *dqstat,
++		unsigned long number, int dq_id)
++{
++	if (number == 0)
++		return QUOTA_OK;
++
++	if (dqstat->icurrent + number > dqstat->ihardlimit &&
++	    !ignore_hardlimit(dq_info)) {
++		vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
++			   "VZ QUOTA: file hardlimit reached for id=%d\n");
++		return NO_QUOTA;
++	}
++
++	if (dqstat->icurrent + number > dqstat->isoftlimit) {
++		if (dqstat->itime == (time_t)0) {
++			vzquota_warn(dq_info, dq_id, 0,
++				"VZ QUOTA: file softlimit exceeded "
++				"for id=%d\n");
++			dqstat->itime = CURRENT_TIME_SECONDS +
++				dq_info->iexpire;
++		} else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
++			   !ignore_hardlimit(dq_info)) {
++			vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
++				"VZ QUOTA: file softlimit expired "
++				"for id=%d\n");
++			return NO_QUOTA;
++		}
++	}
++
++	return QUOTA_OK;
++}
++
++static int vzquota_check_space(struct dq_info *dq_info,
++		struct dq_stat *dqstat,
++		__u64 number, int dq_id, char prealloc)
++{
++	if (number == 0)
++		return QUOTA_OK;
++
++	if (prealloc == DQUOT_CMD_FORCE)
++		return QUOTA_OK;
++
++	if (dqstat->bcurrent + number > dqstat->bhardlimit &&
++	    !ignore_hardlimit(dq_info)) {
++		if (!prealloc)
++			vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
++				"VZ QUOTA: disk hardlimit reached "
++				"for id=%d\n");
++		return NO_QUOTA;
++	}
++
++	if (dqstat->bcurrent + number > dqstat->bsoftlimit) {
++		if (dqstat->btime == (time_t)0) {
++			if (!prealloc) {
++				vzquota_warn(dq_info, dq_id, 0,
++					"VZ QUOTA: disk softlimit exceeded "
++					"for id=%d\n");
++				dqstat->btime = CURRENT_TIME_SECONDS
++							+ dq_info->bexpire;
++			} else {
++				/*
++				 * Original Linux quota doesn't allow
++				 * preallocation to exceed softlimit so
++				 * exceeding will be always printed
++				 */
++				return NO_QUOTA;
++			}
++		} else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
++			   !ignore_hardlimit(dq_info)) {
++			if (!prealloc)
++				vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
++					"VZ QUOTA: disk quota "
++					"softlimit expired "
++					"for id=%d\n");
++			return NO_QUOTA;
++		}
++	}
++
++	return QUOTA_OK;
++}
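++
++/*
++ * Both checkers above implement the standard soft/hard limit pattern:
++ * the hard limit can never be crossed, while crossing the soft limit
++ * merely starts a grace timer and the allocation only starts failing
++ * once that timer expires.  The core decision, reduced to a sketch
++ * ("quota_check" is a hypothetical helper; "now" is the current time
++ * in seconds):
++ *
++ *	static int quota_check(__u64 cur, __u64 add, __u64 soft, __u64 hard,
++ *			time_t *grace_end, time_t grace_len, time_t now)
++ *	{
++ *		if (cur + add > hard)
++ *			return NO_QUOTA;		// never crossed
++ *		if (cur + add > soft) {
++ *			if (*grace_end == (time_t)0)
++ *				*grace_end = now + grace_len;	// start grace
++ *			else if (now >= *grace_end)
++ *				return NO_QUOTA;	// grace expired
++ *		}
++ *		return QUOTA_OK;
++ *	}
++ */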
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid[],
++		int type, unsigned long number)
++{
++	struct dq_info *dqinfo;
++	struct dq_stat *dqstat;
++
++	if (qugid[type] == NULL)
++		return QUOTA_OK;
++	if (qugid[type] == VZ_QUOTA_UGBAD)
++		return NO_QUOTA;
++
++	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
++		return QUOTA_OK;
++	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
++		return QUOTA_OK;
++	if (number == 0)
++		return QUOTA_OK;
++
++	dqinfo = &qmblk->dq_ugid_info[type];
++	dqstat = &qugid[type]->qugid_stat;
++
++	if (dqstat->ihardlimit != 0 &&
++	    dqstat->icurrent + number > dqstat->ihardlimit)
++		return NO_QUOTA;
++
++	if (dqstat->isoftlimit != 0 &&
++	    dqstat->icurrent + number > dqstat->isoftlimit) {
++		if (dqstat->itime == (time_t)0)
++			dqstat->itime = CURRENT_TIME_SECONDS +
++				dqinfo->iexpire;
++		else if (CURRENT_TIME_SECONDS >= dqstat->itime)
++			return NO_QUOTA;
++	}
++
++	return QUOTA_OK;
++}
++
++static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid[],
++		int type, __u64 number, char prealloc)
++{
++	struct dq_info *dqinfo;
++	struct dq_stat *dqstat;
++
++	if (prealloc == DQUOT_CMD_FORCE)
++		return QUOTA_OK;
++
++	if (qugid[type] == NULL)
++		return QUOTA_OK;
++	if (qugid[type] == VZ_QUOTA_UGBAD)
++		return NO_QUOTA;
++
++	if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
++		return QUOTA_OK;
++	if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
++		return QUOTA_OK;
++	if (number == 0)
++		return QUOTA_OK;
++
++	dqinfo = &qmblk->dq_ugid_info[type];
++	dqstat = &qugid[type]->qugid_stat;
++
++	if (dqstat->bhardlimit != 0 &&
++	    dqstat->bcurrent + number > dqstat->bhardlimit)
++		return NO_QUOTA;
++
++	if (dqstat->bsoftlimit != 0 &&
++	    dqstat->bcurrent + number > dqstat->bsoftlimit) {
++		if (dqstat->btime == (time_t)0) {
++			if (!prealloc)
++				dqstat->btime = CURRENT_TIME_SECONDS
++							+ dqinfo->bexpire;
++			else
++				/*
++				 * Original Linux quota doesn't allow
++				 * preallocation to exceed softlimit so
++				 * exceeding will be always printed
++				 */
++				return NO_QUOTA;
++		} else if (CURRENT_TIME_SECONDS >= dqstat->btime)
++			return NO_QUOTA;
++	}
++
++	return QUOTA_OK;
++}
++#endif
++
++/* ----------------------------------------------------------------------
++ * Quota superblock operations
++ * --------------------------------------------------------------------- */
++
++/*
++ * S_NOQUOTA note.
++ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for
++ *  - quota file (absent in our case)
++ *  - after explicit DQUOT_DROP (earlier than clear_inode) in functions like
++ *    filesystem-specific new_inode, before the inode gets outside links.
++ * For the latter case, the only quota operation where care about S_NOQUOTA
++ * might be required is vzquota_drop, but there S_NOQUOTA has already been
++ * checked in DQUOT_DROP().
++ * So, S_NOQUOTA may be ignored for now in the VZDQ code.
++ *
++ * The above note is not entirely correct.
++ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
++ * delete_inode if new_inode fails (for example, because of inode quota
++ * limits), so S_NOQUOTA check is needed in free_inode.
++ * This seems to be the dark corner of the current quota API.
++ */
++
++/*
++ * Initialize quota operations for the specified inode.
++ */
++static int vzquota_initialize(struct inode *inode, int type)
++{
++	vzquota_inode_init_call(inode);
++	return 0; /* ignored by caller */
++}
++
++/*
++ * Release quota for the specified inode.
++ */
++static int vzquota_drop(struct inode *inode)
++{
++	vzquota_inode_drop_call(inode);
++	return 0; /* ignored by caller */
++}
++
++/*
++ * Allocate block callback.
++ *
++ * If (prealloc), the disk quota exceeding warning is not printed.
++ * See the Linux quota code to find out why.
++ *
++ * Return:
++ *	QUOTA_OK == 0 on SUCCESS
++ *	NO_QUOTA == 1 if allocation should fail
++ */
++static int vzquota_alloc_space(struct inode *inode,
++			     qsize_t number, int prealloc)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++	int ret = QUOTA_OK;
++
++	qmblk = vzquota_inode_data(inode, &data);
++	if (qmblk == VZ_QUOTA_BAD)
++		return NO_QUOTA;
++	if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid * qugid[MAXQUOTAS];
++#endif
++
++		/* checking first */
++		ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat,
++				number, qmblk->dq_id, prealloc);
++		if (ret == NO_QUOTA)
++			goto no_quota;
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
++			ret = vzquota_check_ugid_space(qmblk, qugid,
++					cnt, number, prealloc);
++			if (ret == NO_QUOTA)
++				goto no_quota;
++		}
++		/* check ok, may increment */
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			if (qugid[cnt] == NULL)
++				continue;
++			vzquota_incr_space(&qugid[cnt]->qugid_stat, number);
++		}
++#endif
++		vzquota_incr_space(&qmblk->dq_stat, number);
++		vzquota_data_unlock(inode, &data);
++	}
++
++	inode_add_bytes(inode, number);
++	might_sleep();
++	return QUOTA_OK;
++
++no_quota:
++	vzquota_data_unlock(inode, &data);
++	return NO_QUOTA;
++}
++
++/*
++ * Allocate inodes callback.
++ *
++ * Return:
++ *	QUOTA_OK == 0 on SUCCESS
++ *	NO_QUOTA == 1 if allocation should fail
++ */
++static int vzquota_alloc_inode(const struct inode *inode, unsigned long number)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++	int ret = QUOTA_OK;
++
++	qmblk = vzquota_inode_data((struct inode *)inode, &data);
++	if (qmblk == VZ_QUOTA_BAD)
++		return NO_QUOTA;
++	if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid *qugid[MAXQUOTAS];
++#endif
++
++		/* checking first */
++		ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat,
++				number, qmblk->dq_id);
++		if (ret == NO_QUOTA)
++			goto no_quota;
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
++			ret = vzquota_check_ugid_inodes(qmblk, qugid,
++					cnt, number);
++			if (ret == NO_QUOTA)
++				goto no_quota;
++		}
++		/* check ok, may increment */
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			if (qugid[cnt] == NULL)
++				continue;
++			vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number);
++		}
++#endif
++		vzquota_incr_inodes(&qmblk->dq_stat, number);
++		vzquota_data_unlock((struct inode *)inode, &data);
++	}
++
++	might_sleep();
++	return QUOTA_OK;
++
++no_quota:
++	vzquota_data_unlock((struct inode *)inode, &data);
++	return NO_QUOTA;
++}
++
++/*
++ * Free space callback.
++ */
++static int vzquota_free_space(struct inode *inode, qsize_t number)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++
++	qmblk = vzquota_inode_data(inode, &data);
++	if (qmblk == VZ_QUOTA_BAD)
++		return NO_QUOTA; /* isn't checked by the caller */
++	if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid * qugid;
++#endif
++
++		vzquota_decr_space(&qmblk->dq_stat, number);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid = INODE_QLNK(inode)->qugid[cnt];
++			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++				continue;
++			vzquota_decr_space(&qugid->qugid_stat, number);
++		}
++#endif
++		vzquota_data_unlock(inode, &data);
++	}
++	inode_sub_bytes(inode, number);
++	might_sleep();
++	return QUOTA_OK;
++}
++
++/*
++ * Free inodes callback.
++ */
++static int vzquota_free_inode(const struct inode *inode, unsigned long number)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++
++	qmblk = vzquota_inode_data((struct inode *)inode, &data);
++	if (qmblk == VZ_QUOTA_BAD)
++		return NO_QUOTA;
++	if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid * qugid;
++#endif
++
++		vzquota_decr_inodes(&qmblk->dq_stat, number);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid = INODE_QLNK(inode)->qugid[cnt];
++			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++				continue;
++			vzquota_decr_inodes(&qugid->qugid_stat, number);
++		}
++#endif
++		vzquota_data_unlock((struct inode *)inode, &data);
++	}
++	might_sleep();
++	return QUOTA_OK;
++}
++
++void vzquota_inode_off(struct inode * inode)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++
++	/* The call is made through virtinfo, so the inode may be one
++	 * not controlled by vzquota.
++	 */
++	if (inode->i_sb->dq_op != &vz_quota_operations)
++		return;
++
++	qmblk = vzquota_inode_data(inode, &data);
++	if (qmblk == VZ_QUOTA_BAD)
++		return;
++
++	if (qmblk == NULL) {
++		/* Tricky place. If qmblk == NULL, it means that this inode
++		 * is not in an area controlled by vzquota (except for the rare
++		 * case of an already set S_NOQUOTA). But we have to set
++		 * S_NOQUOTA in any case, because vzquota can be turned
++		 * on later, when this inode would be invalid from the
++		 * viewpoint of vzquota.
++		 *
++		 * To be safe, we reacquire vzquota lock.
++		 */
++		inode_qmblk_lock(inode->i_sb);
++		inode->i_flags |= S_NOQUOTA;
++		inode_qmblk_unlock(inode->i_sb);
++		return;
++	} else {
++		loff_t bytes = inode_get_bytes(inode);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid * qugid;
++#endif
++
++		inode->i_flags |= S_NOQUOTA;
++
++		vzquota_decr_space(&qmblk->dq_stat, bytes);
++		vzquota_decr_inodes(&qmblk->dq_stat, 1);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid = INODE_QLNK(inode)->qugid[cnt];
++			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++				continue;
++			vzquota_decr_space(&qugid->qugid_stat, bytes);
++			vzquota_decr_inodes(&qugid->qugid_stat, 1);
++		}
++#endif
++
++		vzquota_data_unlock(inode, &data);
++
++		vzquota_inode_drop_call(inode);
++	}
++}
++
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++
++/*
++ * helper function for quota_transfer
++ * check that we can add inode to this quota_id
++ */
++static int vzquota_transfer_check(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid[],
++		unsigned int type, __u64 size)
++{
++	if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
++	    vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
++		return -1;
++	return 0;
++}
++
++int vzquota_transfer_usage(struct inode *inode,
++		int mask,
++		struct vz_quota_ilink *qlnk)
++{
++	struct vz_quota_ugid *qugid_old;
++	__u64 space;
++	int i;
++
++	space = inode_get_bytes(inode);
++	for (i = 0; i < MAXQUOTAS; i++) {
++		if (!(mask & (1 << i)))
++			continue;
++		/*
++		 * Do not permit chowning a file if its owner does not have
++		 * a ugid record. This might happen if we somehow exceeded
++		 * the UID/GID limit (e.g. set uglimit less than the number
++		 * of users).
++		 */
++		if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
++			return -1;
++		if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
++			return -1;
++	}
++
++	for (i = 0; i < MAXQUOTAS; i++) {
++		if (!(mask & (1 << i)))
++			continue;
++		qugid_old = INODE_QLNK(inode)->qugid[i];
++		vzquota_decr_space(&qugid_old->qugid_stat, space);
++		vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
++		vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
++		vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
++	}
++	return 0;
++}
++
++/*
++ * Transfer the inode between different user/group quotas.
++ */
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++	return vzquota_inode_transfer_call(inode, iattr) ?
++		NO_QUOTA : QUOTA_OK;
++}
++
++#else /* CONFIG_VZ_QUOTA_UGID */
++
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++	return QUOTA_OK;
++}
++
++#endif
++
++/*
++ * Called under the following semaphores:
++ *	old_d->d_inode->i_sb->s_vfs_rename_sem
++ *	old_d->d_inode->i_sem
++ *	new_d->d_inode->i_sem
++ * [not verified  --SAW]
++ */
++static int vzquota_rename(struct inode *inode,
++		struct inode *old_dir, struct inode *new_dir)
++{
++	return vzquota_rename_check(inode, old_dir, new_dir) ?
++		NO_QUOTA : QUOTA_OK;
++}
++
++/*
++ * Structure of superblock diskquota operations.
++ */
++struct dquot_operations vz_quota_operations = {
++	.initialize	= vzquota_initialize,
++	.drop		= vzquota_drop,
++	.alloc_space	= vzquota_alloc_space,
++	.alloc_inode	= vzquota_alloc_inode,
++	.free_space	= vzquota_free_space,
++	.free_inode	= vzquota_free_inode,
++	.transfer	= vzquota_transfer,
++	.rename		= vzquota_rename,
++};
+diff --git a/fs/vzdq_tree.c b/fs/vzdq_tree.c
+new file mode 100644
+index 0000000..f4f2152
+--- /dev/null
++++ b/fs/vzdq_tree.c
+@@ -0,0 +1,286 @@
++/*
++ *
++ * Copyright (C) 2005  SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo quota tree implementation
++ */
++
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/vzdq_tree.h>
++
++struct quotatree_tree *quotatree_alloc(void)
++{
++	int l;
++	struct quotatree_tree *tree;
++
++	tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL);
++	if (tree == NULL)
++		goto out;
++
++	for (l = 0; l < QUOTATREE_DEPTH; l++) {
++		INIT_LIST_HEAD(&tree->levels[l].usedlh);
++		INIT_LIST_HEAD(&tree->levels[l].freelh);
++		tree->levels[l].freenum = 0;
++	}
++	tree->root = NULL;
++	tree->leaf_num = 0;
++out:
++	return tree;
++}
++
++static struct quotatree_node *
++quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level,
++		struct quotatree_find_state *st)
++{
++	void **block;
++	struct quotatree_node *parent;
++	int l, index;
++
++	parent = NULL;
++	block = (void **)&tree->root;
++	l = 0;
++	while (l < level && *block != NULL) {
++		index = (id >>  QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
++		parent = *block;
++		block = parent->blocks + index;
++		l++;
++	}
++	if (st != NULL) {
++		st->block = block;
++		st->level = l;
++	}
++
++	return parent;
++}
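++
++/*
++ * quotatree_follow() above walks a fixed-depth radix tree: each level
++ * consumes QUOTAID_BBITS bits of the quota id, highest bits first.
++ * Assuming, purely for illustration, 8 bits per level and a depth of 4
++ * (the real constants live in vzdq_tree.h), the per-level slot would be
++ * derived like this:
++ *
++ *	#define BBITS		8
++ *	#define DEPTH		4
++ *	#define BMASK		((1u << BBITS) - 1)
++ *	#define BSHIFT(l)	((DEPTH - (l) - 1) * BBITS)
++ *
++ *	// id 0x12345678 visits slots 0x12, 0x34, 0x56, 0x78
++ *	static unsigned int slot(unsigned int id, int level)
++ *	{
++ *		return (id >> BSHIFT(level)) & BMASK;
++ *	}
++ */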
++
++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
++		struct quotatree_find_state *st)
++{
++	quotatree_follow(tree, id, QUOTATREE_DEPTH, st);
++	if (st->level == QUOTATREE_DEPTH)
++		return *st->block;
++	else
++		return NULL;
++}
++
++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index)
++{
++	int i, count;
++	struct quotatree_node *p;
++	void *leaf;
++
++	if (QTREE_LEAFNUM(tree) <= index)
++		return NULL;
++
++	count = 0;
++	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
++		for (i = 0; i < QUOTATREE_BSIZE; i++) {	
++			leaf = p->blocks[i];
++			if (leaf == NULL)
++				continue;
++			if (count == index)
++				return leaf;
++			count++;
++		}
++	}
++	return NULL;
++}
++
++/* returns the data leaf (vz_quota_ugid) after the _existing_ ugid (@id)
++ * in the tree... */
++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
++{
++	int off;
++	struct quotatree_node *parent, *p;
++	struct list_head *lh;
++
++	/* get the parent referring to the correct last-level quota tree node */
++	parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
++	if (!parent)
++		return NULL;
++
++	off = (id & QUOTATREE_BMASK) + 1;	/* next ugid */
++	lh = &parent->list;
++	do {
++		p = list_entry(lh, struct quotatree_node, list);
++		for ( ; off < QUOTATREE_BSIZE; off++)
++			if (p->blocks[off])
++				return p->blocks[off];
++		off = 0;
++		lh = lh->next;
++	} while (lh != &QTREE_LEAFLVL(tree)->usedlh);
++
++	return NULL;
++}
++
++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
++		struct quotatree_find_state *st, void *data)
++{
++	struct quotatree_node *p;
++	int l, index;
++
++	while (st->level < QUOTATREE_DEPTH) {
++		l = st->level;
++		if (!list_empty(&tree->levels[l].freelh)) {
++			p = list_entry(tree->levels[l].freelh.next,
++					struct quotatree_node, list);
++			list_del(&p->list);
++		} else {
++			p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL);
++			if (p == NULL)
++				return -ENOMEM;
++			/* save the block number at the l-th level;
++			 * it is used for quota file generation */
++			p->num = tree->levels[l].freenum++;
++		}
++		list_add(&p->list, &tree->levels[l].usedlh);
++		memset(p->blocks, 0, sizeof(p->blocks));
++		*st->block = p;
++
++		index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
++		st->block = p->blocks + index;
++		st->level++;
++	}
++	tree->leaf_num++;
++	*st->block = data;
++
++	return 0;
++}
++
++static struct quotatree_node *
++quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
++		int level)
++{
++	struct quotatree_node *parent;
++	struct quotatree_find_state st;
++
++	parent = quotatree_follow(tree, id, level, &st);
++	if (st.level == QUOTATREE_DEPTH)
++		tree->leaf_num--;
++	*st.block = NULL;
++	return parent;
++}
++
++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
++{
++	struct quotatree_node *p;
++	int level, i;
++
++	p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
++	for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
++		for (i = 0; i < QUOTATREE_BSIZE; i++)
++			if (p->blocks[i] != NULL)
++				return;
++		list_move(&p->list, &tree->levels[level].freelh);
++		p = quotatree_remove_ptr(tree, id, level);
++	}
++}
++
++#if 0
++static void quotatree_walk(struct quotatree_tree *tree,
++		struct quotatree_node *node_start,
++		quotaid_t id_start,
++		int level_start, int level_end,
++		int (*callback)(struct quotatree_tree *,
++				quotaid_t id,
++				int level,
++				void *ptr,
++				void *data),
++		void *data)
++{
++	struct quotatree_node *p;
++	int l, shift, index;
++	quotaid_t id;
++	struct quotatree_find_state st;
++
++	p = node_start;
++	l = level_start;
++	shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
++	id = id_start;
++	index = 0;
++
++	/*
++	 * Invariants:
++	 * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
++	 * id & ((1 << shift) - 1) == 0
++	 * p is l-level node corresponding to id
++	 */
++	do {
++		if (!p)
++			break;
++
++		if (l < level_end) {
++			for (; index < QUOTATREE_BSIZE; index++)
++				if (p->blocks[index] != NULL)
++					break;
++			if (index < QUOTATREE_BSIZE) {
++				/* descend */
++				p = p->blocks[index];
++				l++;
++				shift -= QUOTAID_BBITS;
++				id += (quotaid_t)index << shift;
++				index = 0;
++				continue;
++			}
++		}
++
++		if ((*callback)(tree, id, l, p, data))
++			break;
++
++		/* ascend and to the next node */
++		p = quotatree_follow(tree, id, l, &st);
++
++		index = ((id >> shift) & QUOTATREE_BMASK) + 1;
++		l--;
++		shift += QUOTAID_BBITS;
++		id &= ~(((quotaid_t)1 << shift) - 1);
++	} while (l >= level_start);
++}
++#endif
++
++static void free_list(struct list_head *node_list)
++{
++	struct quotatree_node *p, *tmp;
++
++	list_for_each_entry_safe(p, tmp, node_list, list) {
++		list_del(&p->list);
++		kfree(p);
++	}
++}
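++
++/*
++ * free_list() above needs the _safe iterator because every node is
++ * freed inside the loop; plain list_for_each_entry() would read the
++ * next pointer out of already freed memory when advancing.  The same
++ * idea on a plain singly linked list, as a self-contained userspace
++ * sketch (free() from <stdlib.h>):
++ *
++ *	struct node { struct node *next; };
++ *
++ *	static void free_all(struct node *head)
++ *	{
++ *		struct node *p, *tmp;
++ *
++ *		for (p = head; p != NULL; p = tmp) {
++ *			tmp = p->next;	// read the link before freeing
++ *			free(p);
++ *		}
++ *	}
++ */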
++
++static inline void quotatree_free_nodes(struct quotatree_tree *tree)
++{
++	int i;
++
++	for (i = 0; i < QUOTATREE_DEPTH; i++) {
++		free_list(&tree->levels[i].usedlh);
++		free_list(&tree->levels[i].freelh);
++	}
++}
++
++static void quotatree_free_leafs(struct quotatree_tree *tree,
++		void (*dtor)(void *))
++{
++	int i;
++	struct quotatree_node *p;
++
++	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
++		for (i = 0; i < QUOTATREE_BSIZE; i++) {
++			if (p->blocks[i] == NULL)
++				continue;
++
++			dtor(p->blocks[i]);
++		}
++	}
++}
++
++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
++{
++	quotatree_free_leafs(tree, dtor);
++	quotatree_free_nodes(tree);
++	kfree(tree);
++}
+diff --git a/fs/vzdq_ugid.c b/fs/vzdq_ugid.c
+new file mode 100644
+index 0000000..1031149
+--- /dev/null
++++ b/fs/vzdq_ugid.c
+@@ -0,0 +1,1221 @@
++/*
++ * Copyright (C) 2002 SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo UID/GID disk quota implementation
++ */
++
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/smp_lock.h>
++#include <linux/rcupdate.h>
++#include <asm/uaccess.h>
++#include <linux/proc_fs.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/quota.h>
++#include <linux/quotaio_v2.h>
++#include <linux/virtinfo.h>
++#include <linux/namei.h>
++#include <linux/mount.h>
++#include <linux/mnt_namespace.h>
++#include <linux/vmalloc.h>
++
++#include <linux/vzctl.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++
++/*
++ * XXX
++ * maybe something is needed for sb->s_dquot->info[]?
++ */
++
++#define USRQUOTA_MASK		(1 << USRQUOTA)
++#define GRPQUOTA_MASK		(1 << GRPQUOTA)
++#define QTYPE2MASK(type)	(1 << (type))
++
++static struct kmem_cache *vz_quota_ugid_cachep;
++
++/* guard to protect vz_quota_master from destruction in quota_on/off. Also
++ * protects the lists in the hash table */
++extern struct semaphore vz_quota_sem;
++
++inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
++{
++	if (qugid != VZ_QUOTA_UGBAD)
++		atomic_inc(&qugid->qugid_count);
++	return qugid;
++}
++
++/* we don't limit users with zero limits */
++static inline int vzquota_fake_stat(struct dq_stat *stat)
++{
++	return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
++		stat->ihardlimit == 0 && stat->isoftlimit == 0;
++}
++
++/* callback function for quotatree_free() */
++static inline void vzquota_free_qugid(void *ptr)
++{
++	kmem_cache_free(vz_quota_ugid_cachep, ptr);
++}
++
++/*
++ * destroy a ugid if it has zero refcount, limits and usage;
++ * must be called under qmblk->dq_sem
++ */
++void vzquota_put_ugid(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid)
++{
++	if (qugid == VZ_QUOTA_UGBAD)
++		return;
++	qmblk_data_read_lock(qmblk);
++	if (atomic_dec_and_test(&qugid->qugid_count) &&
++	    (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
++	    vzquota_fake_stat(&qugid->qugid_stat) &&
++	    qugid->qugid_stat.bcurrent == 0 &&
++	    qugid->qugid_stat.icurrent == 0) {
++		quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
++				qugid->qugid_id);
++		qmblk->dq_ugid_count--;
++		vzquota_free_qugid(qugid);
++	}
++	qmblk_data_read_unlock(qmblk);
++}
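++
++/*
++ * vzquota_get_ugid()/vzquota_put_ugid() form a get/put pair: put drops
++ * the reference and lazily destroys the record once it is unreferenced,
++ * has no usage and no limits set.  Typical usage, sketched (error
++ * handling reduced to the VZ_QUOTA_UGBAD check; the caller is assumed
++ * to hold a reference to qmblk):
++ *
++ *	qugid = vzquota_find_ugid(qmblk, id, USRQUOTA, 0);	// takes a ref
++ *	if (qugid != VZ_QUOTA_UGBAD) {
++ *		// ... read or update qugid->qugid_stat under the data lock ...
++ *		vzquota_put_ugid(qmblk, qugid);			// drops the ref
++ *	}
++ */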
++
++/*
++ * Get a ugid block by its index, as if it were stored in an array.
++ * In reality, this is not an array - it is the chain of the tree's leaves.
++ * Returns NULL if the index is out of range.
++ * qmblk semaphore is required to protect the tree.
++ */
++static inline struct vz_quota_ugid *
++vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
++{
++	return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
++}
++
++/*
++ * get the next element from the ugid "virtual array";
++ * the ugid must be in the current array, and the array may not change between
++ * two accesses (guaranteed by the "stopped" quota state and the quota semaphore)
++ * qmblk semaphore is required to protect the tree
++ */
++static inline struct vz_quota_ugid *
++vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
++{
++	return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
++			qugid->qugid_id);
++}
++
++/*
++ * requires dq_sem
++ */
++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
++			unsigned int quota_id, int type, int flags)
++{
++	struct vz_quota_ugid *qugid;
++	struct quotatree_tree *tree;
++	struct quotatree_find_state st;
++
++	tree = QUGID_TREE(qmblk, type);
++	qugid = quotatree_find(tree, quota_id, &st);
++	if (qugid)
++		goto success;
++
++	/* caller does not want alloc */
++	if (flags & VZDQUG_FIND_DONT_ALLOC)
++		goto fail;
++
++	if (flags & VZDQUG_FIND_FAKE)
++		goto doit;
++
++	/* check limit */
++	if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
++		goto fail;
++
++	/* see comment at VZDQUG_FIXED_SET define */
++	if (qmblk->dq_flags & VZDQUG_FIXED_SET)
++		goto fail;
++
++doit:
++	/* alloc new structure */
++	qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
++			GFP_NOFS | __GFP_NOFAIL);
++	if (qugid == NULL)
++		goto fail;
++
++	/* initialize new structure */
++	qugid->qugid_id = quota_id;
++	memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
++	qugid->qugid_type = type;
++	atomic_set(&qugid->qugid_count, 0);
++
++	/* insert in tree */
++	if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
++		goto fail_insert;
++	qmblk->dq_ugid_count++;
++
++success:
++	vzquota_get_ugid(qugid);
++	return qugid;
++
++fail_insert:
++	vzquota_free_qugid(qugid);
++fail:
++	return VZ_QUOTA_UGBAD;
++}
++
++/*
++ * takes dq_sem, may schedule
++ */
++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
++			unsigned int quota_id, int type, int flags)
++{
++	struct vz_quota_ugid *qugid;
++
++	down(&qmblk->dq_sem);
++	qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
++	up(&qmblk->dq_sem);
++
++	return qugid;
++}
++
++/*
++ * destroy all ugid records on given quota master
++ */
++void vzquota_kill_ugid(struct vz_quota_master *qmblk)
++{
++	BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
++		(qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
++
++	if (qmblk->dq_uid_tree != NULL) {
++		quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
++		quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
++	}
++}
++
++
++/* ----------------------------------------------------------------------
++ * Management interface to ugid quota for (super)users.
++ * --------------------------------------------------------------------- */
++
++static int vzquota_initialize2(struct inode *inode, int type)
++{
++	return QUOTA_OK;
++}
++
++static int vzquota_drop2(struct inode *inode)
++{
++	return QUOTA_OK;
++}
++
++static int vzquota_alloc_space2(struct inode *inode,
++			     qsize_t number, int prealloc)
++{
++	inode_add_bytes(inode, number);
++	return QUOTA_OK;
++}
++
++static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number)
++{
++	return QUOTA_OK;
++}
++
++static int vzquota_free_space2(struct inode *inode, qsize_t number)
++{
++	inode_sub_bytes(inode, number);
++	return QUOTA_OK;
++}
++
++static int vzquota_free_inode2(const struct inode *inode, unsigned long number)
++{
++	return QUOTA_OK;
++}
++
++static int vzquota_transfer2(struct inode *inode, struct iattr *iattr)
++{
++	return QUOTA_OK;
++}
++
++struct dquot_operations vz_quota_operations2 = {
++	.initialize	= vzquota_initialize2,
++	.drop		= vzquota_drop2,
++	.alloc_space	= vzquota_alloc_space2,
++	.alloc_inode	= vzquota_alloc_inode2,
++	.free_space	= vzquota_free_space2,
++	.free_inode	= vzquota_free_inode2,
++	.transfer	= vzquota_transfer2,
++};
++
++
++asmlinkage long sys_unlink(const char __user * pathname);
++asmlinkage long sys_rename(const char __user * oldname,
++	       const char __user * newname);
++asmlinkage long sys_symlink(const char __user * oldname,
++	       const char __user * newname);
++
++/* called under sb->s_umount semaphore */
++static int vz_restore_symlink(struct super_block *sb, char *path, int type)
++{
++	mm_segment_t oldfs;
++	char *newpath;
++	char dest[64];
++	const char *names[] = {
++		[USRQUOTA] "aquota.user",
++		[GRPQUOTA] "aquota.group"
++	};
++	int err;
++
++	newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL);
++	if (newpath == NULL)
++		return -ENOMEM;
++
++	strcpy(newpath, path);
++	strcat(newpath, ".new");
++
++	sprintf(dest, "/proc/vz/vzaquota/%08x/%s",
++			new_encode_dev(sb->s_dev), names[type]);
++
++	/*
++	 * Lockdep will learn an unneeded dependency during unlink(2):
++	 *	->s_umount => ->i_mutex/1 => ->i_mutex
++	 * Reverse dependency is,
++	 *	open_namei() => ->i_mutex => lookup_hash() => __lookup_hash()
++	 *	=> ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev()
++	 *	=> user_get_super() => ->s_umount
++	 *
++	 * However, the first set of ->i_mutex'es belongs to /, the second to /proc .
++	 * The right fix is to get rid of vz_restore_symlink(), of course.
++	 */
++	up_read(&sb->s_umount);
++
++	oldfs = get_fs();
++	set_fs(KERNEL_DS);
++	err = sys_unlink(newpath);
++	if (err < 0 && err != -ENOENT)
++		goto out_restore;
++	err = sys_symlink(dest, newpath);
++	if (err < 0)
++		goto out_restore;
++	err = sys_rename(newpath, path);
++out_restore:
++	set_fs(oldfs);
++
++	down_read(&sb->s_umount);
++	/* umounted meanwhile? */
++	if (err == 0 && !sb->s_root)
++		err = -ENODEV;
++
++	kfree(newpath);
++	return err;
++}
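++
++/*
++ * vz_restore_symlink() above uses the classic atomic-replace idiom:
++ * create the new symlink under a temporary name, then rename(2) it over
++ * the old path, so readers always see either the old link or the new
++ * one.  The same idiom in plain userspace C (hypothetical paths, error
++ * handling reduced to a single bail-out):
++ *
++ *	#include <stdio.h>
++ *	#include <unistd.h>
++ *
++ *	static int replace_symlink(const char *target, const char *path,
++ *			const char *tmppath)
++ *	{
++ *		unlink(tmppath);		// stale leftover; ENOENT is fine
++ *		if (symlink(target, tmppath) < 0)
++ *			return -1;
++ *		return rename(tmppath, path);	// atomic switch-over
++ *	}
++ */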
++
++/* called under sb->s_umount semaphore */
++static int vz_quota_on(struct super_block *sb, int type,
++		int format_id, char *path, int remount)
++{
++	struct vz_quota_master *qmblk;
++	int mask, mask2;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++
++	err = vz_restore_symlink(sb, path, type);
++	if (err < 0)
++		goto out_put;
++
++	down(&vz_quota_sem);
++	mask = 0;
++	mask2 = 0;
++	sb->dq_op = &vz_quota_operations2;
++	sb->s_qcop = &vz_quotactl_operations;
++	if (type == USRQUOTA) {
++		mask = DQUOT_USR_ENABLED;
++		mask2 = VZDQ_USRQUOTA;
++	}
++	if (type == GRPQUOTA) {
++		mask = DQUOT_GRP_ENABLED;
++		mask2 = VZDQ_GRPQUOTA;
++	}
++	err = -EBUSY;
++	if (qmblk->dq_flags & mask2)
++		goto out_sem;
++
++	err = 0;
++	qmblk->dq_flags |= mask2;
++	sb->s_dquot.flags |= mask;
++
++out_sem:
++	up(&vz_quota_sem);
++out_put:
++	qmblk_put(qmblk);
++out:
++	return err;
++}
++
++static int vz_quota_off(struct super_block *sb, int type, int remount)
++{
++	struct vz_quota_master *qmblk;
++	int mask2;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	down(&vz_quota_sem);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++
++	mask2 = 0;
++	if (type == USRQUOTA)
++		mask2 = VZDQ_USRQUOTA;
++	if (type == GRPQUOTA)
++		mask2 = VZDQ_GRPQUOTA;
++	err = -EINVAL;
++	if (!(qmblk->dq_flags & mask2))
++		goto out;
++
++	qmblk->dq_flags &= ~mask2;
++	err = 0;
++
++out:
++	up(&vz_quota_sem);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++	return err;
++}
++
++static int vz_quota_sync(struct super_block *sb, int type)
++{
++	return 0;	/* vz quota is always uptodate */
++}
++
++static int vz_get_dqblk(struct super_block *sb, int type,
++		qid_t id, struct if_dqblk *di)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid *ugid;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	down(&vz_quota_sem);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++
++	err = 0;
++	ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC);
++	if (ugid != VZ_QUOTA_UGBAD) {
++		qmblk_data_read_lock(qmblk);
++		di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10;
++		di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10;
++		di->dqb_curspace = ugid->qugid_stat.bcurrent;
++		di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit;
++		di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit;
++		di->dqb_curinodes = ugid->qugid_stat.icurrent;
++		di->dqb_btime = ugid->qugid_stat.btime;
++		di->dqb_itime = ugid->qugid_stat.itime;
++		qmblk_data_read_unlock(qmblk);
++		di->dqb_valid = QIF_ALL;
++		vzquota_put_ugid(qmblk, ugid);
++	} else {
++		memset(di, 0, sizeof(*di));
++		di->dqb_valid = QIF_ALL;
++	}
++
++out:
++	up(&vz_quota_sem);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++	return err;
++}
++
++/* must be called under vz_quota_sem */
++static int __vz_set_dqblk(struct vz_quota_master *qmblk,
++		int type, qid_t id, struct if_dqblk *di)
++{
++	struct vz_quota_ugid *ugid;
++
++	ugid = vzquota_find_ugid(qmblk, id, type, 0);
++	if (ugid == VZ_QUOTA_UGBAD)
++		return -ESRCH;
++
++	qmblk_data_write_lock(qmblk);
++	/*
++	 * Subtle compatibility breakage.
++	 *
++	 * Some old non-vz kernel quota didn't start the grace period
++	 * if the new soft limit happened to be below the usage.
++	 * Non-vz kernel quota in 2.4.20 starts the grace period
++	 * (if it hasn't been started).
++	 * Current non-vz kernel performs even more complicated
++	 * manipulations...
++	 *
++	 * Also, current non-vz kernels have an inconsistency related to
++	 * the grace time start.  In regular operations the grace period
++	 * is started if the usage is greater than the soft limit (and,
++	 * strangely, is cancelled if the usage is less).
++	 * However, set_dqblk starts the grace period if the usage is greater
++	 * or equal to the soft limit.
++	 *
++	 * Here we try to mimic the behavior of the current non-vz kernel.
++	 */
++	if (di->dqb_valid & QIF_BLIMITS) {
++		ugid->qugid_stat.bhardlimit =
++			(__u64)di->dqb_bhardlimit << 10;
++		ugid->qugid_stat.bsoftlimit =
++			(__u64)di->dqb_bsoftlimit << 10;
++		if (di->dqb_bsoftlimit == 0 ||
++		    ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit)
++			ugid->qugid_stat.btime = 0;
++		else if (!(di->dqb_valid & QIF_BTIME))
++			ugid->qugid_stat.btime = CURRENT_TIME_SECONDS
++				+ qmblk->dq_ugid_info[type].bexpire;
++		else
++			ugid->qugid_stat.btime = di->dqb_btime;
++	}
++	if (di->dqb_valid & QIF_ILIMITS) {
++		ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit;
++		ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit;
++		if (di->dqb_isoftlimit == 0 ||
++		    ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit)
++			ugid->qugid_stat.itime = 0;
++		else if (!(di->dqb_valid & QIF_ITIME))
++			ugid->qugid_stat.itime = CURRENT_TIME_SECONDS
++				+ qmblk->dq_ugid_info[type].iexpire;
++		else
++			ugid->qugid_stat.itime = di->dqb_itime;
++	}
++	qmblk_data_write_unlock(qmblk);
++	vzquota_put_ugid(qmblk, ugid);
++
++	return 0;
++}
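++
++/*
++ * Worked example of the grace logic above (illustrative; the numbers
++ * are hypothetical): with bexpire set to 7 days, a set_dqblk that
++ * lowers bsoftlimit to 100 MB while bcurrent is already 150 MB and
++ * does not pass QIF_BTIME leaves bcurrent >= bsoftlimit, so btime
++ * becomes CURRENT_TIME_SECONDS + 7 days and the grace period starts
++ * immediately.  Raising bsoftlimit above the usage, or clearing it
++ * to 0, resets btime to 0 and stops any running grace period.
++ */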
++
++static int vz_set_dqblk(struct super_block *sb, int type,
++		qid_t id, struct if_dqblk *di)
++{
++	struct vz_quota_master *qmblk;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	down(&vz_quota_sem);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++	err = __vz_set_dqblk(qmblk, type, id, di);
++out:
++	up(&vz_quota_sem);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++	return err;
++}
++
++static int vz_get_dqinfo(struct super_block *sb, int type,
++		struct if_dqinfo *ii)
++{
++	struct vz_quota_master *qmblk;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	down(&vz_quota_sem);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++
++	err = 0;
++	ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire;
++	ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire;
++	ii->dqi_flags = 0;
++	ii->dqi_valid = IIF_ALL;
++
++out:
++	up(&vz_quota_sem);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++	return err;
++}
++
++/* must be called under vz_quota_sem */
++static int __vz_set_dqinfo(struct vz_quota_master *qmblk,
++		int type, struct if_dqinfo *ii)
++{
++	if (ii->dqi_valid & IIF_FLAGS)
++		if (ii->dqi_flags & DQF_MASK)
++			return -EINVAL;
++
++	if (ii->dqi_valid & IIF_BGRACE)
++		qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace;
++	if (ii->dqi_valid & IIF_IGRACE)
++		qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace;
++	return 0;
++}
++
++static int vz_set_dqinfo(struct super_block *sb, int type,
++		struct if_dqinfo *ii)
++{
++	struct vz_quota_master *qmblk;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	down(&vz_quota_sem);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++	err = __vz_set_dqinfo(qmblk, type, ii);
++out:
++	up(&vz_quota_sem);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++	return err;
++}
++
++#ifdef CONFIG_QUOTA_COMPAT
++
++#define Q_GETQUOTI_SIZE 1024
++
++#define UGID2DQBLK(dst, src)						\
++	do {								\
++		(dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit;	\
++		(dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit;	\
++		(dst)->dqb_curinodes = (src)->qugid_stat.icurrent;	\
++		/* in 1K blocks */					\
++		(dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \
++		/* in 1K blocks */					\
++		(dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \
++		/* in bytes, 64 bit */					\
++		(dst)->dqb_curspace = (src)->qugid_stat.bcurrent;	\
++		(dst)->dqb_btime = (src)->qugid_stat.btime;		\
++		(dst)->dqb_itime = (src)->qugid_stat.itime;		\
++	} while (0)
++
++static int vz_get_quoti(struct super_block *sb, int type, qid_t idx,
++		struct v2_disk_dqblk __user *dqblk)
++{
++	struct vz_quota_master *qmblk;
++	struct v2_disk_dqblk *data, *kbuf;
++	struct vz_quota_ugid *ugid;
++	int count;
++	int err;
++
++	qmblk = vzquota_find_qmblk(sb);
++	err = -ESRCH;
++	if (qmblk == NULL)
++		goto out;
++	err = -EIO;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out;
++
++	err = -ENOMEM;
++	kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf));
++	if (!kbuf)
++		goto out;
++
++	down(&vz_quota_sem);
++	down(&qmblk->dq_sem);
++	for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0;
++		ugid != NULL && count < Q_GETQUOTI_SIZE;
++		count++)
++	{
++		data = kbuf + count;
++		qmblk_data_read_lock(qmblk);
++		UGID2DQBLK(data, ugid);
++		qmblk_data_read_unlock(qmblk);
++		data->dqb_id = ugid->qugid_id;
++
++		/* Find next entry */
++		ugid = vzquota_get_next(qmblk, ugid);
++		BUG_ON(ugid != NULL && ugid->qugid_type != type);
++	}
++	up(&qmblk->dq_sem);
++	up(&vz_quota_sem);
++
++	err = count;
++	if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf)))
++		err = -EFAULT;
++
++	vfree(kbuf);
++out:
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qmblk);
++
++	return err;
++}
++
++#endif
++
++struct quotactl_ops vz_quotactl_operations = {
++	.quota_on	= vz_quota_on,
++	.quota_off	= vz_quota_off,
++	.quota_sync	= vz_quota_sync,
++	.get_info	= vz_get_dqinfo,
++	.set_info	= vz_set_dqinfo,
++	.get_dqblk	= vz_get_dqblk,
++	.set_dqblk	= vz_set_dqblk,
++#ifdef CONFIG_QUOTA_COMPAT
++	.get_quoti	= vz_get_quoti,
++#endif
++};
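++
++/*
++ * Usage sketch (an assumption about the surrounding kernel, not part
++ * of this hunk): once s_qcop points to vz_quotactl_operations, the
++ * generic quotactl(2) dispatcher routes an ordinary
++ *
++ *	struct if_dqblk dq;
++ *	quotactl(QCMD(Q_GETQUOTA, USRQUOTA), special, uid, (caddr_t)&dq);
++ *
++ * ("special" being whatever device path the VE filesystem exposes)
++ * into vz_get_dqblk() above, so stock quota(1)/edquota(8) tools keep
++ * working against VZ quota data.
++ */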
++
++
++/* ----------------------------------------------------------------------
++ * Management interface for host system admins.
++ * --------------------------------------------------------------------- */
++
++static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size,
++		struct vz_quota_iface __user *u_ugid_buf, int compat)
++{
++	struct vz_quota_master *qmblk;
++	int ret;
++
++	down(&vz_quota_sem);
++
++	ret = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	ret = -EBUSY;
++	if (qmblk->dq_state != VZDQ_STARTING)
++		goto out; /* working quota doesn't accept new ugids */
++
++	ret = 0;
++	/* start to add ugids */
++	for (ret = 0; ret < ugid_size; ret++) {
++		struct vz_quota_iface ugid_buf;
++		struct vz_quota_ugid *ugid;
++
++		if (!compat) {
++			if (copy_from_user(&ugid_buf, u_ugid_buf,
++							sizeof(ugid_buf)))
++				break;
++			u_ugid_buf++; /* next user buffer */
++		} else {
++#ifdef CONFIG_COMPAT
++			struct compat_vz_quota_iface oqif;
++			if (copy_from_user(&oqif, u_ugid_buf,
++							sizeof(oqif)))
++				break;
++			ugid_buf.qi_id = oqif.qi_id;
++			ugid_buf.qi_type = oqif.qi_type;
++			compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat);
++			u_ugid_buf = (struct vz_quota_iface __user *)
++					(((void *)u_ugid_buf) + sizeof(oqif));
++#endif
++		}
++
++		if (ugid_buf.qi_type >= MAXQUOTAS)
++			break; /* bad quota type - this is the only check */
++
++		ugid = vzquota_find_ugid(qmblk,
++				ugid_buf.qi_id, ugid_buf.qi_type, 0);
++		if (ugid == VZ_QUOTA_UGBAD) {
++			qmblk->dq_flags |= VZDQUG_FIXED_SET;
++			break; /* limit reached */
++		}
++
++		/* update usage/limits;
++		 * we can copy the data without the lock, because the data
++		 * cannot be modified in VZDQ_STARTING state */
++		ugid->qugid_stat = ugid_buf.qi_stat;
++
++		vzquota_put_ugid(qmblk, ugid);
++	}
++out:
++	up(&vz_quota_sem);
++
++	return ret;
++}
++
++static int quota_ugid_setgrace(unsigned int quota_id,
++		struct dq_info __user u_dq_info[], int compat)
++{
++	struct vz_quota_master *qmblk;
++	struct dq_info dq_info[MAXQUOTAS];
++	struct dq_info *target;
++	int err, type;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++	
++	err = -EBUSY;
++	if (qmblk->dq_state != VZDQ_STARTING)
++		goto out; /* working quota doesn't accept changing options */
++
++	err = -EFAULT;
++	if (!compat) {
++		if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info)))
++			goto out;
++	} else {
++#ifdef CONFIG_COMPAT
++		struct compat_dq_info odqi[MAXQUOTAS];
++		if (copy_from_user(odqi, u_dq_info, sizeof(odqi)))
++			goto out;
++		for (type = 0; type < MAXQUOTAS; type++)
++			compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]);
++#endif
++	}
++
++	err = 0;
++
++	/* update in qmblk */
++	for (type = 0; type < MAXQUOTAS; type++) {
++		target = &qmblk->dq_ugid_info[type];
++		target->bexpire = dq_info[type].bexpire;
++		target->iexpire = dq_info[type].iexpire;
++	}
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size,
++		struct vz_quota_iface *u_ugid_buf)
++{
++	int type, count;
++	struct vz_quota_ugid *ugid;
++
++	if (QTREE_LEAFNUM(qmblk->dq_uid_tree) +
++	    QTREE_LEAFNUM(qmblk->dq_gid_tree)
++	    		<= index)
++		return 0;
++
++	count = 0;
++
++	type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA;
++	if (type == GRPQUOTA)
++		index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
++
++	/* loop through ugid and then qgid quota */
++repeat:
++	for (ugid = vzquota_get_byindex(qmblk, index, type);
++		ugid != NULL && count < size;
++		ugid = vzquota_get_next(qmblk, ugid), count++)
++	{
++		struct vz_quota_iface ugid_buf;
++
++		/* form interface buffer and send it to user level */
++		qmblk_data_read_lock(qmblk);
++		memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat,
++				sizeof(ugid_buf.qi_stat));
++		qmblk_data_read_unlock(qmblk);
++		ugid_buf.qi_id = ugid->qugid_id;
++		ugid_buf.qi_type = ugid->qugid_type;
++
++		memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf));
++		u_ugid_buf++; /* next portion of user buffer */
++	}
++
++	if (type == USRQUOTA && count < size) {
++		type = GRPQUOTA;
++		index = 0;
++		goto repeat;
++	}
++
++	return count;
++}
++
++static int quota_ugid_getstat(unsigned int quota_id,
++		int index, int size, struct vz_quota_iface __user *u_ugid_buf,
++		int compat)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_iface *k_ugid_buf;
++	int err;
++
++	if (index < 0 || size < 0)
++		return -EINVAL;
++
++	if (size > INT_MAX / sizeof(struct vz_quota_iface))
++		return -EINVAL;
++
++	k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
++	if (k_ugid_buf == NULL)
++		return -ENOMEM;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	down(&qmblk->dq_sem);
++	err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
++	up(&qmblk->dq_sem);
++	if (err < 0)
++		goto out;
++
++	if (!compat) {
++		if (copy_to_user(u_ugid_buf, k_ugid_buf,
++					err * sizeof(struct vz_quota_iface)))
++			err = -EFAULT;
++	} else {
++#ifdef CONFIG_COMPAT
++		struct compat_vz_quota_iface oqif;
++		int i;
++		for (i = 0; i < err; i++) {
++			oqif.qi_id = k_ugid_buf[i].qi_id;
++			oqif.qi_type = k_ugid_buf[i].qi_type;
++			dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat,
++					  &oqif.qi_stat);
++			if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif)))
++				err = -EFAULT;
++			u_ugid_buf = (struct vz_quota_iface __user *)
++					(((void *)u_ugid_buf) + sizeof(oqif));
++		}
++#endif
++	}
++
++out:
++	up(&vz_quota_sem);
++	vfree(k_ugid_buf);
++	return err;
++}
++
++static int quota_ugid_getgrace(unsigned int quota_id,
++		struct dq_info __user u_dq_info[], int compat)
++{
++	struct vz_quota_master *qmblk;
++	struct dq_info dq_info[MAXQUOTAS];
++	struct dq_info *target;
++	int err, type;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++	
++	err = 0;
++	/* update from qmblk */
++	for (type = 0; type < MAXQUOTAS; type++) {
++		target = &qmblk->dq_ugid_info[type];
++		dq_info[type].bexpire = target->bexpire;
++		dq_info[type].iexpire = target->iexpire;
++		dq_info[type].flags = target->flags;
++	}
++
++	if (!compat) {
++		if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
++			err = -EFAULT;
++	} else {
++#ifdef CONFIG_COMPAT
++		struct compat_dq_info odqi[MAXQUOTAS];
++		for (type = 0; type < MAXQUOTAS; type++)
++			dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]);
++		if (copy_to_user(u_dq_info, odqi, sizeof(odqi)))
++			err = -EFAULT;
++#endif
++	}
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++static int quota_ugid_getconfig(unsigned int quota_id, 
++		struct vz_quota_ugid_stat __user *info)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid_stat kinfo;
++	int err;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++	
++	err = 0;
++	kinfo.limit = qmblk->dq_ugid_max;
++	kinfo.count = qmblk->dq_ugid_count;
++	kinfo.flags = qmblk->dq_flags;
++
++	if (copy_to_user(info, &kinfo, sizeof(kinfo)))
++		err = -EFAULT;
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++static int quota_ugid_setconfig(unsigned int quota_id,
++		struct vz_quota_ugid_stat __user *info)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid_stat kinfo;
++	int err;
++
++	down(&vz_quota_sem);
++
++	err = -ENOENT;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EFAULT;
++	if (copy_from_user(&kinfo, info, sizeof(kinfo)))
++		goto out;
++
++	err = 0;
++	qmblk->dq_ugid_max = kinfo.limit;
++	if (qmblk->dq_state == VZDQ_STARTING) {
++		qmblk->dq_flags = kinfo.flags;
++		if (qmblk->dq_flags & VZDQUG_ON)
++			qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
++	}		
++
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++static int quota_ugid_setlimit(unsigned int quota_id,
++		struct vz_quota_ugid_setlimit __user *u_lim)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid_setlimit lim;
++	int err;
++
++	down(&vz_quota_sem);
++
++	err = -ESRCH;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EFAULT;
++	if (copy_from_user(&lim, u_lim, sizeof(lim)))
++		goto out;
++
++	err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb);
++
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++static int quota_ugid_setinfo(unsigned int quota_id,
++		struct vz_quota_ugid_setinfo __user *u_info)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid_setinfo info;
++	int err;
++
++	down(&vz_quota_sem);
++
++	err = -ESRCH;
++	qmblk = vzquota_find_master(quota_id);
++	if (qmblk == NULL)
++		goto out;
++
++	err = -EFAULT;
++	if (copy_from_user(&info, u_info, sizeof(info)))
++		goto out;
++
++	err = __vz_set_dqinfo(qmblk, info.type, &info.dqi);
++
++out:
++	up(&vz_quota_sem);
++
++	return err;
++}
++
++/*
++ * This is a system call to maintain UGID quotas.
++ * Note: this call is allowed to run ONLY from VE0.
++ */
++long do_vzquotaugidctl(int cmd, unsigned int quota_id,
++		unsigned int ugid_index, unsigned int ugid_size,
++		void *addr, int compat)
++{
++	int ret;
++
++	ret = -EPERM;
++	/* access allowed only from root of VE0 */
++	if (!capable(CAP_SYS_RESOURCE) ||
++	    !capable(CAP_SYS_ADMIN))
++		goto out;
++
++	switch (cmd) {
++		case VZ_DQ_UGID_GETSTAT:
++			ret = quota_ugid_getstat(quota_id,
++					ugid_index, ugid_size,
++				       	(struct vz_quota_iface __user *)addr,
++					compat);
++			break;
++		case VZ_DQ_UGID_ADDSTAT:
++			ret = quota_ugid_addstat(quota_id, ugid_size,
++					(struct vz_quota_iface __user *) addr,
++					compat);
++			break;
++		case VZ_DQ_UGID_GETGRACE:
++			ret = quota_ugid_getgrace(quota_id,
++					(struct dq_info __user *)addr, compat);
++			break;
++		case VZ_DQ_UGID_SETGRACE:
++			ret = quota_ugid_setgrace(quota_id,
++					(struct dq_info __user *)addr, compat);
++			break;
++		case VZ_DQ_UGID_GETCONFIG:
++			ret = quota_ugid_getconfig(quota_id,
++					(struct vz_quota_ugid_stat __user *)
++								addr);
++			break;
++		case VZ_DQ_UGID_SETCONFIG:
++			ret = quota_ugid_setconfig(quota_id,
++					(struct vz_quota_ugid_stat __user *)
++								addr);
++			break;
++		case VZ_DQ_UGID_SETLIMIT:
++			ret = quota_ugid_setlimit(quota_id,
++					(struct vz_quota_ugid_setlimit __user *)
++								addr);
++			break;
++		case VZ_DQ_UGID_SETINFO:
++			ret = quota_ugid_setinfo(quota_id,
++					(struct vz_quota_ugid_setinfo __user *)
++								addr);
++			break;
++		default:
++			ret = -EINVAL;
++			goto out;
++	}
++out:
++	return ret;
++}
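++
++/*
++ * Sketch of a caller (hypothetical; the real entry point is the vzctl
++ * ioctl layer, which is outside this hunk):
++ *
++ *	struct vz_quota_ugid_stat st;
++ *	long err = do_vzquotaugidctl(VZ_DQ_UGID_GETCONFIG, quota_id,
++ *			0, 0, (void __user *)&st, 0);
++ *
++ * Only a task holding both CAP_SYS_RESOURCE and CAP_SYS_ADMIN gets
++ * past the permission check above; everything else sees -EPERM.
++ */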
++
++static void ugid_quota_on_sb(struct super_block *sb)
++{
++	struct super_block *real_sb;
++	struct vz_quota_master *qmblk;
++
++	if (!sb->s_op->get_quota_root)
++		return;
++
++	real_sb = sb->s_op->get_quota_root(sb)->i_sb;
++	if (real_sb->dq_op != &vz_quota_operations)
++		return;
++
++	sb->dq_op = &vz_quota_operations2;
++	sb->s_qcop = &vz_quotactl_operations;
++	INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++	INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++	sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
++	sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
++
++	qmblk = vzquota_find_qmblk(sb);
++	if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD))
++		return;
++	down(&vz_quota_sem);
++	if (qmblk->dq_flags & VZDQ_USRQUOTA)
++		sb->s_dquot.flags |= DQUOT_USR_ENABLED;
++	if (qmblk->dq_flags & VZDQ_GRPQUOTA)
++		sb->s_dquot.flags |= DQUOT_GRP_ENABLED;
++	up(&vz_quota_sem);
++	qmblk_put(qmblk);
++}
++
++static void ugid_quota_off_sb(struct super_block *sb)
++{
++	/* can't turn quota off on a mounted super block */
++	BUG_ON(sb->s_root != NULL);
++}
++
++static int ugid_notifier_call(struct vnotifier_block *self,
++		unsigned long n, void *data, int old_ret)
++{
++	struct virt_info_quota *viq;
++
++	viq = (struct virt_info_quota *)data;
++
++	switch (n) {
++	case VIRTINFO_QUOTA_ON:
++		ugid_quota_on_sb(viq->super);
++		break;
++	case VIRTINFO_QUOTA_OFF:
++		ugid_quota_off_sb(viq->super);
++		break;
++	case VIRTINFO_QUOTA_GETSTAT:
++		break;
++	default:
++		return old_ret;
++	}
++	return NOTIFY_OK;
++}
++
++static struct vnotifier_block ugid_notifier_block = {
++	.notifier_call = ugid_notifier_call,
++};
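++
++/*
++ * Sketch of the producing side (an assumption based on the virtinfo
++ * API this block is registered with in vzquota_ugid_init() below; the
++ * actual call site is outside this hunk):
++ *
++ *	struct virt_info_quota viq = { .super = sb };
++ *	virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq);
++ *
++ * which lands in ugid_notifier_call() above and switches the
++ * superblock into UGID-quota mode via ugid_quota_on_sb().
++ */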
++
++/* ----------------------------------------------------------------------
++ * Init/exit.
++ * --------------------------------------------------------------------- */
++
++int vzquota_ugid_init(void)
++{
++	int err;
++
++	vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid",
++				      sizeof(struct vz_quota_ugid),
++				      0, SLAB_HWCACHE_ALIGN, NULL);
++	if (vz_quota_ugid_cachep == NULL)
++		goto err_slab;
++
++	err = register_quota_format(&vz_quota_empty_v2_format);
++	if (err)
++		goto err_reg;
++
++	virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block);
++	return 0;
++
++err_reg:
++	kmem_cache_destroy(vz_quota_ugid_cachep);
++	return err;
++
++err_slab:
++	printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
++	return -ENOMEM;
++}
++
++void vzquota_ugid_release(void)
++{
++	virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block);
++	unregister_quota_format(&vz_quota_empty_v2_format);
++
++	kmem_cache_destroy(vz_quota_ugid_cachep);
++}
+diff --git a/fs/vzdquot.c b/fs/vzdquot.c
+new file mode 100644
+index 0000000..c13bec2
+--- /dev/null
++++ b/fs/vzdquot.c
+@@ -0,0 +1,1955 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005  SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains the core of Virtuozzo disk quota implementation:
++ * maintenance of VZDQ information in inodes,
++ * external interfaces,
++ * module entry.
++ */
++
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/dcache.h>
++#include <linux/quota.h>
++#include <linux/rcupdate.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <asm/uaccess.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++#include <linux/virtinfo.h>
++#include <linux/vzdq_tree.h>
++#include <linux/mount.h>
++
++/* ----------------------------------------------------------------------
++ *
++ * Locking
++ *
++ * ---------------------------------------------------------------------- */
++
++/*
++ * Serializes on/off and all other do_vzquotactl operations.
++ * Protects qmblk hash.
++ */
++struct semaphore vz_quota_sem;
++
++/*
++ * Data access locks
++ *  inode_qmblk
++ *	protects qmblk pointers in all inodes and qlnk content in general
++ *	(but not qmblk content);
++ *	also protects related qmblk invalidation procedures;
++ *	can't be per-inode because of vzquota_dtree_qmblk complications
++ *	and problems with serialization with quota_on,
++ *	but can be per-superblock;
++ *  qmblk_data
++ *	protects qmblk fields (such as current usage)
++ *  quota_data
++ *	protects charge/uncharge operations, thus, implies
++ *	qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock
++ *	(to protect ugid pointers).
++ *
++ * Lock order:
++ *  inode_qmblk_lock -> dcache_lock
++ *  inode_qmblk_lock -> qmblk_data
++ */
++static DEFINE_SPINLOCK(vzdq_qmblk_lock);
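++
++/*
++ * Minimal sketch of the documented lock order (both helpers are
++ * defined just below):
++ *
++ *	inode_qmblk_lock(sb);
++ *	qmblk_data_write_lock(qmblk);
++ *	... charge/uncharge, update qlnk and usage ...
++ *	qmblk_data_write_unlock(qmblk);
++ *	inode_qmblk_unlock(sb);
++ *
++ * Taking qmblk_data before inode_qmblk would invert the order above
++ * and risk an AB-BA deadlock.
++ */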
++
++inline void inode_qmblk_lock(struct super_block *sb)
++{
++	spin_lock(&vzdq_qmblk_lock);
++}
++
++inline void inode_qmblk_unlock(struct super_block *sb)
++{
++	spin_unlock(&vzdq_qmblk_lock);
++}
++
++inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
++{
++	spin_lock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
++{
++	spin_unlock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
++{
++	spin_lock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
++{
++	spin_unlock(&qmblk->dq_data_lock);
++}
++
++struct quota_format_type vz_quota_empty_v2_format = {
++	.qf_fmt_id	= QFMT_VFS_V0,
++	.qf_ops		= NULL,
++	.qf_owner	= THIS_MODULE,
++};
++
++/* ----------------------------------------------------------------------
++ *
++ * Master hash table handling.
++ *
++ * SMP not safe, serialized by vz_quota_sem within quota syscalls
++ *
++ * --------------------------------------------------------------------- */
++
++static struct kmem_cache *vzquota_cachep;
++
++/*
++ * Hash function.
++ */
++#define QHASH_BITS		6
++#define	VZ_QUOTA_HASH_SIZE	(1 << QHASH_BITS)
++#define QHASH_MASK		(VZ_QUOTA_HASH_SIZE - 1)
++
++struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE];
++int vzquota_hash_size = VZ_QUOTA_HASH_SIZE;
++
++static inline int vzquota_hash_func(unsigned int qid)
++{
++	return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK);
++}
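++
++/*
++ * Example: quota_id 0x1234 hashes to
++ * ((0x1234 >> 6) ^ 0x1234) & 0x3f = (0x0048 ^ 0x1234) & 0x3f = 0x3c,
++ * folding the bits above QHASH_BITS into the 6-bit chain index.
++ */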
++
++/**
++ * vzquota_alloc_master - alloc and instantiate master quota record
++ *
++ * Returns:
++ *	pointer to newly created record if SUCCESS
++ *	-ENOMEM if out of memory
++ *	-EEXIST if a record with the given quota_id already exists
++ */
++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
++		struct vz_quota_stat *qstat)
++{
++	int err;
++	struct vz_quota_master *qmblk;
++
++	err = -EEXIST;
++	if (vzquota_find_master(quota_id) != NULL)
++		goto out;
++
++	err = -ENOMEM;
++	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
++	if (qmblk == NULL)
++		goto out;
++#ifdef CONFIG_VZ_QUOTA_UGID
++	qmblk->dq_uid_tree = quotatree_alloc();
++	if (!qmblk->dq_uid_tree)
++		goto out_free;
++
++	qmblk->dq_gid_tree = quotatree_alloc();
++	if (!qmblk->dq_gid_tree)
++		goto out_free_tree;
++#endif
++
++	qmblk->dq_state = VZDQ_STARTING;
++	init_MUTEX(&qmblk->dq_sem);
++	spin_lock_init(&qmblk->dq_data_lock);
++
++	qmblk->dq_id = quota_id;
++	qmblk->dq_stat = qstat->dq_stat;
++	qmblk->dq_info = qstat->dq_info;
++	qmblk->dq_root_path.dentry = NULL;
++	qmblk->dq_root_path.mnt = NULL;
++	qmblk->dq_sb = NULL;
++	qmblk->dq_ugid_count = 0;
++	qmblk->dq_ugid_max = 0;
++	qmblk->dq_flags = 0;
++	memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info));
++	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
++
++	atomic_set(&qmblk->dq_count, 1);
++
++	/* insert in hash chain */
++	list_add(&qmblk->dq_hash,
++		&vzquota_hash_table[vzquota_hash_func(quota_id)]);
++
++	/* success */
++	return qmblk;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++out_free_tree:
++	quotatree_free(qmblk->dq_uid_tree, NULL);
++out_free:
++	kmem_cache_free(vzquota_cachep, qmblk);
++#endif
++out:
++	return ERR_PTR(err);
++}
++
++static struct vz_quota_master *vzquota_alloc_fake(void)
++{
++	struct vz_quota_master *qmblk;
++
++	qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
++	if (qmblk == NULL)
++		return NULL;
++	memset(qmblk, 0, sizeof(*qmblk));
++	qmblk->dq_state = VZDQ_STOPING;
++	qmblk->dq_flags = VZDQ_NOQUOT;
++	spin_lock_init(&qmblk->dq_data_lock);
++	INIT_LIST_HEAD(&qmblk->dq_ilink_list);
++	atomic_set(&qmblk->dq_count, 1);
++	return qmblk;
++}
++
++/**
++ * vzquota_find_master - find master record with given id
++ *
++ * Returns qmblk without touching its refcounter.
++ * Called under vz_quota_sem.
++ */
++struct vz_quota_master *vzquota_find_master(unsigned int quota_id)
++{
++	int i;
++	struct vz_quota_master *qp;
++
++	i = vzquota_hash_func(quota_id);
++	list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) {
++		if (qp->dq_id == quota_id)
++			return qp;
++	}
++	return NULL;
++}
++
++/**
++ * vzquota_free_master - release resources taken by qmblk, freeing memory
++ *
++ * qmblk is assumed to be already taken out from the hash.
++ * Should be called outside vz_quota_sem.
++ */
++void vzquota_free_master(struct vz_quota_master *qmblk)
++{
++#ifdef CONFIG_VZ_QUOTA_UGID
++	vzquota_kill_ugid(qmblk);
++#endif
++	BUG_ON(!list_empty(&qmblk->dq_ilink_list));
++	kmem_cache_free(vzquota_cachep, qmblk);
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Passing quota information through current
++ *
++ * Used in inode -> qmblk lookup at inode creation stage (since at that
++ * time there are no links between the inode being created and its parent
++ * directory).
++ *
++ * --------------------------------------------------------------------- */
++
++#define VZDQ_CUR_MAGIC	0x57d0fee2
++
++static inline int vzquota_cur_qmblk_check(void)
++{
++	return current->magic == VZDQ_CUR_MAGIC;
++}
++
++static inline struct inode *vzquota_cur_qmblk_fetch(void)
++{
++	return current->ino;
++}
++
++static inline void vzquota_cur_qmblk_set(struct inode *data)
++{
++	struct task_struct *tsk;
++
++	tsk = current;
++	tsk->magic = VZDQ_CUR_MAGIC;
++	tsk->ino = data;
++}
++
++#if 0
++static inline void vzquota_cur_qmblk_reset(void)
++{
++	current->magic = 0;
++}
++#endif
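++
++/*
++ * Sketch of the intended flow (both ends live in this file):
++ *
++ *	vzquota_inode_init_call(dir inode)     during create/mkdir
++ *	    -> vzquota_cur_qmblk_set(inode)    remember the parent
++ *	... new_inode() allocates the child ...
++ *	vzquota_det_qmblk_recalc(child)
++ *	    -> vzquota_cur_qmblk_check/fetch() recover the parent and
++ *	                                       inherit its qmblk
++ */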
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Superblock quota operations
++ *
++ * --------------------------------------------------------------------- */
++
++/*
++ * Kernel structure abuse.
++ * We use the files[0] pointer as an int variable:
++ * a reference counter of how many quota blocks use this superblock.
++ * files[1] is used for a generations structure, which helps us track
++ * when traversal of dentries is really required.
++ */
++#define __VZ_QUOTA_NOQUOTA(sb)		sb->s_dquot.vzdq_master
++#define __VZ_QUOTA_TSTAMP(sb)		((struct timeval *)\
++						&sb->s_dquot.dqio_mutex)
++
++#if defined(VZ_QUOTA_UNLOAD)
++
++#define __VZ_QUOTA_SBREF(sb)		sb->s_dquot.vzdq_count
++
++struct dquot_operations *orig_dq_op;
++struct quotactl_ops *orig_dq_cop;
++
++/**
++ * quota_get_super - account for a new quota tree under the superblock
++ *
++ * One superblock can have multiple directory subtrees with different VZ
++ * quotas.  We keep a counter of such subtrees and set VZ quota operations or
++ * reset the default ones.
++ *
++ * Called under vz_quota_sem (from quota_on).
++ */
++int vzquota_get_super(struct super_block *sb)
++{
++	if (sb->dq_op != &vz_quota_operations) {
++		down(&sb->s_dquot.dqonoff_sem);
++		if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
++			up(&sb->s_dquot.dqonoff_sem);
++			return -EEXIST;
++		}
++		if (orig_dq_op == NULL && sb->dq_op != NULL)
++			orig_dq_op = sb->dq_op;
++		sb->dq_op = &vz_quota_operations;
++		if (orig_dq_cop == NULL && sb->s_qcop != NULL)
++			orig_dq_cop = sb->s_qcop;
++		/* XXX this may race with sys_quotactl */
++#ifdef CONFIG_VZ_QUOTA_UGID
++		sb->s_qcop = &vz_quotactl_operations;
++#else
++		sb->s_qcop = NULL;
++#endif
++		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
++		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++
++		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++		sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
++		sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
++		/*
++		 * To get quotaops.h to call us we need to mark the superblock
++		 * as having quota.  These flags mark the moment when
++		 * our dq_op start to be called.
++		 *
++		 * The ordering of dq_op and s_dquot.flags assignment
++		 * needs to be enforced, but other CPUs do not do rmb()
++		 * between s_dquot.flags and dq_op accesses.
++		 */
++		wmb(); synchronize_sched();
++		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
++		__module_get(THIS_MODULE);
++		up(&sb->s_dquot.dqonoff_sem);
++	}
++	/* protected by vz_quota_sem */
++	__VZ_QUOTA_SBREF(sb)++;
++	return 0;
++}
++
++/**
++ * quota_put_super - release superblock when one quota tree goes away
++ *
++ * Called under vz_quota_sem.
++ */
++void vzquota_put_super(struct super_block *sb)
++{
++	int count;
++
++	count = --__VZ_QUOTA_SBREF(sb);
++	if (count == 0) {
++		down(&sb->s_dquot.dqonoff_sem);
++		sb->s_dquot.flags = 0;
++		wmb(); synchronize_sched();
++		sema_init(&sb->s_dquot.dqio_sem, 1);
++		sb->s_qcop = orig_dq_cop;
++		sb->dq_op = orig_dq_op;
++		inode_qmblk_lock(sb);
++		quota_gen_put(SB_QGEN(sb));
++		SB_QGEN(sb) = NULL;
++		/* release qlnk's without qmblk */
++		remove_inode_quota_links_list(&non_vzquota_inodes_lh,
++				sb, NULL);
++		/*
++		 * Races with quota initialization:
++		 * after this inode_qmblk_unlock all inode's generations are
++		 * invalidated, quota_inode_qmblk checks superblock operations.
++		 */
++		inode_qmblk_unlock(sb);
++		/*
++		 * Module refcounting: in theory, this is the best place
++		 * to call module_put(THIS_MODULE).
++		 * In reality, it can't be done because we can't be sure that
++		 * other CPUs do not enter our code segment through dq_op
++		 * cached a long time ago.  The quotaops interface isn't supposed to
++		 * go into modules currently (that is, into unloadable
++		 * modules).  By omitting module_put, our module isn't
++		 * unloadable.
++		 */
++		up(&sb->s_dquot.dqonoff_sem);
++	}
++}
++
++#else
++
++struct vzquota_new_sop {
++	struct super_operations new_op;
++	const struct super_operations *old_op;
++};
++
++/**
++ * vzquota_shutdown_super - callback on umount
++ */
++void vzquota_shutdown_super(struct super_block *sb)
++{
++	struct vz_quota_master *qmblk;
++	struct vzquota_new_sop *sop;
++
++	qmblk = __VZ_QUOTA_NOQUOTA(sb);
++	__VZ_QUOTA_NOQUOTA(sb) = NULL;
++	if (qmblk != NULL)
++		qmblk_put(qmblk);
++	sop = container_of(sb->s_op, struct vzquota_new_sop, new_op);
++	sb->s_op = sop->old_op;
++	kfree(sop);
++	if (sb->s_op->put_super != NULL)
++		(*sb->s_op->put_super)(sb);
++}
++
++/**
++ * vzquota_get_super - account for a new quota tree under the superblock
++ *
++ * One superblock can have multiple directory subtrees with different VZ
++ * quotas.
++ *
++ * Called under vz_quota_sem (from vzquota_on).
++ */
++int vzquota_get_super(struct super_block *sb)
++{
++	struct vz_quota_master *qnew;
++	struct vzquota_new_sop *sop;
++	int err;
++
++	mutex_lock(&sb->s_dquot.dqonoff_mutex);
++	err = -EEXIST;
++	if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) &&
++	    sb->dq_op != &vz_quota_operations)
++		goto out_up;
++
++	/*
++	 * This allocation code should be under sb->dq_op check below, but
++	 * it doesn't really matter...
++	 */
++	if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
++		qnew = vzquota_alloc_fake();
++		if (qnew == NULL)
++			goto out_up;
++		__VZ_QUOTA_NOQUOTA(sb) = qnew;
++	}
++
++	if (sb->dq_op != &vz_quota_operations) {
++		sop = kmalloc(sizeof(*sop), GFP_KERNEL);
++		if (sop == NULL) {
++			vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb));
++			__VZ_QUOTA_NOQUOTA(sb) = NULL;
++			goto out_up;
++		}
++		memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op));
++		sop->new_op.put_super = &vzquota_shutdown_super;
++		sop->old_op = sb->s_op;
++		sb->s_op = &sop->new_op;
++
++		sb->dq_op = &vz_quota_operations;
++#ifdef CONFIG_VZ_QUOTA_UGID
++		sb->s_qcop = &vz_quotactl_operations;
++#else
++		sb->s_qcop = NULL;
++#endif
++		do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
++
++		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++		/* these 2 list heads are checked in sync_dquots() */
++		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++		sb->s_dquot.info[USRQUOTA].dqi_format =
++						&vz_quota_empty_v2_format;
++		sb->s_dquot.info[GRPQUOTA].dqi_format =
++						&vz_quota_empty_v2_format;
++
++		/*
++		 * To get quotaops.h to call us we need to mark the superblock
++		 * as having quota.  These flags mark the moment when
++		 * our dq_op start to be called.
++		 *
++		 * The ordering of dq_op and s_dquot.flags assignment
++		 * needs to be enforced, but other CPUs do not do rmb()
++		 * between s_dquot.flags and dq_op accesses.
++		 */
++		wmb(); synchronize_sched();
++		sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
++	}
++	err = 0;
++
++out_up:
++	mutex_unlock(&sb->s_dquot.dqonoff_mutex);
++	return err;
++}
++
++/**
++ * vzquota_put_super - one quota tree less on this superblock
++ *
++ * Called under vz_quota_sem.
++ */
++void vzquota_put_super(struct super_block *sb)
++{
++	/*
++	 * Even if this put is the last one,
++	 * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
++	 * won't be called and the remaining qmblk references won't be put.
++	 */
++}
++
++#endif
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Helpers for inode -> qmblk link maintenance
++ *
++ * --------------------------------------------------------------------- */
++
++#define __VZ_QUOTA_EMPTY		((void *)0xbdbdbdbd)
++#define VZ_QUOTA_IS_NOQUOTA(qm, sb)	((qm)->dq_flags & VZDQ_NOQUOT)
++#define VZ_QUOTA_EMPTY_IOPS		(&vfs_empty_iops)
++extern struct inode_operations vfs_empty_iops;
++
++static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
++{
++	struct vz_quota_master *qmblk;
++
++	qmblk = INODE_QLNK(inode)->qmblk;
++	if (qmblk == VZ_QUOTA_BAD)
++		return 1;
++	if (qmblk == __VZ_QUOTA_EMPTY)
++		return 0;
++	if (qmblk->dq_flags & VZDQ_NOACT)
++		/* not actual (invalidated) qmblk */
++		return 0;
++	return 1;
++}
++
++static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
++{
++	return qlnk->qmblk == __VZ_QUOTA_EMPTY;
++}
++
++static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk,
++		unsigned char origin)
++{
++	qlnk->origin[0] = qlnk->origin[1];
++	qlnk->origin[1] = origin;
++}
++
++static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
++{
++	qlnk->qmblk = __VZ_QUOTA_EMPTY;
++	set_qlnk_origin(qlnk, VZ_QUOTAO_SETE);
++}
++
++void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
++{
++	memset(qlnk, 0, sizeof(*qlnk));
++	INIT_LIST_HEAD(&qlnk->list);
++	vzquota_qlnk_set_empty(qlnk);
++	set_qlnk_origin(qlnk, VZ_QUOTAO_INIT);
++}
++
++void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
++{
++	might_sleep();
++	if (vzquota_qlnk_is_empty(qlnk))
++		return;
++#if defined(CONFIG_VZ_QUOTA_UGID)
++	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
++		struct vz_quota_master *qmblk;
++		struct vz_quota_ugid *quid, *qgid;
++		qmblk = qlnk->qmblk;
++		quid = qlnk->qugid[USRQUOTA];
++		qgid = qlnk->qugid[GRPQUOTA];
++		if (quid != NULL || qgid != NULL) {
++			down(&qmblk->dq_sem);
++			if (qgid != NULL)
++				vzquota_put_ugid(qmblk, qgid);
++			if (quid != NULL)
++				vzquota_put_ugid(qmblk, quid);
++			up(&qmblk->dq_sem);
++		}
++	}
++#endif
++	if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
++		qmblk_put(qlnk->qmblk);
++	set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR);
++}
++
++/**
++ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
++ * @qlt: temporary
++ * @qli: inode's
++ *
++ * Locking is provided by the caller (depending on the context).
++ * After swap, @qli is inserted into the corresponding dq_ilink_list,
++ * @qlt list is reinitialized.
++ */
++static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt,
++		struct vz_quota_ilink *qli)
++{
++	struct vz_quota_master *qb;
++	struct vz_quota_ugid *qu;
++	int i;
++
++	qb = qlt->qmblk;
++	qlt->qmblk = qli->qmblk;
++	qli->qmblk = qb;
++	list_del_init(&qli->list);
++	if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD)
++		list_add(&qli->list, &qb->dq_ilink_list);
++	INIT_LIST_HEAD(&qlt->list);
++	set_qlnk_origin(qli, VZ_QUOTAO_SWAP);
++
++	for (i = 0; i < MAXQUOTAS; i++) {
++		qu = qlt->qugid[i];
++		qlt->qugid[i] = qli->qugid[i];
++		qli->qugid[i] = qu;
++	}
++}
++
++/**
++ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks
++ *
++ * Called under dcache_lock and inode_qmblk locks.
++ * Returns 1 if locks were dropped inside, 0 if atomic.
++ */
++static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
++		struct inode *inode)
++{
++	if (vzquota_qlnk_is_empty(qlnk))
++		return 0;
++	if (qlnk->qmblk == VZ_QUOTA_BAD) {
++		vzquota_qlnk_set_empty(qlnk);
++		set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK);
++		return 0;
++	}
++	spin_unlock(&dcache_lock);
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(qlnk);
++	vzquota_qlnk_init(qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	spin_lock(&dcache_lock);
++	return 1;
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
++ *
++ * Similar to vzquota_qlnk_reinit_locked, called under different locks.
++ */
++static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
++		struct inode *inode,
++		struct vz_quota_master *qmblk)
++{
++	if (vzquota_qlnk_is_empty(qlnk))
++		return 0;
++	/* may be optimized if qlnk->qugid all NULLs */
++	qmblk_data_write_unlock(qmblk);
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(qlnk);
++	vzquota_qlnk_init(qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	qmblk_data_write_lock(qmblk);
++	return 1;
++}
++#endif
++
++/**
++ * vzquota_qlnk_fill - fill vz_quota_ilink content
++ * @qlnk: vz_quota_ilink to fill
++ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
++ * @qmblk: qmblk to which this @qlnk will belong
++ *
++ * Called under dcache_lock and inode_qmblk locks.
++ * Returns 1 if locks were dropped inside, 0 if atomic.
++ * @qlnk is expected to be empty.
++ */
++static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
++		struct inode *inode,
++		struct vz_quota_master *qmblk)
++{
++	if (qmblk != VZ_QUOTA_BAD)
++		qmblk_get(qmblk);
++	qlnk->qmblk = qmblk;
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++	if (qmblk != VZ_QUOTA_BAD &&
++	    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
++	    (qmblk->dq_flags & VZDQUG_ON)) {
++		struct vz_quota_ugid *quid, *qgid;
++
++		spin_unlock(&dcache_lock);
++		inode_qmblk_unlock(inode->i_sb);
++
++		down(&qmblk->dq_sem);
++		quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
++		qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
++		up(&qmblk->dq_sem);
++
++		inode_qmblk_lock(inode->i_sb);
++		spin_lock(&dcache_lock);
++		qlnk->qugid[USRQUOTA] = quid;
++		qlnk->qugid[GRPQUOTA] = qgid;
++		return 1;
++	}
++#endif
++
++	return 0;
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
++ *
++ * This function is a helper for vzquota_transfer, and differs from
++ * vzquota_qlnk_fill only by locking.
++ */
++static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
++		struct inode *inode,
++		struct iattr *iattr,
++		int mask,
++		struct vz_quota_master *qmblk)
++{
++	qmblk_get(qmblk);
++	qlnk->qmblk = qmblk;
++
++	if (mask) {
++		struct vz_quota_ugid *quid, *qgid;
++
++		quid = qgid = NULL; /* to make gcc happy */
++		if (!(mask & (1 << USRQUOTA)))
++			quid = vzquota_get_ugid(INODE_QLNK(inode)->
++							qugid[USRQUOTA]);
++		if (!(mask & (1 << GRPQUOTA)))
++			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
++							qugid[GRPQUOTA]);
++
++		qmblk_data_write_unlock(qmblk);
++		inode_qmblk_unlock(inode->i_sb);
++
++		down(&qmblk->dq_sem);
++		if (mask & (1 << USRQUOTA))
++			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
++					USRQUOTA, 0);
++		if (mask & (1 << GRPQUOTA))
++			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
++					GRPQUOTA, 0);
++		up(&qmblk->dq_sem);
++
++		inode_qmblk_lock(inode->i_sb);
++		qmblk_data_write_lock(qmblk);
++		qlnk->qugid[USRQUOTA] = quid;
++		qlnk->qugid[GRPQUOTA] = qgid;
++		return 1;
++	}
++
++	return 0;
++}
++#endif
++
++/**
++ * __vzquota_inode_init - make sure inode's qlnk is initialized
++ *
++ * May be called if qlnk is already initialized, detects this situation itself.
++ * Called under inode_qmblk_lock.
++ */
++static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
++{
++	if (inode->i_dquot[USRQUOTA] == NODQUOT) {
++		vzquota_qlnk_init(INODE_QLNK(inode));
++		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT;
++	}
++	set_qlnk_origin(INODE_QLNK(inode), origin);
++}
++
++/**
++ * vzquota_inode_drop - destroy VZ quota information in the inode
++ *
++ * Inode must not be externally accessible or dirty.
++ */
++static void vzquota_inode_drop(struct inode *inode)
++{
++	struct vz_quota_ilink qlnk;
++
++	vzquota_qlnk_init(&qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
++	inode->i_dquot[USRQUOTA] = NODQUOT;
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&qlnk);
++}
++
++/**
++ * vzquota_inode_qmblk_set - initialize inode's qlnk
++ * @inode: inode to be initialized
++ * @qmblk: quota master block to which this inode should belong (may be BAD)
++ * @qlnk: placeholder to store data to resolve locking issues
++ *
++ * Returns 1 if locks were dropped and rechecks are possibly needed, 0 otherwise.
++ * Called under dcache_lock and inode_qmblk locks.
++ * @qlnk will be destroyed in the caller chain.
++ *
++ * It is not mandatory to restart parent checks since quota on/off currently
++ * shrinks the dentry tree and checks that there are no outside references.
++ * But if at some time that shrink is removed, restarts will be required.
++ * Additionally, the restarts prevent inconsistencies if the dentry tree
++ * changes (inode is moved).  This is not a big deal, but anyway...
++ */
++static int vzquota_inode_qmblk_set(struct inode *inode,
++		struct vz_quota_master *qmblk,
++		struct vz_quota_ilink *qlnk)
++{
++	if (qmblk == NULL) {
++		printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, "
++				"dev %s, inode %lu, fs %s\n",
++				INODE_QLNK(inode)->origin[0],
++				INODE_QLNK(inode)->origin[1],
++				inode->i_sb->s_id, inode->i_ino,
++				inode->i_sb->s_type->name);
++		printk(KERN_ERR "current %d (%s), VE %d\n",
++				current->pid, current->comm,
++				VEID(get_exec_env()));
++		dump_stack();
++		qmblk = VZ_QUOTA_BAD;
++	}
++	while (1) {
++		if (vzquota_qlnk_is_empty(qlnk) &&
++		    vzquota_qlnk_fill(qlnk, inode, qmblk))
++			return 1;
++		if (qlnk->qmblk == qmblk)
++			break;
++		if (vzquota_qlnk_reinit_locked(qlnk, inode))
++			return 1;
++	}
++	vzquota_qlnk_swap(qlnk, INODE_QLNK(inode));
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET);
++	return 0;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * vzquota_inode_qmblk (inode -> qmblk lookup) parts
++ *
++ * --------------------------------------------------------------------- */
++
++static int vzquota_dparents_check_attach(struct inode *inode)
++{
++	if (!list_empty(&inode->i_dentry))
++		return 0;
++	printk(KERN_ERR "VZDQ: no parent for "
++			"dev %s, inode %lu, fs %s\n",
++			inode->i_sb->s_id,
++			inode->i_ino,
++			inode->i_sb->s_type->name);
++	return -1;
++}
++
++static struct inode *vzquota_dparents_check_actual(struct inode *inode)
++{
++	struct dentry *de;
++
++	list_for_each_entry(de, &inode->i_dentry, d_alias) {
++		if (de->d_parent == de) /* detached dentry, perhaps */
++			continue;
++		/* first access to parent, make sure its qlnk initialized */
++		__vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT);
++		if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode))
++			return de->d_parent->d_inode;
++	}
++	return NULL;
++}
++
++static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode)
++{
++	struct dentry *de;
++	struct vz_quota_master *qmblk;
++
++	qmblk = NULL;
++	list_for_each_entry(de, &inode->i_dentry, d_alias) {
++		if (de->d_parent == de) /* detached dentry, perhaps */
++			continue;
++		if (qmblk == NULL) {
++			qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk;
++			continue;
++		}
++		if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) {
++			printk(KERN_WARNING "VZDQ: multiple quotas for "
++					"dev %s, inode %lu, fs %s\n",
++					inode->i_sb->s_id,
++					inode->i_ino,
++					inode->i_sb->s_type->name);
++			qmblk = VZ_QUOTA_BAD;
++			break;
++		}
++	}
++	if (qmblk == NULL) {
++		printk(KERN_WARNING "VZDQ: not attached to tree, "
++				"dev %s, inode %lu, fs %s\n",
++				inode->i_sb->s_id,
++				inode->i_ino,
++				inode->i_sb->s_type->name);
++		qmblk = VZ_QUOTA_BAD;
++	}
++	return qmblk;
++}
++
++static void vzquota_dbranch_actualize(struct inode *inode,
++		struct inode *refinode)
++{
++	struct inode *pinode;
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ilink qlnk;
++
++	vzquota_qlnk_init(&qlnk);
++
++start:
++	if (inode == inode->i_sb->s_root->d_inode) {
++		/* filesystem root */
++		atomic_inc(&inode->i_count);
++		do {
++			qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++		} while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk));
++		goto out;
++	}
++
++	if (!vzquota_dparents_check_attach(inode)) {
++		pinode = vzquota_dparents_check_actual(inode);
++		if (pinode != NULL) {
++			inode = pinode;
++			goto start;
++		}
++	}
++
++	atomic_inc(&inode->i_count);
++	while (1) {
++		if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */
++			break;
++		/*
++		 * Need to check parents again if we have slept inside
++		 * vzquota_inode_qmblk_set() in the loop.
++		 * If the state of parents is different, just return and repeat
++		 * the actualizing process again from the inode passed to
++		 * vzquota_inode_qmblk_recalc().
++		 */
++		if (!vzquota_dparents_check_attach(inode)) {
++			if (vzquota_dparents_check_actual(inode) != NULL)
++				break;
++			qmblk = vzquota_dparents_check_same(inode);
++		} else
++			qmblk = VZ_QUOTA_BAD;
++		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */
++			set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT);
++			break;
++		}
++	}
++
++out:
++	spin_unlock(&dcache_lock);
++	inode_qmblk_unlock(refinode->i_sb);
++	vzquota_qlnk_destroy(&qlnk);
++	iput(inode);
++	inode_qmblk_lock(refinode->i_sb);
++	spin_lock(&dcache_lock);
++}
++
++static void vzquota_dtree_qmblk_recalc(struct inode *inode,
++		struct vz_quota_ilink *qlnk)
++{
++	struct inode *pinode;
++	struct vz_quota_master *qmblk;
++
++	if (inode == inode->i_sb->s_root->d_inode) {
++		/* filesystem root */
++		do {
++			qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++		} while (vzquota_inode_qmblk_set(inode, qmblk, qlnk));
++		return;
++	}
++
++start:
++	if (VZ_QUOTA_IS_ACTUAL(inode))
++		return;
++	/*
++	 * Here qmblk is (re-)initialized for all ancestors.
++	 * This is not a very efficient procedure, but it guarantees that
++	 * the quota tree is consistent (that is, the inode doesn't have two
++	 * ancestors with different qmblk).
++	 */
++	if (!vzquota_dparents_check_attach(inode)) {
++		pinode = vzquota_dparents_check_actual(inode);
++		if (pinode != NULL) {
++			vzquota_dbranch_actualize(pinode, inode);
++			goto start;
++		}
++		qmblk = vzquota_dparents_check_same(inode);
++	} else
++		qmblk = VZ_QUOTA_BAD;
++
++	if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
++		goto start;
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE);
++}
++
++static void vzquota_det_qmblk_recalc(struct inode *inode,
++		struct vz_quota_ilink *qlnk)
++{
++	struct inode *parent;
++	struct vz_quota_master *qmblk;
++	char *msg;
++	int cnt;
++	time_t timeout;
++
++	cnt = 0;
++	parent = NULL;
++start:
++	/*
++	 * The qmblk of detached inodes shouldn't be considered not actual.
++	 * They are not in any dentry tree, so quota on/off shouldn't affect
++	 * them.
++	 */
++	if (!vzquota_qlnk_is_empty(INODE_QLNK(inode)))
++		return;
++
++	timeout = 3;
++	qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++	/*
++	 * Scenario:
++	 *	open
++	 *	unlink
++	 * 	quotaon
++	 *	generic_delete_inode
++	 *
++	 * This is the first time vzquota sees the inode.  The inode is
++	 * outside of the vzquota area of interest, otherwise quotaon would
++	 * have got -EBUSY due to shrink_dcache_parent().
++	 * The inode is almost completely destroyed, so don't intervene.
++	 * 
++	 * dev@:
++	 * However, there is a small race here...
++	 * dput() first removes itself from all the lists,
++	 * so shrink_dcache_parent() can succeed while dentry_iput is not
++	 * done yet.
++	 */
++	if (inode->i_state & I_FREEING)
++		goto set;
++
++	msg = "detached inode not in creation";
++	if (inode->i_op != VZ_QUOTA_EMPTY_IOPS)
++		goto fail;
++	qmblk = VZ_QUOTA_BAD;
++	msg = "unexpected creation context";
++	if (!vzquota_cur_qmblk_check())
++		goto fail;
++	timeout = 0;
++	parent = vzquota_cur_qmblk_fetch();
++	msg = "uninitialized parent";
++	if (vzquota_qlnk_is_empty(INODE_QLNK(parent)))
++		goto fail;
++	msg = "parent not in tree";
++	if (list_empty(&parent->i_dentry))
++		goto fail;
++	msg = "parent has 0 refcount";
++	if (!atomic_read(&parent->i_count))
++		goto fail;
++	msg = "parent has different sb";
++	if (parent->i_sb != inode->i_sb)
++		goto fail;
++	if (!VZ_QUOTA_IS_ACTUAL(parent)) {
++		vzquota_dbranch_actualize(parent, inode);
++		goto start;
++	}
++
++	qmblk = INODE_QLNK(parent)->qmblk;
++set:
++	if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
++		goto start;
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET);
++	return;
++
++fail:
++	{
++		struct timeval tv, tvo;
++		do_gettimeofday(&tv);
++		memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo));
++		tv.tv_sec -= tvo.tv_sec;
++		if (tv.tv_usec < tvo.tv_usec) {
++			tv.tv_sec--;
++			tv.tv_usec += USEC_PER_SEC - tvo.tv_usec;
++		} else
++			tv.tv_usec -= tvo.tv_usec;
++		if (tv.tv_sec < timeout)
++			goto set;
++		printk(KERN_ERR "VZDQ: %s, orig {%u, %u},"
++			" dev %s, inode %lu, fs %s\n",
++			msg,
++			INODE_QLNK(inode)->origin[0],
++			INODE_QLNK(inode)->origin[1],
++			inode->i_sb->s_id, inode->i_ino,
++			inode->i_sb->s_type->name);
++		printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count));
++		printk(KERN_ERR "i_mode %o, ", inode->i_mode);
++		printk(KERN_ERR "i_state %lx, ", inode->i_state);
++		printk(KERN_ERR "i_flags %x\n", inode->i_flags);
++		printk(KERN_ERR "i_op %p, vfs_empty_iops %p, "
++				"i_fop %p, i_mapping %p\n",
++				inode->i_op, &vfs_empty_iops,
++				inode->i_fop, inode->i_mapping);
++		if (!cnt++) {
++			printk(KERN_ERR "current %d (%s), VE %d,"
++				" time %ld.%06ld\n",
++				current->pid, current->comm,
++				VEID(get_exec_env()),
++				tv.tv_sec, (long)tv.tv_usec);
++			dump_stack();
++		}
++		if (parent != NULL)
++			printk(KERN_ERR "VZDQ: parent of %lu is %lu\n",
++				inode->i_ino, parent->i_ino);
++	}
++	goto set;
++}
++
++static void vzquota_inode_qmblk_recalc(struct inode *inode,
++		struct vz_quota_ilink *qlnk)
++{
++	spin_lock(&dcache_lock);
++	if (!list_empty(&inode->i_dentry))
++		vzquota_dtree_qmblk_recalc(inode, qlnk);
++	else
++		vzquota_det_qmblk_recalc(inode, qlnk);
++	spin_unlock(&dcache_lock);
++}
++
++/**
++ * vzquota_inode_qmblk - obtain inode's qmblk
++ *
++ * Returns qmblk with refcounter taken, %NULL if not under
++ * VZ quota or %VZ_QUOTA_BAD.
++ *
++ * FIXME: This function should be removed when vzquota_find_qmblk /
++ * get_quota_root / vzquota_dstat code is cleaned up.
++ */
++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ilink qlnk;
++
++	might_sleep();
++
++	if (inode->i_sb->dq_op != &vz_quota_operations)
++		return NULL;
++#if defined(VZ_QUOTA_UNLOAD)
++#error Make sure qmblk does not disappear
++#endif
++
++	vzquota_qlnk_init(&qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++	if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++	    !VZ_QUOTA_IS_ACTUAL(inode))
++		vzquota_inode_qmblk_recalc(inode, &qlnk);
++
++	qmblk = INODE_QLNK(inode)->qmblk;
++	if (qmblk != VZ_QUOTA_BAD) {
++		if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb))
++			qmblk_get(qmblk);
++		else
++			qmblk = NULL;
++	}
++
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&qlnk);
++	return qmblk;
++}
++
++/**
++ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems
++ *
++ * This function finds a quota master block corresponding to the root of
++ * a virtual filesystem.
++ * Returns a quota master block with reference taken, or %NULL if not under
++ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation
++ * operations will fail).
++ *
++ * Note: this function uses vzquota_inode_qmblk().
++ * The latter is a rather confusing function: it returns qmblk that used to be
++ * on the inode some time ago (without guarantee that it still has any
++ * relations to the inode).  So, vzquota_find_qmblk() leaves it up to the
++ * caller to think whether the inode could have changed its qmblk and what to
++ * do in that case.
++ * Currently, the callers appear to not care :(
++ */
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb)
++{
++	struct inode *qrinode;
++	struct vz_quota_master *qmblk;
++
++	qmblk = NULL;
++	qrinode = NULL;
++	if (sb->s_op->get_quota_root != NULL)
++		qrinode = sb->s_op->get_quota_root(sb);
++	if (qrinode != NULL)
++		qmblk = vzquota_inode_qmblk(qrinode);
++	return qmblk;
++}
++
++/* ----------------------------------------------------------------------
++ *
++ * Calls from quota operations
++ *
++ * --------------------------------------------------------------------- */
++
++/**
++ * vzquota_inode_init_call - call from DQUOT_INIT
++ */
++void vzquota_inode_init_call(struct inode *inode)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++
++	/* initializes inode's quota inside */
++	qmblk = vzquota_inode_data(inode, &data);
++	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++		vzquota_data_unlock(inode, &data);
++
++	/*
++	 * The check is needed for repeated new_inode() calls from a single
++	 * ext3 call like create or mkdir in case of -ENOSPC.
++	 */
++	spin_lock(&dcache_lock);
++	if (!list_empty(&inode->i_dentry))
++		vzquota_cur_qmblk_set(inode);
++	spin_unlock(&dcache_lock);
++}
++
++/**
++ * vzquota_inode_drop_call - call from DQUOT_DROP
++ */
++void vzquota_inode_drop_call(struct inode *inode)
++{
++	vzquota_inode_drop(inode);
++}
++
++/**
++ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs
++ * @inode: the inode
++ * @data: storage space
++ *
++ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk.
++ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD:
++ *   qmblk in inode's qlnk is the same as returned,
++ *   ugid pointers inside inode's qlnk are valid,
++ *   some locks are taken (and should be released by vzquota_data_unlock).
++ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken.
++ */
++struct vz_quota_master *vzquota_inode_data(struct inode *inode,
++		struct vz_quota_datast *data)
++{
++	struct vz_quota_master *qmblk;
++
++	might_sleep();
++
++	vzquota_qlnk_init(&data->qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	if (unlikely(inode->i_flags & S_NOQUOTA)) {
++		inode_qmblk_unlock(inode->i_sb);
++		return NULL;
++	}
++	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++	if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++	    !VZ_QUOTA_IS_ACTUAL(inode))
++		vzquota_inode_qmblk_recalc(inode, &data->qlnk);
++
++	qmblk = INODE_QLNK(inode)->qmblk;
++	if (qmblk != VZ_QUOTA_BAD) {
++		if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) {
++			/*
++			 * Note that in the current implementation,
++			 * inode_qmblk_lock can theoretically be dropped here.
++			 * This place is serialized with quota_off because
++			 * quota_off fails when there are extra dentry
++			 * references and syncs inodes before removing quota
++			 * information from them.
++			 * However, quota usage information should stop being
++			 * updated immediately after vzquota_off.
++			 */
++			qmblk_data_write_lock(qmblk);
++		} else {
++			inode_qmblk_unlock(inode->i_sb);
++			qmblk = NULL;
++		}
++	} else {
++		inode_qmblk_unlock(inode->i_sb);
++	}
++	return qmblk;
++}
++
++void vzquota_data_unlock(struct inode *inode,
++		struct vz_quota_datast *data)
++{
++	qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk);
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&data->qlnk);
++}
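++
++/*
++ * Typical caller pattern for the pair above (a sketch; compare
++ * vzquota_inode_init_call() above):
++ *
++ *	struct vz_quota_datast data;
++ *	struct vz_quota_master *qmblk;
++ *
++ *	qmblk = vzquota_inode_data(inode, &data);
++ *	if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) {
++ *		... update usage under the locks taken above ...
++ *		vzquota_data_unlock(inode, &data);
++ *	}
++ */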
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_inode_transfer_call - call from vzquota_transfer
++ */
++int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_datast data;
++	struct vz_quota_ilink qlnew;
++	int mask;
++	int ret;
++
++	might_sleep();
++	vzquota_qlnk_init(&qlnew);
++start:
++	qmblk = vzquota_inode_data(inode, &data);
++	ret = NO_QUOTA;
++	if (qmblk == VZ_QUOTA_BAD)
++		goto out_destr;
++	ret = QUOTA_OK;
++	if (qmblk == NULL)
++		goto out_destr;
++	qmblk_get(qmblk);
++
++	if (!(qmblk->dq_flags & VZDQUG_ON))
++		/* no ugid quotas */
++		goto out_unlock;
++
++	mask = 0;
++	if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid)
++		mask |= 1 << USRQUOTA;
++	if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid)
++		mask |= 1 << GRPQUOTA;
++	while (1) {
++		if (vzquota_qlnk_is_empty(&qlnew) &&
++		    vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk))
++			break;
++		if (qlnew.qmblk == INODE_QLNK(inode)->qmblk &&
++		    qlnew.qmblk == qmblk)
++			goto finish;
++		if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk))
++			break;
++	}
++
++	/* prepare for restart */
++	vzquota_data_unlock(inode, &data);
++	qmblk_put(qmblk);
++	goto start;
++
++finish:
++	/* all references obtained successfully */
++	ret = vzquota_transfer_usage(inode, mask, &qlnew);
++	if (!ret) {
++		vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode));
++		set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS);
++	}
++out_unlock:
++	vzquota_data_unlock(inode, &data);
++	qmblk_put(qmblk);
++out_destr:
++	vzquota_qlnk_destroy(&qlnew);
++	return ret;
++}
++#endif
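++
++/*
++ * Sketch of the expected caller mapping (assuming the standard Linux
++ * quota convention that a NO_QUOTA result is reported as -EDQUOT):
++ *
++ *	if (vzquota_inode_transfer_call(inode, iattr) == NO_QUOTA)
++ *		return -EDQUOT;
++ */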
++
++int vzquota_rename_check(struct inode *inode,
++		struct inode *old_dir, struct inode *new_dir)
++{
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ilink qlnk1, qlnk2, qlnk3;
++	int c, ret;
++
++	if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb)
++		return -1;
++
++	might_sleep();
++
++	vzquota_qlnk_init(&qlnk1);
++	vzquota_qlnk_init(&qlnk2);
++	vzquota_qlnk_init(&qlnk3);
++	inode_qmblk_lock(inode->i_sb);
++	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++	__vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL);
++	__vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL);
++
++	do {
++		c = 0;
++		if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++		    !VZ_QUOTA_IS_ACTUAL(inode)) {
++			vzquota_inode_qmblk_recalc(inode, &qlnk1);
++			c++;
++		}
++		if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
++		    !VZ_QUOTA_IS_ACTUAL(new_dir)) {
++			vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
++			c++;
++		}
++	} while (c);
++
++	ret = 0;
++	qmblk = INODE_QLNK(inode)->qmblk;
++	if (qmblk != INODE_QLNK(new_dir)->qmblk) {
++		ret = -1;
++		while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) ||
++		       !VZ_QUOTA_IS_ACTUAL(old_dir))
++			vzquota_inode_qmblk_recalc(old_dir, &qlnk3);
++		if (qmblk != VZ_QUOTA_BAD &&
++		    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
++		    qmblk->dq_root_path.dentry->d_inode == inode &&
++		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
++			    				inode->i_sb) &&
++		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
++			    				inode->i_sb))
++			/* quota root rename is allowed */
++			ret = 0;
++	}
++
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&qlnk3);
++	vzquota_qlnk_destroy(&qlnk2);
++	vzquota_qlnk_destroy(&qlnk1);
++	return ret;
++}
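++
++/*
++ * Illustrative use in a rename path (the exact error code is an
++ * assumption; a nonzero result means the rename crosses quota trees):
++ *
++ *	if (vzquota_rename_check(inode, old_dir, new_dir))
++ *		return -EXDEV;
++ */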
++
++/*
++ * Scan the parent's subdirectories and record the paths of busy dentries.
++ * @parent: parent dentry
++ * @buf: buffer to store the paths
++ * @buflen: size of @buf
++ */
++static void vzdquota_read_busy_dentries(struct path *parent,
++		char *buf, int buflen)
++{
++	struct dentry *this_parent = parent->dentry;
++	struct list_head *next;
++	char *res, *end, *start;
++	struct path root, path;
++	int len;
++
++	if (!buf || buflen <= 0)
++		return;
++
++	path.mnt = parent->mnt;
++	/* From d_path() ... */
++	read_lock(&current->fs->lock);
++	path_get(&current->fs->root);
++	root = current->fs->root;
++	read_unlock(&current->fs->lock);
++
++	spin_lock(&dcache_lock);
++
++	end = buf + buflen;
++	start = buf;
++repeat:
++	next = this_parent->d_subdirs.next;
++resume:
++	while (next != &this_parent->d_subdirs) {
++		struct list_head *tmp = next;
++		struct dentry *dentry;
++		int subdirs;
++
++		dentry = list_entry(tmp, struct dentry, d_u.d_child);
++		next = tmp->next;
++		subdirs = !list_empty(&dentry->d_subdirs); 
++
++		if (atomic_read(&dentry->d_count) && !subdirs) {
++			if (!buflen)
++				goto out;
++			/*
++			 * Note: __d_path will store filename at the
++			 * end of buf.
++			 */
++			path.dentry = dentry;
++			res = __d_path(&path, &root, buf, buflen);
++			/* Exit if name is too long */
++			if (IS_ERR(res))
++				goto out;
++
++			/*
++			 * Move the string obtained by __d_path to just
++			 * after the previously stored path in buf.
++			 */
++			len = end - res;
++			BUG_ON(len <= 0);
++
++			memmove(buf, res, len);
++
++			/* Trick: replace \0 by \n */
++			if (buf != start)
++				*(char *)(buf - 1) = '\n';
++
++			buf += len;
++			buflen -= len;
++		}
++
++		/*
++		 * Descend a level if the d_subdirs list is non-empty.
++		 */
++		if (subdirs) {
++			this_parent = dentry;
++			goto repeat;
++		}
++	}
++	/*
++	 * All done at this level ... ascend and resume the search.
++	 */
++	if (this_parent != parent->dentry) {
++		next = this_parent->d_u.d_child.next;
++		this_parent = this_parent->d_parent;
++		goto resume;
++	}
++out:
++	/* From d_path() ... */
++	spin_unlock(&dcache_lock);
++	path_put(&root);
++}
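++
++/*
++ * The buffer filled above ends up holding newline-separated paths of
++ * busy dentries, e.g. (paths purely illustrative):
++ *
++ *	/vz/root/101/var/run/some.lock
++ *	/vz/root/101/home/user/busy.file
++ */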
++
++/* ----------------------------------------------------------------------
++ *
++ * qmblk-related parts of on/off operations
++ *
++ * --------------------------------------------------------------------- */
++
++/**
++ * vzquota_check_dtree - check dentry tree if quota on/off is allowed
++ *
++ * This function doesn't allow quota to be turned on/off if some dentries in
++ * the tree have external references.
++ * In addition to technical reasons, it enforces user-space correctness:
++ * current usage (taken from or reported to the user space) can be meaningful
++ * and accurate only if the tree is not being modified.
++ * Side effect: additional vfsmount structures referencing the tree (bind
++ * mounts of tree nodes to some other places) are not allowed at on/off time.
++ *
++ * If the vzquota_off ioctl fails, the paths of busy dentries are stored
++ * into @buf (if one was passed).
++ */
++int vzquota_check_dtree(struct vz_quota_master *qmblk, int off,
++						char *buf, int buflen)
++{
++	struct dentry *dentry;
++	int err, count;
++
++	err = -EBUSY;
++	dentry = qmblk->dq_root_path.dentry;
++
++	if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root)
++		goto unhashed;
++
++	/* attempt to shrink */
++	if (!list_empty(&dentry->d_subdirs)) {
++		spin_unlock(&dcache_lock);
++		inode_qmblk_unlock(dentry->d_sb);
++		shrink_dcache_parent(dentry);
++		inode_qmblk_lock(dentry->d_sb);
++		spin_lock(&dcache_lock);
++		if (!list_empty(&dentry->d_subdirs)) {
++			spin_unlock(&dcache_lock);
++			vzdquota_read_busy_dentries(&qmblk->dq_root_path,
++								buf, buflen);
++			spin_lock(&dcache_lock);
++			goto out;
++		}
++
++		count = 1;
++		if (dentry == dentry->d_sb->s_root)
++			count += 2;	/* sb and mnt refs */
++		if (atomic_read(&dentry->d_count) < count) {
++			printk(KERN_ERR "%s: too small count %d vs %d.\n",
++					__FUNCTION__,
++					atomic_read(&dentry->d_count), count);
++			goto out;
++		}
++		if (atomic_read(&dentry->d_count) > count)
++			goto out;
++	}
++
++	err = 0;
++out:
++	return err;
++
++unhashed:
++	/*
++	 * The quota root has been removed.
++	 * Allow quota to be turned off, but not on.
++	 */
++	if (off)
++		err = 0;
++	goto out;
++}
++
++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
++		struct vz_quota_master *qmblk, char __user *ubuf)
++{
++	struct vz_quota_ilink qlnk;
++	struct vz_quota_master *qold, *qnew;
++	int err;
++	char *buf;
++
++	buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
++
++	might_sleep();
++
++	qold = NULL;
++	qnew = vzquota_alloc_fake();
++	if (qnew == NULL) {
++		free_page((unsigned long)buf);
++		return -ENOMEM;
++	}
++
++	vzquota_qlnk_init(&qlnk);
++	inode_qmblk_lock(sb);
++	__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++	spin_lock(&dcache_lock);
++	while (1) {
++		err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE);
++		if (err)
++			break;
++		if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk))
++			break;
++	}
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON);
++	spin_unlock(&dcache_lock);
++
++	if (!err) {
++		qold = __VZ_QUOTA_NOQUOTA(sb);
++		qold->dq_flags |= VZDQ_NOACT;
++		__VZ_QUOTA_NOQUOTA(sb) = qnew;
++	}
++
++	inode_qmblk_unlock(sb);
++	vzquota_qlnk_destroy(&qlnk);
++	if (qold != NULL)
++		qmblk_put(qold);
++
++	if (buf) {
++		/* best-effort diagnostics: copy_to_user() failure is ignored */
++		if (copy_to_user(ubuf, buf, PAGE_SIZE))
++			;
++		free_page((unsigned long)buf);
++	}
++	return err;
++}
++
++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
++						char __user *ubuf, int force)
++{
++	int ret;
++	char *buf;
++
++	buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
++
++	ret = 0;
++	inode_qmblk_lock(sb);
++
++	spin_lock(&dcache_lock);
++	if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force)
++		ret = -EBUSY;
++	spin_unlock(&dcache_lock);
++
++	if (!ret)
++		qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT;
++	inode_qmblk_unlock(sb);
++
++	if (buf) {
++		/* best-effort diagnostics: copy_to_user() failure is ignored */
++		if (copy_to_user(ubuf, buf, PAGE_SIZE))
++			;
++		free_page((unsigned long)buf);
++	}
++	return ret;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * External interfaces
++ *
++ * ---------------------------------------------------------------------*/
++
++static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	switch (cmd) {
++	case VZCTL_QUOTA_NEW_CTL: {
++		struct vzctl_quotactl qb;
++
++		err = -EFAULT;
++		if (copy_from_user(&qb, (void __user *)arg, sizeof(qb)))
++			break;
++		err = do_vzquotactl(qb.cmd, qb.quota_id,
++				qb.qstat, qb.ve_root, 0);
++		break;
++	}
++#ifdef CONFIG_VZ_QUOTA_UGID
++	case VZCTL_QUOTA_UGID_CTL: {
++		struct vzctl_quotaugidctl qub;
++
++		err = -EFAULT;
++		if (copy_from_user(&qub, (void __user *)arg, sizeof(qub)))
++			break;
++		err = do_vzquotaugidctl(qub.cmd, qub.quota_id,
++				qub.ugid_index, qub.ugid_size, qub.addr, 0);
++		break;
++	}
++#endif
++	default:
++		err = -ENOTTY;
++	}
++	return err;
++}
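++
++/*
++ * A hedged userspace sketch of driving this ioctl; the device node name
++ * is an assumption, the structure follows the definition used above:
++ *
++ *	struct vzctl_quotactl qb = { .cmd = cmd, .quota_id = id, ... };
++ *	int fd = open("/dev/vzctl", O_RDWR);
++ *
++ *	ioctl(fd, VZCTL_QUOTA_NEW_CTL, &qb);
++ */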
++
++#ifdef CONFIG_COMPAT
++static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	switch (cmd) {
++	case VZCTL_COMPAT_QUOTA_CTL: {
++		struct compat_vzctl_quotactl cs;
++
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
++			break;
++		err = do_vzquotactl(cs.cmd, cs.quota_id,
++				compat_ptr(cs.qstat),
++				compat_ptr(cs.ve_root), 1);
++		break;
++	}
++#ifdef CONFIG_VZ_QUOTA_UGID
++	case VZCTL_COMPAT_QUOTA_UGID_CTL: {
++		struct compat_vzctl_quotaugidctl cs;
++
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
++			break;
++
++		err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index,
++				cs.ugid_size, compat_ptr(cs.addr), 1);
++		break;
++	}
++#endif
++	default:
++		err = -ENOIOCTLCMD;
++	}
++	return err;
++}
++#endif
++
++static struct vzioctlinfo vzdqcalls = {
++	.type		= VZDQCTLTYPE,
++	.ioctl		= vzquota_ioctl,
++#ifdef CONFIG_COMPAT
++	.compat_ioctl	= compat_vzquota_ioctl,
++#endif
++	.owner		= THIS_MODULE,
++};
++
++/**
++ * vzquota_dstat - get quota usage info for virtual superblock
++ */
++static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat)
++{
++	struct vz_quota_master *qmblk;
++
++	qmblk = vzquota_find_qmblk(super);
++	if (qmblk == NULL)
++		return -ENOENT;
++	if (qmblk == VZ_QUOTA_BAD) {
++		memset(qstat, 0, sizeof(*qstat));
++		return 0;
++	}
++
++	qmblk_data_read_lock(qmblk);
++	memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat));
++	qmblk_data_read_unlock(qmblk);
++	qmblk_put(qmblk);
++	return 0;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Init/exit helpers
++ *
++ * ---------------------------------------------------------------------*/
++
++static int vzquota_cache_init(void)
++{
++	int i;
++
++	vzquota_cachep = kmem_cache_create("vz_quota_master",
++					 sizeof(struct vz_quota_master),
++					 0, SLAB_HWCACHE_ALIGN, NULL);
++	if (vzquota_cachep == NULL) {
++		printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
++		goto nomem2;
++	}
++	for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
++		INIT_LIST_HEAD(&vzquota_hash_table[i]);
++
++	return 0;
++
++nomem2:
++	return -ENOMEM;
++}
++
++static void vzquota_cache_release(void)
++{
++	int i;
++
++	/* sanity check */
++	for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
++		if (!list_empty(&vzquota_hash_table[i]))
++			BUG();
++
++	/* release caches */
++	kmem_cache_destroy(vzquota_cachep);
++	vzquota_cachep = NULL;
++}
++
++static int quota_notifier_call(struct vnotifier_block *self,
++		unsigned long n, void *data, int err)
++{
++	struct virt_info_quota *viq;
++	struct super_block *sb;
++
++	viq = (struct virt_info_quota *)data;
++	switch (n) {
++	case VIRTINFO_QUOTA_ON:
++		err = NOTIFY_BAD;
++		if (!try_module_get(THIS_MODULE))
++			break;
++		sb = viq->super;
++		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_OFF:
++		module_put(THIS_MODULE);
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_GETSTAT:
++		err = NOTIFY_BAD;
++		if (vzquota_dstat(viq->super, viq->qstat))
++			break;
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_DISABLE:
++		err = NOTIFY_OK;
++		vzquota_inode_off((struct inode *)data);
++		break;
++	}
++	return err;
++}
++
++struct vnotifier_block quota_notifier_block = {
++	.notifier_call = quota_notifier_call,
++	.priority = INT_MAX,
++};
++
++/* ----------------------------------------------------------------------
++ *
++ * Init/exit procedures
++ *
++ * ---------------------------------------------------------------------*/
++
++static int __init vzquota_init(void)
++{
++	int err;
++
++	if ((err = vzquota_cache_init()) != 0)
++		goto out_cache;
++
++	if ((err = vzquota_proc_init()) != 0)
++		goto out_proc;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++	if ((err = vzquota_ugid_init()) != 0)
++		goto out_ugid;
++#endif
++
++	init_MUTEX(&vz_quota_sem);
++	vzioctl_register(&vzdqcalls);
++	virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
++#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
++	vzaquota_init();
++#endif
++
++	return 0;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++out_ugid:
++	vzquota_proc_release();
++#endif
++out_proc:
++	vzquota_cache_release();
++out_cache:
++	return err;
++}
++
++#if defined(VZ_QUOTA_UNLOAD)
++static void __exit vzquota_release(void)
++{
++	virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
++	vzioctl_unregister(&vzdqcalls);
++#ifdef CONFIG_VZ_QUOTA_UGID
++#ifdef CONFIG_PROC_FS
++	vzaquota_fini();
++#endif
++	vzquota_ugid_release();
++#endif
++	vzquota_proc_release();
++	vzquota_cache_release();
++}
++#endif
++
++MODULE_AUTHOR("SWsoft <info at sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Disk Quota");
++MODULE_LICENSE("GPL v2");
++
++module_init(vzquota_init)
++#if defined(VZ_QUOTA_UNLOAD)
++module_exit(vzquota_release)
++#endif
+diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h
+index c73b878..849dbe9 100644
+--- a/include/asm-ia64/mman.h
++++ b/include/asm-ia64/mman.h
+@@ -18,6 +18,7 @@
+ #define MAP_NORESERVE	0x04000		/* don't check for reservations */
+ #define MAP_POPULATE	0x08000		/* populate (prefault) pagetables */
+ #define MAP_NONBLOCK	0x10000		/* do not block on IO */
++#define MAP_EXECPRIO	0x20000		/* soft ubc charge */
+ 
+ #define MCL_CURRENT	1		/* lock all current mappings */
+ #define MCL_FUTURE	2		/* lock all future mappings */
+diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
+index b9ac1a6..9504729 100644
+--- a/include/asm-ia64/pgalloc.h
++++ b/include/asm-ia64/pgalloc.h
+@@ -20,11 +20,13 @@
+ #include <linux/threads.h>
+ #include <linux/quicklist.h>
+ 
++#include <bc/kmem.h>
++
+ #include <asm/mmu_context.h>
+ 
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+-	return quicklist_alloc(0, GFP_KERNEL, NULL);
++	return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL);
+ }
+ 
+ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+@@ -41,7 +43,7 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
+ 
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+-	return quicklist_alloc(0, GFP_KERNEL, NULL);
++	return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL);
+ }
+ 
+ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+@@ -59,7 +61,7 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
+ 
+ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+-	return quicklist_alloc(0, GFP_KERNEL, NULL);
++	return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL);
+ }
+ 
+ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+@@ -87,7 +89,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr)
+ 	struct page *page;
+ 	void *pg;
+ 
+-	pg = quicklist_alloc(0, GFP_KERNEL, NULL);
++	pg = quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL);
+ 	if (!pg)
+ 		return NULL;
+ 	page = virt_to_page(pg);
+diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
+index 6aff126..c148ef8 100644
+--- a/include/asm-ia64/processor.h
++++ b/include/asm-ia64/processor.h
+@@ -361,7 +361,7 @@ struct thread_struct {
+ 	regs->loadrs = 0;									\
+ 	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
+ 	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
+-	if (unlikely(!get_dumpable(current->mm))) {							\
++	if (unlikely(!get_dumpable(current->mm) || !current->mm->vps_dumpable)) {		\
+ 		/*										\
+ 		 * Zap scratch regs to avoid leaking bits between processes with different	\
+ 		 * uid/privileges.								\
+diff --git a/include/asm-ia64/timex.h b/include/asm-ia64/timex.h
+index 05a6baf..b2dc7e7 100644
+--- a/include/asm-ia64/timex.h
++++ b/include/asm-ia64/timex.h
+@@ -10,6 +10,7 @@
+  *			Also removed cacheflush_time as it's entirely unused.
+  */
+ 
++#ifdef __KERNEL__
+ #include <asm/intrinsics.h>
+ #include <asm/processor.h>
+ 
+@@ -39,4 +40,8 @@ get_cycles (void)
+ 	return ret;
+ }
+ 
++extern unsigned int cpu_khz;
++
++#endif
++
+ #endif /* _ASM_IA64_TIMEX_H */
+diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
+index e603147..f5af201 100644
+--- a/include/asm-ia64/unistd.h
++++ b/include/asm-ia64/unistd.h
+@@ -302,6 +302,16 @@
+ #define __NR_timerfd_create		1310
+ #define __NR_timerfd_settime		1311
+ #define __NR_timerfd_gettime		1312
++#define __NR_fairsched_vcpus		1499
++#define __NR_fairsched_mknod		1500
++#define __NR_fairsched_rmnod		1501
++#define __NR_fairsched_chwt		1502
++#define __NR_fairsched_mvpr		1503
++#define __NR_fairsched_rate		1504
++#define __NR_getluid			1505
++#define __NR_setluid			1506
++#define __NR_setublimit			1507
++#define __NR_ubstat			1508
+ 
+ #ifdef __KERNEL__
+ 
+diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h
+index 24cf664..0d4a60f 100644
+--- a/include/asm-powerpc/mman.h
++++ b/include/asm-powerpc/mman.h
+@@ -23,5 +23,6 @@
+ 
+ #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
+ #define MAP_NONBLOCK	0x10000		/* do not block on IO */
+#define MAP_EXECPRIO	0x20000		/* do soft ubc charge */
+ 
+ #endif	/* _ASM_POWERPC_MMAN_H */
+diff --git a/include/asm-powerpc/pgalloc-64.h b/include/asm-powerpc/pgalloc-64.h
+index 6898099..49075a6 100644
+--- a/include/asm-powerpc/pgalloc-64.h
++++ b/include/asm-powerpc/pgalloc-64.h
+@@ -26,7 +26,8 @@ extern struct kmem_cache *pgtable_cache[];
+ 
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+-	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
++	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM],
++			GFP_KERNEL_UBC | __GFP_SOFT_UBC);
+ }
+ 
+ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+@@ -42,7 +43,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+ 	return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+-				GFP_KERNEL|__GFP_REPEAT);
++				GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT);
+ }
+ 
+ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+@@ -88,10 +89,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+ 	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+ }
+ 
++static inline pte_t *do_pte_alloc(gfp_t flags)
++{
++	return (pte_t *)__get_free_page(flags);
++}
++
+ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+ 					  unsigned long address)
+ {
+-        return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
++        return do_pte_alloc(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
+ }
+ 
+ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+@@ -100,7 +106,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+ 	struct page *page;
+ 	pte_t *pte;
+ 
+-	pte = pte_alloc_one_kernel(mm, address);
++	pte = do_pte_alloc(GFP_KERNEL_UBC | __GFP_REPEAT | __GFP_ZERO);
+ 	if (!pte)
+ 		return NULL;
+ 	page = virt_to_page(pte);
+diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h
+index ae7085c..7ad02b9 100644
+--- a/include/asm-powerpc/systbl.h
++++ b/include/asm-powerpc/systbl.h
+@@ -316,3 +316,19 @@ COMPAT_SYS(fallocate)
+ SYSCALL(subpage_prot)
+ COMPAT_SYS_SPU(timerfd_settime)
+ COMPAT_SYS_SPU(timerfd_gettime)
++SYS_SKIP(313, 400)
++SYSCALL(ni_syscall)
++SYS_SKIP_END()
++SYSCALL(fairsched_mknod) /* 400 */
++SYSCALL(fairsched_rmnod)
++SYSCALL(fairsched_chwt)
++SYSCALL(fairsched_mvpr)
++SYSCALL(fairsched_rate)
++SYSCALL(fairsched_vcpus)
++SYS_SKIP(406, 410)
++SYSCALL(ni_syscall)
++SYS_SKIP_END()
++SYSCALL(getluid) /* 410 */
++SYSCALL(setluid)
++SYSCALL(setublimit)
++SYSCALL(ubstat)
+diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
+index ce91bb6..dac5902 100644
+--- a/include/asm-powerpc/unistd.h
++++ b/include/asm-powerpc/unistd.h
+@@ -336,9 +336,14 @@
+ #define __NR_timerfd_settime	311
+ #define __NR_timerfd_gettime	312
+ 
++#define __NR_getluid            410
++#define __NR_setluid            411
++#define __NR_setublimit         412
++#define __NR_ubstat             413
++
+ #ifdef __KERNEL__
+ 
+-#define __NR_syscalls		313
++#define __NR_syscalls		414
+ 
+ #define __NR__exit __NR_exit
+ #define NR_syscalls	__NR_syscalls
+diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h
+index d2ae67c..51103a7 100644
+--- a/include/asm-sparc64/mman.h
++++ b/include/asm-sparc64/mman.h
+@@ -20,6 +20,7 @@
+ 
+ #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
+ #define MAP_NONBLOCK	0x10000		/* do not block on IO */
++#define MAP_EXECPRIO	0x20000		/* do soft ubc charge */
+ 
+ #ifdef __KERNEL__
+ #ifndef __ASSEMBLY__
+diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
+index 326de10..f556b52 100644
+--- a/include/asm-sparc64/pgalloc.h
++++ b/include/asm-sparc64/pgalloc.h
+@@ -16,7 +16,7 @@
+ 
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+-	return quicklist_alloc(0, GFP_KERNEL, NULL);
++	return quicklist_alloc(0, GFP_KERNEL_UBC, NULL);
+ }
+ 
+ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+@@ -28,7 +28,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ 
+ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+-	return quicklist_alloc(0, GFP_KERNEL, NULL);
++	return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_REPEAT, NULL);
+ }
+ 
+ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+@@ -48,7 +48,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+ 	struct page *page;
+ 	void *pg;
+ 
+-	pg = quicklist_alloc(0, GFP_KERNEL, NULL);
++	pg = quicklist_alloc(0, GFP_KERNEL_UBC, NULL);
+ 	if (!pg)
+ 		return NULL;
+ 	page = virt_to_page(pg);
+diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h
+index e5873e3..4d21580 100644
+--- a/include/asm-sparc64/thread_info.h
++++ b/include/asm-sparc64/thread_info.h
+@@ -161,14 +161,14 @@ register struct thread_info *current_thread_info_reg asm("g6");
+ 	struct thread_info *ret;				\
+ 								\
+ 	ret = (struct thread_info *)				\
+-	  __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER);	\
++	  __get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER);\
+ 	if (ret)						\
+ 		memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER);	\
+ 	ret;							\
+ })
+ #else
+ #define alloc_thread_info(tsk) \
+-	((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER))
++	((struct thread_info *)__get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER))
+ #endif
+ 
+ #define free_thread_info(ti) \
+@@ -235,6 +235,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
+ #define TIF_ABI_PENDING		12
+ #define TIF_MEMDIE		13
+ #define TIF_POLLING_NRFLAG	14
++#define TIF_FREEZE		15	/* Freeze request (atomic PF_FREEZE) */
+ 
+ #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
+ #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
+diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h
+index 13be445..86e6ca9 100644
+--- a/include/asm-sparc64/unistd.h
++++ b/include/asm-sparc64/unistd.h
+@@ -334,8 +334,12 @@
+ #define __NR_fallocate		314
+ #define __NR_timerfd_settime	315
+ #define __NR_timerfd_gettime	316
++#define __NR_getluid		510
++#define __NR_setluid		511
++#define __NR_setublimit		512
++#define __NR_ubstat		513
+ 
+-#define NR_SYSCALLS		317
++#define NR_SYSCALLS		514
+ 
+ #ifdef __KERNEL__
+ #define __ARCH_WANT_IPC_PARSE_VERSION
+diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h
+index 8f232dc..f8a3e81 100644
+--- a/include/asm-x86/elf.h
++++ b/include/asm-x86/elf.h
+@@ -279,7 +279,7 @@ struct task_struct;
+ 
+ #define	ARCH_DLINFO_IA32(vdso_enabled)					\
+ do {									\
+-	if (vdso_enabled) {						\
++	if (vdso_enabled && sysctl_at_vsyscall) {			\
+ 		NEW_AUX_ENT(AT_SYSINFO,	VDSO_ENTRY);			\
+ 		NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);	\
+ 	}								\
+@@ -324,9 +324,11 @@ struct linux_binprm;
+ 
+ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+-				       int executable_stack);
++				       int executable_stack,
++				       unsigned long map_address);
+ 
+-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
++extern int syscall32_setup_pages(struct linux_binprm *, int exstack,
++				 unsigned long map_address);
+ #define compat_arch_setup_additional_pages	syscall32_setup_pages
+ 
+ extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+diff --git a/include/asm-x86/mman.h b/include/asm-x86/mman.h
+index c1682b5..cab588d 100644
+--- a/include/asm-x86/mman.h
++++ b/include/asm-x86/mman.h
+@@ -12,6 +12,7 @@
+ #define MAP_NORESERVE	0x4000		/* don't check for reservations */
+ #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
+ #define MAP_NONBLOCK	0x10000		/* do not block on IO */
++#define MAP_EXECPRIO	0x20000		/* soft ubc charge */
+ 
+ #define MCL_CURRENT	1		/* lock all current mappings */
+ #define MCL_FUTURE	2		/* lock all future mappings */
+diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
+index 1e36302..dcc301d 100644
+--- a/include/asm-x86/nmi.h
++++ b/include/asm-x86/nmi.h
+@@ -54,6 +54,10 @@ extern void release_perfctr_nmi(unsigned int);
+ extern int reserve_evntsel_nmi(unsigned int);
+ extern void release_evntsel_nmi(unsigned int);
+ 
++typedef int (*nmi_callback_t)(struct pt_regs *regs, int cpu);
++void set_nmi_ipi_callback(nmi_callback_t callback);
++void unset_nmi_ipi_callback(void);
++
+ extern void setup_apic_nmi_watchdog(void *);
+ extern void stop_apic_nmi_watchdog(void *);
+ extern void disable_timer_nmi_watchdog(void);
+diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
+index 91e4641..1b9caac 100644
+--- a/include/asm-x86/pgalloc.h
++++ b/include/asm-x86/pgalloc.h
+@@ -64,7 +64,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ #if PAGETABLE_LEVELS > 2
+ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+-	return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
++	return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT);
+ }
+ 
+ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+@@ -94,7 +94,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+ 
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+-	return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
++	return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT);
+ }
+ 
+ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
+index 5591052..5fd8bee 100644
+--- a/include/asm-x86/processor.h
++++ b/include/asm-x86/processor.h
+@@ -878,8 +878,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
+ /* This decides where the kernel will search for a free chunk of vm
+  * space during mmap's.
+  */
+-#define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
+-					0xc0000000 : 0xFFFFe000)
++#define IA32_PAGE_OFFSET 0xc0000000
+ 
+ #define TASK_SIZE		(test_thread_flag(TIF_IA32) ? \
+ 					IA32_PAGE_OFFSET : TASK_SIZE64)
+diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h
+index b633882..2776f8d 100644
+--- a/include/asm-x86/thread_info_32.h
++++ b/include/asm-x86/thread_info_32.h
+@@ -96,10 +96,10 @@ static inline struct thread_info *current_thread_info(void)
+ /* thread information allocation */
+ #ifdef CONFIG_DEBUG_STACK_USAGE
+ #define alloc_thread_info(tsk) ((struct thread_info *)			\
+-	__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(THREAD_SIZE)))
++	__get_free_pages(GFP_KERNEL_UBC | __GFP_ZERO, get_order(THREAD_SIZE)))
+ #else
+ #define alloc_thread_info(tsk) ((struct thread_info *)			\
+-	__get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE)))
++	__get_free_pages(GFP_KERNEL_UBC, get_order(THREAD_SIZE)))
+ #endif
+ 
+ #else /* !__ASSEMBLY__ */
+diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h
+index cb69f70..a787bec 100644
+--- a/include/asm-x86/thread_info_64.h
++++ b/include/asm-x86/thread_info_64.h
+@@ -83,7 +83,8 @@ static inline struct thread_info *stack_thread_info(void)
+ #endif
+ 
+ #define alloc_thread_info(tsk)						\
+-	((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
++	((struct thread_info *)__get_free_pages(THREAD_FLAGS | __GFP_UBC,\
++			THREAD_ORDER))
+ 
+ #else /* !__ASSEMBLY__ */
+ 
+@@ -124,6 +125,7 @@ static inline struct thread_info *stack_thread_info(void)
+ #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
+ #define TIF_BTS_TRACE_TS	27      /* record scheduling event timestamps */
+ #define TIF_NOTSC		28	/* TSC is not accessible in userland */
++#define TIF_RESUME		29
+ 
+ #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+ #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
+@@ -145,6 +147,7 @@ static inline struct thread_info *stack_thread_info(void)
+ #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
+ #define _TIF_BTS_TRACE_TS	(1 << TIF_BTS_TRACE_TS)
+ #define _TIF_NOTSC		(1 << TIF_NOTSC)
++#define _TIF_RESUME		(1 << TIF_RESUME)
+ 
+ /* work to do on interrupt/exception return */
+ #define _TIF_WORK_MASK							\
+diff --git a/include/asm-x86/tsc.h b/include/asm-x86/tsc.h
+index 548873a..6f892de 100644
+--- a/include/asm-x86/tsc.h
++++ b/include/asm-x86/tsc.h
+@@ -24,7 +24,7 @@ static inline cycles_t get_cycles(void)
+ 	unsigned long long ret = 0;
+ 
+ #ifndef CONFIG_X86_TSC
+-	if (!cpu_has_tsc)
++	if (WARN_ON_ONCE(!cpu_has_tsc))
+ 		return 0;
+ #endif
+ 	rdtscll(ret);
+diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
+index 8317d94..a986223 100644
+--- a/include/asm-x86/unistd_32.h
++++ b/include/asm-x86/unistd_32.h
+@@ -332,6 +332,16 @@
+ #define __NR_fallocate		324
+ #define __NR_timerfd_settime	325
+ #define __NR_timerfd_gettime	326
++#define __NR_fairsched_mknod	500     /* FairScheduler syscalls */
++#define __NR_fairsched_rmnod	501
++#define __NR_fairsched_chwt	502
++#define __NR_fairsched_mvpr	503
++#define __NR_fairsched_rate	504
++#define __NR_fairsched_vcpus	505
++#define __NR_getluid		510
++#define __NR_setluid		511
++#define __NR_setublimit		512
++#define __NR_ubstat		513
+ 
+ #ifdef __KERNEL__
+ 
+diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h
+index fe26e36..514af9d 100644
+--- a/include/asm-x86/unistd_64.h
++++ b/include/asm-x86/unistd_64.h
+@@ -639,6 +639,26 @@ __SYSCALL(__NR_fallocate, sys_fallocate)
+ __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
+ #define __NR_timerfd_gettime			287
+ __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
++#define __NR_fairsched_vcpus			499
++__SYSCALL(__NR_fairsched_vcpus, sys_fairsched_vcpus)
++#define __NR_getluid				500
++__SYSCALL(__NR_getluid, sys_getluid)
++#define __NR_setluid				501
++__SYSCALL(__NR_setluid, sys_setluid)
++#define __NR_setublimit				502
++__SYSCALL(__NR_setublimit, sys_setublimit)
++#define __NR_ubstat				503
++__SYSCALL(__NR_ubstat, sys_ubstat)
++#define __NR_fairsched_mknod			504 /* FairScheduler syscalls */
++__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod)
++#define __NR_fairsched_rmnod			505
++__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod)
++#define __NR_fairsched_chwt			506
++__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt)
++#define __NR_fairsched_mvpr			507
++__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr)
++#define __NR_fairsched_rate			508
++__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate)
+ 
+ 
+ #ifndef __NO_STUBS
+@@ -664,6 +684,7 @@ __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
+ #define __ARCH_WANT_SYS_RT_SIGSUSPEND
+ #define __ARCH_WANT_SYS_TIME
+ #define __ARCH_WANT_COMPAT_SYS_TIME
++#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+ #endif	/* __NO_STUBS */
+ 
+ #ifdef __KERNEL__
+diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h
+index 86e085e..5fcda30 100644
+--- a/include/asm-x86/vdso.h
++++ b/include/asm-x86/vdso.h
+@@ -18,6 +18,7 @@ extern const char VDSO64_PRELINK[];
+ #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+ extern const char VDSO32_PRELINK[];
+ 
++extern const char VDSO32_SYSENTER_RETURN[];
+ /*
+  * Given a pointer to the vDSO image, find the pointer to VDSO32_name
+  * as that symbol is defined in the vDSO sources or linker script.
+diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h
+new file mode 100644
+index 0000000..7327bcb
+--- /dev/null
++++ b/include/bc/beancounter.h
+@@ -0,0 +1,451 @@
++/*
++ *  include/bc/beancounter.h
++ *
++ *  Copyright (C) 1999-2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ *  Andrey Savochkin	saw at sw-soft.com
++ *
++ */
++
++#ifndef _LINUX_BEANCOUNTER_H
++#define _LINUX_BEANCOUNTER_H
++
++/*
++ * Generic ratelimiting stuff.
++ */
++
++struct ub_rate_info {
++	int burst;
++	int interval; /* jiffy_t per event */
++	int bucket; /* kind of leaky bucket */
++	unsigned long last; /* last event */
++};
++
++/* Return true if rate limit permits. */
++int ub_ratelimit(struct ub_rate_info *);
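++
++/*
++ * Typical use, mirroring ub_debug_trace() in bc/debug.h (initializer
++ * order follows the struct above: burst, then interval):
++ *
++ *	static struct ub_rate_info ri = { 4, 5 * HZ };
++ *
++ *	if (ub_ratelimit(&ri))
++ *		printk(KERN_WARNING "...\n");
++ */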
++
++
++/*
++ * This magic is used to distinguish a user beancounter from a pages
++ * beancounter in struct page. page_ub and page_bc are placed in a union,
++ * and the MAGIC ensures that we don't use a pbc as a ubc in
++ * ub_page_uncharge().
++ */
++#define UB_MAGIC		0x62756275
++
++/*
++ *	Resource list.
++ */
++
++#define UB_KMEMSIZE	0	/* Unswappable kernel memory size including
++				 * struct task, page directories, etc.
++				 */
++#define UB_LOCKEDPAGES	1	/* Mlock()ed pages. */
++#define UB_PRIVVMPAGES	2	/* Total number of pages, counting potentially
++				 * private pages as private and used.
++				 */
++#define UB_SHMPAGES	3	/* IPC SHM segment size. */
++#define UB_DUMMY	4	/* Dummy resource (compatibility) */
++#define UB_NUMPROC	5	/* Number of processes. */
++#define UB_PHYSPAGES	6	/* All resident pages, for swapout guarantee. */
++#define UB_VMGUARPAGES	7	/* Guarantee for memory allocation,
++				 * checked against PRIVVMPAGES.
++				 */
++#define UB_OOMGUARPAGES	8	/* Guarantees against OOM kill.
++				 * Only limit is used, no accounting.
++				 */
++#define UB_NUMTCPSOCK	9	/* Number of TCP sockets. */
++#define UB_NUMFLOCK	10	/* Number of file locks. */
++#define UB_NUMPTY	11	/* Number of PTYs. */
++#define UB_NUMSIGINFO	12	/* Number of siginfos. */
++#define UB_TCPSNDBUF	13	/* Total size of tcp send buffers. */
++#define UB_TCPRCVBUF	14	/* Total size of tcp receive buffers. */
++#define UB_OTHERSOCKBUF	15	/* Total size of other socket
++				 * send buffers (all buffers for PF_UNIX).
++				 */
++#define UB_DGRAMRCVBUF	16	/* Total size of other socket
++				 * receive buffers.
++				 */
++#define UB_NUMOTHERSOCK	17	/* Number of other sockets. */
++#define UB_DCACHESIZE	18	/* Size of busy dentry/inode cache. */
++#define UB_NUMFILE	19	/* Number of open files. */
++
++#define UB_RESOURCES_COMPAT	24
++
++/* Add new resources here */
++
++#define UB_NUMXTENT	23
++#define UB_RESOURCES	24
++
++#define UB_UNUSEDPRIVVM	(UB_RESOURCES + 0)
++#define UB_TMPFSPAGES	(UB_RESOURCES + 1)
++#define UB_SWAPPAGES	(UB_RESOURCES + 2)
++#define UB_HELDPAGES	(UB_RESOURCES + 3)
++
++struct ubparm {
++	/* 
++	 * A barrier over which resource allocations are failed gracefully.
++	 * If the amount of consumed memory is over the barrier, further sbrk()
++	 * or mmap() calls fail; the existing processes are not killed.
++	 */
++	unsigned long	barrier;
++	/* hard resource limit */
++	unsigned long	limit;
++	/* consumed resources */
++	unsigned long	held;
++	/* maximum amount of consumed resources through the last period */
++	unsigned long	maxheld;
++	/* minimum amount of consumed resources through the last period */
++	unsigned long	minheld;
++	/* count of failed charges */
++	unsigned long	failcnt;
++};
++
++/*
++ * Kernel internal part.
++ */
++
++#ifdef __KERNEL__
++
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <linux/cache.h>
++#include <linux/threads.h>
++#include <linux/percpu.h>
++#include <bc/debug.h>
++#include <bc/decl.h>
++#include <asm/atomic.h>
++#include <bc/io_prio.h>
++
++/*
++ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
++ */
++#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
++
++
++/*
++ *	Resource management structures
++ * Serialization issues:
++ *   beancounter list management is protected via ub_hash_lock
++ *   task pointers are set only for current task and only once
++ *   refcount is managed atomically
++ *   value and limit comparison and change are protected by per-ub spinlock
++ */
++
++struct page_beancounter;
++struct task_beancounter;
++struct sock_beancounter;
++
++struct page_private {
++	unsigned long		ubp_unused_privvmpages;
++	unsigned long		ubp_tmpfs_respages;
++	unsigned long		ubp_swap_pages;
++	unsigned long long	ubp_held_pages;
++};
++
++struct sock_private {
++	unsigned long		ubp_rmem_thres;
++	unsigned long		ubp_wmem_pressure;
++	unsigned long		ubp_maxadvmss;
++	unsigned long		ubp_rmem_pressure;
++	int			ubp_tw_count;
++#define UB_RMEM_EXPAND          0
++#define UB_RMEM_KEEP            1
++#define UB_RMEM_SHRINK          2
++	struct list_head	ubp_other_socks;
++	struct list_head	ubp_tcp_socks;
++	atomic_t		ubp_orphan_count;
++};
++
++struct ub_percpu_struct {
++	unsigned long unmap;
++	unsigned long swapin;
++#ifdef CONFIG_BC_IO_ACCOUNTING
++	unsigned long long bytes_wrote;
++	unsigned long long bytes_read;
++	unsigned long long bytes_cancelled;
++#endif
++#ifdef CONFIG_BC_DEBUG_KMEM
++	long	pages_charged;
++	long	vmalloc_charged;
++	long	pbcs;
++#endif
++	unsigned long	sync;
++	unsigned long	sync_done;
++
++	unsigned long	fsync;
++	unsigned long	fsync_done;
++
++	unsigned long	fdsync;
++	unsigned long	fdsync_done;
++
++	unsigned long	frsync;
++	unsigned long	frsync_done;
++
++	unsigned long		write;
++	unsigned long		read;
++	unsigned long long	wchar;
++	unsigned long long	rchar;
++};
++
++struct user_beancounter
++{
++	unsigned long		ub_magic;
++	atomic_t		ub_refcount;
++	struct list_head	ub_list;
++	struct hlist_node	ub_hash;
++
++	union {
++		struct rcu_head rcu;
++		struct execute_work cleanup;
++	};
++
++	spinlock_t		ub_lock;
++	uid_t			ub_uid;
++
++	struct ub_rate_info	ub_limit_rl;
++	int			ub_oom_noproc;
++
++	struct page_private	ppriv;
++#define ub_unused_privvmpages	ppriv.ubp_unused_privvmpages
++#define ub_tmpfs_respages	ppriv.ubp_tmpfs_respages
++#define ub_swap_pages		ppriv.ubp_swap_pages
++#define ub_held_pages		ppriv.ubp_held_pages
++	struct sock_private	spriv;
++#define ub_rmem_thres		spriv.ubp_rmem_thres
++#define ub_maxadvmss		spriv.ubp_maxadvmss
++#define ub_rmem_pressure	spriv.ubp_rmem_pressure
++#define ub_wmem_pressure	spriv.ubp_wmem_pressure
++#define ub_tcp_sk_list		spriv.ubp_tcp_socks
++#define ub_other_sk_list	spriv.ubp_other_socks
++#define ub_orphan_count		spriv.ubp_orphan_count
++#define ub_tw_count		spriv.ubp_tw_count
++	struct ub_iopriv	iopriv;
++
++	struct user_beancounter *parent;
++	void			*private_data;
++	unsigned long		ub_aflags;
++
++#ifdef CONFIG_PROC_FS
++	struct proc_dir_entry	*proc;
++#endif
++
++	/* resources statistic and settings */
++	struct ubparm		ub_parms[UB_RESOURCES];
++	/* resources statistic for last interval */
++	struct ubparm		ub_store[UB_RESOURCES];
++
++	struct ub_percpu_struct	*ub_percpu;
++#ifdef CONFIG_BC_IO_ACCOUNTING
++	/* these are protected with pb_lock */
++	unsigned long long	bytes_wrote;
++	unsigned long long	bytes_dirtied;
++	unsigned long long	bytes_dirty_missed;
++	unsigned long		io_pb_held;
++#endif
++#ifdef CONFIG_BC_DEBUG_KMEM
++	struct list_head	ub_cclist;
++#endif
++};
++
++enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };
++
++#define UB_AFLAG_NOTIF_PAGEIN	0
++
++static inline
++struct user_beancounter *top_beancounter(struct user_beancounter *ub)
++{
++	while (ub->parent != NULL)
++		ub = ub->parent;
++	return ub;
++}
++
++static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
++{
++	return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
++}
++
++static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
++{
++	return (ub->ub_parms[resource].held > 
++		((ub->ub_parms[resource].barrier) >> 1));
++}
++
++static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource)
++{
++	struct ubparm *p;
++	p = ub->ub_parms + resource;
++	return p->held <= (p->barrier >> 3);
++}
++
++static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource)
++{
++	struct ubparm *p;
++	p = ub->ub_parms + resource;
++	return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024;
++}
++
++#ifndef CONFIG_BEANCOUNTERS
++
++#define ub_percpu_add(ub, f, v)	do { } while (0)
++#define ub_percpu_sub(ub, f, v)	do { } while (0)
++#define ub_percpu_inc(ub, f)	do { } while (0)
++#define ub_percpu_dec(ub, f)	do { } while (0)
++
++#define mm_ub(mm)	(NULL)
++
++extern inline struct user_beancounter *get_beancounter_byuid
++		(uid_t uid, int create) { return NULL; }
++extern inline struct user_beancounter *get_beancounter
++		(struct user_beancounter *ub) { return NULL; }
++extern inline void put_beancounter(struct user_beancounter *ub) { }
++
++static inline void ub_init_late(void) { };
++static inline void ub_init_early(void) { };
++
++static inline int charge_beancounter(struct user_beancounter *ub,
++			int resource, unsigned long val,
++			enum ub_severity strict) { return 0; }
++static inline void uncharge_beancounter(struct user_beancounter *ub,
++			int resource, unsigned long val) { }
++
++#else /* CONFIG_BEANCOUNTERS */
++
++#define ub_percpu_add(ub, field, v)		do {			\
++		per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v);	\
++		put_cpu();						\
++	} while (0)
++#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1)
++
++#define ub_percpu_sub(ub, field, v)		do {			\
++		per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v);	\
++		put_cpu();						\
++	} while (0)
++#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1)
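++
++/*
++ * Sketch: the helpers above update per-cpu statistics without taking
++ * ub_lock, e.g.
++ *
++ *	ub_percpu_inc(ub, swapin);
++ *	ub_percpu_add(ub, unmap, nr_pages);
++ */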
++
++#define mm_ub(mm)	((mm)->mm_ub)
++/*
++ *  Charge/uncharge operations
++ */
++
++extern int __charge_beancounter_locked(struct user_beancounter *ub,
++		int resource, unsigned long val, enum ub_severity strict);
++
++extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
++		int resource, unsigned long val);
++
++extern void put_beancounter_safe(struct user_beancounter *ub);
++extern void __put_beancounter(struct user_beancounter *ub);
++
++extern void uncharge_warn(struct user_beancounter *ub, int resource,
++		unsigned long val, unsigned long held);
++
++extern const char *ub_rnames[];
++/*
++ *	Put a beancounter reference
++ */
++
++static inline void put_beancounter(struct user_beancounter *ub)
++{
++	if (unlikely(ub == NULL))
++		return;
++
++	/* FIXME - optimize not to disable interrupts and make call */
++	__put_beancounter(ub);
++}
++
++/* fast put, refcount can't reach zero */
++static inline void __put_beancounter_batch(struct user_beancounter *ub, int n)
++{
++	atomic_sub(n, &ub->ub_refcount);
++}
++
++static inline void put_beancounter_batch(struct user_beancounter *ub, int n)
++{
++	if (n > 1)
++		__put_beancounter_batch(ub, n - 1);
++	__put_beancounter(ub);
++}
++
++/*
++ *	Create a new beancounter reference
++ */
++extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
++
++static inline 
++struct user_beancounter *get_beancounter(struct user_beancounter *ub)
++{
++	if (unlikely(ub == NULL))
++		return NULL;
++
++	atomic_inc(&ub->ub_refcount);
++	return ub;
++}
++
++static inline 
++struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub)
++{
++	return atomic_inc_not_zero(&ub->ub_refcount) ? ub : NULL;
++}
++
++static inline void get_beancounter_batch(struct user_beancounter *ub, int n)
++{
++	atomic_add(n, &ub->ub_refcount);
++}
++
++extern struct user_beancounter *get_subbeancounter_byid(
++		struct user_beancounter *,
++		int id, int create);
++
++extern void ub_init_late(void);
++extern void ub_init_early(void);
++
++extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size);
++
++/*
++ *	Resource charging
++ * Change the user's account and compare it against the limits.
++ */
++
++static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
++{
++	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
++		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
++	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
++		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
++}
++
++int charge_beancounter(struct user_beancounter *ub, int resource,
++		unsigned long val, enum ub_severity strict);
++void uncharge_beancounter(struct user_beancounter *ub, int resource,
++		unsigned long val);
++void __charge_beancounter_notop(struct user_beancounter *ub, int resource,
++		unsigned long val);
++void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource,
++		unsigned long val);
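++
++/*
++ * A minimal charge/uncharge sketch (resource and severity chosen for
++ * illustration; a zero return means the charge succeeded):
++ *
++ *	if (charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD))
++ *		return -ENOMEM;
++ *	...
++ *	uncharge_beancounter(ub, UB_NUMFILE, 1);
++ */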
++
++static inline void charge_beancounter_notop(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	if (ub->parent != NULL)
++		__charge_beancounter_notop(ub, resource, val);
++}
++
++static inline void uncharge_beancounter_notop(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	if (ub->parent != NULL)
++		__uncharge_beancounter_notop(ub, resource, val);
++}
++
++#endif /* CONFIG_BEANCOUNTERS */
++
++#ifndef CONFIG_BC_RSS_ACCOUNTING
++static inline void ub_init_pbc(void) { }
++#else
++extern void ub_init_pbc(void);
++#endif
++#endif /* __KERNEL__ */
++#endif /* _LINUX_BEANCOUNTER_H */
+diff --git a/include/bc/dcache.h b/include/bc/dcache.h
+new file mode 100644
+index 0000000..5ebefff
+--- /dev/null
++++ b/include/bc/dcache.h
+@@ -0,0 +1,47 @@
++/*
++ *  include/bc/dcache.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_DCACHE_H_
++#define __BC_DCACHE_H_
++
++#include <bc/decl.h>
++
++/*
++ * UB_DCACHESIZE accounting
++ */
++
++struct dentry_beancounter
++{
++	/*
++	 *  d_inuse =
++	 *         <number of external refs> +
++	 *         <number of 'used' children>
++	 *
++	 * d_inuse == -1 means that the dentry is unused;
++	 * a state change -1 => 0 causes a charge,
++	 * a state change 0 => -1 causes an uncharge.
++	 */
++	atomic_t d_inuse;
++	/* charged size, including name length if name is not inline */
++	unsigned long d_ubsize;
++	struct user_beancounter *d_ub;
++};
++
++#ifdef CONFIG_BEANCOUNTERS
++#define ub_dget_testone(d)  (atomic_inc_and_test(&(d)->dentry_bc.d_inuse))
++#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse))
++#define INUSE_INIT		0
++
++extern int ub_dentry_on;
++#else
++#define ub_dget_testone(d)	(0)
++#define ub_dput_testzero(d)	(0)
++#endif
++#endif
+diff --git a/include/bc/dcache_op.h b/include/bc/dcache_op.h
+new file mode 100644
+index 0000000..23306e9
+--- /dev/null
++++ b/include/bc/dcache_op.h
+@@ -0,0 +1,102 @@
++/*
++ *  include/bc/dcache_op.h
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_DCACHE_OP_H_
++#define __BC_DCACHE_OP_H_
++
++struct dentry;
++
++#ifdef CONFIG_BEANCOUNTERS
++
++#include <linux/spinlock.h>
++#include <bc/dcache.h>
++#include <bc/task.h>
++
++extern int ub_dentry_alloc_barrier;
++extern spinlock_t dcache_lock;
++
++static inline int ub_dentry_alloc(struct dentry *d)
++{
++	extern int __ub_dentry_alloc(struct dentry *);
++
++	if (!ub_dentry_on)
++		return 0;
++	return __ub_dentry_alloc(d);
++}
++
++static inline void ub_dentry_alloc_start(void)
++{
++	extern void __ub_dentry_alloc_start(void);
++
++	if (ub_dentry_alloc_barrier)
++		__ub_dentry_alloc_start();
++}
++
++static inline void ub_dentry_alloc_end(void)
++{
++	extern void __ub_dentry_alloc_end(void);
++
++	if (current->task_bc.dentry_alloc)
++		__ub_dentry_alloc_end();
++}
++
++static inline int ub_dentry_charge(struct dentry *d)
++{
++	extern int __ub_dentry_charge(struct dentry *);
++
++	if (!ub_dentry_on)
++		return 0;
++	return __ub_dentry_charge(d);
++}
++
++static inline void ub_dentry_charge_nofail(struct dentry *d)
++{
++	extern void __ub_dentry_charge_nofail(struct dentry *);
++
++	if (!ub_dentry_on)
++		return;
++	__ub_dentry_charge_nofail(d);
++}
++
++static inline void ub_dentry_uncharge_locked(struct dentry *d)
++{
++	extern void __ub_dentry_uncharge(struct dentry *);
++
++	if (!ub_dentry_on)
++		return;
++	__ub_dentry_uncharge(d);
++}
++
++static inline void ub_dentry_uncharge(struct dentry *d)
++{
++	extern void __ub_dentry_uncharge(struct dentry *);
++
++	if (!ub_dentry_on)
++		return;
++	spin_lock(&dcache_lock);
++	__ub_dentry_uncharge(d);
++	spin_unlock(&dcache_lock);
++}
++
++void uncharge_dcache(struct user_beancounter *ub, unsigned long size);
++#else /* CONFIG_BEANCOUNTERS */
++
++static inline int ub_dentry_alloc(struct dentry *d) { return 0; }
++static inline void ub_dentry_alloc_start(void) { }
++static inline void ub_dentry_alloc_end(void) { }
++static inline int ub_dentry_charge(struct dentry *d) { return 0; }
++static inline void ub_dentry_charge_nofail(struct dentry *d) { }
++static inline void ub_dentry_uncharge_locked(struct dentry *d) { }
++static inline void ub_dentry_uncharge(struct dentry *d) { }
++static inline void uncharge_dcache(struct user_beancounter *ub, unsigned long size) { }
++
++#endif /* CONFIG_BEANCOUNTERS */
++
++#endif /* __dcache_op.h_ */
+diff --git a/include/bc/debug.h b/include/bc/debug.h
+new file mode 100644
+index 0000000..7b1feb6
+--- /dev/null
++++ b/include/bc/debug.h
+@@ -0,0 +1,109 @@
++/*
++ *  include/bc/debug.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_DEBUG_H_
++#define __BC_DEBUG_H_
++
++/*
++ * general debugging
++ */
++
++#define UBD_ALLOC	0x1
++#define UBD_CHARGE	0x2
++#define UBD_LIMIT	0x4
++#define UBD_TRACE	0x8
++
++/*
++ * ub_net debugging
++ */
++
++#define UBD_NET_SOCKET	0x10
++#define UBD_NET_SLEEP	0x20
++#define UBD_NET_SEND	0x40
++#define UBD_NET_RECV	0x80
++
++/*
++ * Main routines
++ */
++
++#define UB_DEBUG (0)
++#define DEBUG_RESOURCE (0ULL)
++
++#define ub_dbg_cond(__cond, __str, args...)				\
++	do { 								\
++		if ((__cond) != 0)					\
++			printk(__str, ##args);				\
++	} while(0)
++
++#define ub_debug(__section, __str, args...) 				\
++	ub_dbg_cond(UB_DEBUG & (__section), __str, ##args)
++
++#define ub_debug_resource(__resource, __str, args...)			\
++	ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && 				\
++			(DEBUG_RESOURCE & (1 << (__resource))), 	\
++			__str, ##args)
++
++#if UB_DEBUG & UBD_TRACE
++#define ub_debug_trace(__cond, __b, __r)				\
++		do {							\
++			static struct ub_rate_info ri =	{ __b, __r };	\
++			if ((__cond) != 0 && ub_ratelimit(&ri))		\
++				dump_stack(); 				\
++		} while(0)
++#else
++#define ub_debug_trace(__cond, __burst, __rate)
++#endif
++
++#ifdef CONFIG_BC_DEBUG_KMEM
++#include <linux/list.h>
++
++struct user_beancounter;
++struct ub_cache_counter {
++	struct list_head ulist;
++	struct ub_cache_counter *next;
++	struct user_beancounter *ub;
++	struct kmem_cache *cachep;
++	unsigned long counter;
++};
++
++extern spinlock_t cc_lock;
++extern void init_cache_counters(void);
++extern void ub_free_counters(struct user_beancounter *);
++extern void ub_kmemcache_free(struct kmem_cache *cachep);
++
++struct vm_struct;
++#define inc_vmalloc_charged(vm, flags)	do {				\
++		if (flags & __GFP_UBC)					\
++			ub_percpu_add(get_exec_ub(), vmalloc_charged,	\
++					vm->nr_pages);			\
++	} while (0)
++#define dec_vmalloc_charged(vm)		do {				\
++		struct user_beancounter *ub;				\
++		ub = page_ub(vm->pages[0]);				\
++		if (ub != NULL)						\
++			ub_percpu_sub(ub, vmalloc_charged,		\
++					vm->nr_pages);			\
++	} while (0)
++
++#define inc_pbc_count(ub)	ub_percpu_inc(ub, pbcs)
++#define dec_pbc_count(ub)	ub_percpu_dec(ub, pbcs)
++#else
++#define init_cache_counters()		do { } while (0)
++#define inc_vmalloc_charged(vm, f)	do { } while (0)
++#define dec_vmalloc_charged(vm)		do { } while (0)
++
++#define inc_pbc_count(ub)		do { } while (0)
++#define dec_pbc_count(ub)		do { } while (0)
++
++#define ub_free_counters(ub)		do { } while (0)
++#define ub_kmemcache_free(cachep)	do { } while (0)
++#endif
++
++#endif
+diff --git a/include/bc/decl.h b/include/bc/decl.h
+new file mode 100644
+index 0000000..6dd4cb9
+--- /dev/null
++++ b/include/bc/decl.h
+@@ -0,0 +1,41 @@
++/*
++ *  include/bc/decl.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_DECL_H_
++#define __BC_DECL_H_
++
++#ifdef __KERNEL__
++
++/*
++ * Naming convention:
++ * ub_<section|object>_<operation>
++ */
++
++#ifdef CONFIG_BEANCOUNTERS
++
++#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
++#define UB_DECLARE_VOID_FUNC(decl)	extern void decl;
++
++#else /* CONFIG_BEANCOUNTERS */
++
++#define UB_DECLARE_FUNC(ret_type, decl)		\
++	static inline ret_type decl		\
++	{					\
++		return (ret_type)0;		\
++	}
++#define UB_DECLARE_VOID_FUNC(decl)		\
++	static inline void decl			\
++	{					\
++	}
++
++#endif /* CONFIG_BEANCOUNTERS */
++#endif
++
++#endif
+diff --git a/include/bc/hash.h b/include/bc/hash.h
+new file mode 100644
+index 0000000..b2afb69
+--- /dev/null
++++ b/include/bc/hash.h
+@@ -0,0 +1,36 @@
++/*
++ *  include/bc/hash.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_UBHASH_H
++#define _LINUX_UBHASH_H
++
++#ifdef __KERNEL__
++
++#define UB_HASH_SIZE 256
++
++extern struct hlist_head ub_hash[];
++extern spinlock_t ub_hash_lock;
++extern struct list_head ub_list_head;
++
++#ifdef CONFIG_BEANCOUNTERS
++
++/*
++ * Iterate over beancounters
++ * @__ubp - beancounter ptr
++ * Can use break :)
++ */
++#define for_each_beancounter(__ubp)				\
++	list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list)
++
++#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash)
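++
++/*
++ * Iteration sketch; list_for_each_entry_rcu() implies the walk should
++ * run under rcu_read_lock() (an assumption, not spelled out above):
++ *
++ *	struct user_beancounter *ub;
++ *
++ *	rcu_read_lock();
++ *	for_each_beancounter(ub)
++ *		... inspect ub ...
++ *	rcu_read_unlock();
++ */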
++
++#endif /* CONFIG_BEANCOUNTERS */
++#endif /* __KERNEL__ */
++#endif /* _LINUX_UBHASH_H */
+diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h
+new file mode 100644
+index 0000000..d84bf5a
+--- /dev/null
++++ b/include/bc/io_acct.h
+@@ -0,0 +1,113 @@
++/*
++ *  include/bc/io_acct.h
++ *
++ *  Copyright (C) 2006 SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ *  Pavel Emelianov <xemul at openvz.org>
++ *
++ */
++
++#ifndef __UB_IO_ACCT_H_
++#define __UB_IO_ACCT_H_
++
++#ifdef CONFIG_BC_IO_ACCOUNTING
++#include <bc/beancounter.h>
++#include <bc/rss_pages.h>
++
++#define page_iopb(page)	({			\
++		struct page_beancounter *pb;	\
++		pb = page_pbc(page);		\
++		rmb();				\
++		pb;				\
++	})
++
++/*
++ * IO ub is required in task context only, so if exec_ub is set
++ * to NULL this means that the user doesn't need to charge any
++ * resources. Nevertheless, IO activity must be accounted, so we
++ * account it to the current task's beancounter.
++ */
++
++static inline struct user_beancounter *get_io_ub(void)
++{
++	struct user_beancounter *ub;
++
++	ub = get_exec_ub();
++	if (unlikely(ub == NULL))
++		ub = get_task_ub(current);
++
++	return top_beancounter(ub);
++}
++
++extern struct page_beancounter **page_pblist(struct page *);
++
++extern void ub_io_save_context(struct page *, size_t);
++extern void ub_io_release_context(struct page *pg, size_t size);
++
++#define PAGE_IO_MARK	(0x1UL)
++
++static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb)
++{
++	if (!((unsigned long)pb & PAGE_IO_MARK))
++		return NULL;
++
++	return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK);
++}
++
++static inline void ub_io_account_read(size_t bytes)
++{
++	ub_percpu_add(get_io_ub(), bytes_read, bytes);
++}
++
++static inline void ub_io_account_write(size_t bytes)
++{
++	ub_percpu_add(get_io_ub(), bytes_wrote, bytes);
++}
++
++static inline void ub_io_account_dirty(struct page *page, size_t bytes)
++{
++	ub_io_save_context(page, bytes);
++}
++
++static inline void ub_io_account_write_cancelled(size_t bytes)
++{
++	ub_percpu_add(get_io_ub(), bytes_cancelled, bytes);
++}
++
++void ub_init_io(struct kmem_cache *);
++#else /* BC_IO_ACCOUNTING */
++#define page_iopb(page)		(NULL)
++#define page_pblist(page)	(&page_pbc(page))
++
++static inline void ub_io_release_context(struct page *pg, size_t bytes)
++{
++}
++
++static inline void ub_io_account_dirty(struct page *p, size_t bytes)
++{
++}
++
++static inline void ub_io_account_read(size_t bytes)
++{
++}
++
++static inline void ub_io_account_write(size_t bytes)
++{
++}
++
++static inline void ub_io_account_write_cancelled(size_t bytes)
++{
++}
++
++static inline void ub_init_io(struct kmem_cache *pb_cachep) { };
++#endif
++
++#ifdef CONFIG_BC_DEBUG_IO
++extern void ub_io_release_debug(struct page *pg);
++#else
++#define ub_io_release_debug(pg)	do { } while (0)
++#endif
++#endif
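PAGE_IO_MARK relies on pointer alignment: bit 0 of a valid page_beancounter pointer is always zero, so it can double as an "under IO" flag, and iopb_to_pb() both validates and strips it. A userspace sketch of the same low-bit tagging trick (illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    #define IO_MARK 0x1UL

    struct pb { int dummy; };

    /* Set the flag: alignment guarantees bit 0 is free. */
    static void *mark(struct pb *p)
    {
            return (void *)((uintptr_t)p | IO_MARK);
    }

    /* Validate and strip the flag, as iopb_to_pb() does. */
    static struct pb *unmark(void *tagged)
    {
            if (!((uintptr_t)tagged & IO_MARK))
                    return NULL;    /* not marked: reject */
            return (struct pb *)((uintptr_t)tagged & ~IO_MARK);
    }

    int main(void)
    {
            struct pb x = { 7 };
            void *t = mark(&x);

            printf("original %p recovered %p\n", (void *)&x, (void *)unmark(t));
            return 0;
    }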
+diff --git a/include/bc/io_prio.h b/include/bc/io_prio.h
+new file mode 100644
+index 0000000..8c1d1e3
+--- /dev/null
++++ b/include/bc/io_prio.h
+@@ -0,0 +1,82 @@
++/*
++ *  include/bc/io_prio.h
++ *
++ *  Copyright (C) 2007 SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ *  Vasily Tarasov <vtaras at openvz.org>
++ *
++ */
++
++#ifndef _UB_IO_PRIO_H
++#define _UB_IO_PRIO_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/cfq-iosched.h>
++
++#define UB_IOPRIO_MIN 0
++#define UB_IOPRIO_MAX IOPRIO_BE_NR
++#define UB_IOPRIO_BASE 4
++
++struct ub_iopriv {
++	struct list_head	cfq_bc_head;
++	rwlock_t		cfq_bc_list_lock;
++
++	unsigned int		ioprio;
++};
++
++struct cfq_data;
++struct cfq_queue;
++
++#ifdef CONFIG_BC_IO_SCHED
++extern void bc_init_ioprio(struct ub_iopriv *);
++extern void bc_fini_ioprio(struct ub_iopriv *);
++extern struct cfq_bc_data * bc_find_cfq_bc(struct ub_iopriv *,
++					struct cfq_data *);
++extern struct cfq_bc_data * bc_findcreate_cfq_bc(struct ub_iopriv *,
++					struct cfq_data *, gfp_t gfp_mask);
++extern void bc_cfq_exit_queue(struct cfq_data *);
++extern int bc_expired(struct cfq_data *);
++extern void bc_schedule_active(struct cfq_data *);
++extern void  bc_inc_rqnum(struct cfq_queue *);
++extern void bc_dec_rqnum(struct cfq_queue *);
++extern unsigned long bc_set_ioprio(int, int);
++extern struct cfq_bc_data *
++__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd);
++extern struct user_beancounter *bc_io_switch_context(struct page *);
++extern void bc_io_restore_context(struct user_beancounter *);
++#else
++#include <linux/cfq-iosched.h>
++static inline void bc_init_ioprio(struct ub_iopriv *iopriv) { ; }
++static inline void bc_fini_ioprio(struct ub_iopriv *iopriv) { ; }
++static inline struct cfq_bc_data *
++bc_findcreate_cfq_bc(struct ub_iopriv *iopriv,
++			struct cfq_data *cfqd, gfp_t mask)
++{
++	return &cfqd->cfq_bc;
++}
++static inline void bc_cfq_exit_queue(struct cfq_data *cfqd) { ; }
++static inline int bc_expired(struct cfq_data *cfqd) { return 0; }
++static inline void bc_schedule_active(struct cfq_data *cfqd)
++{
++	cfqd->active_cfq_bc = &cfqd->cfq_bc;
++}
++static inline void bc_inc_rqnum(struct cfq_queue *cfqq) { ; }
++static inline void bc_dec_rqnum(struct cfq_queue *cfqq) { ; }
++static inline unsigned long bc_set_ioprio(int ubid, int ioprio)
++{
++	return -EINVAL;
++}
++static inline struct cfq_bc_data *
++__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd)
++{
++	return &cfqd->cfq_bc;
++}
++static inline struct user_beancounter *
++bc_io_switch_context(struct page *page) { return NULL; }
++static inline void bc_io_restore_context(struct user_beancounter *ub) { ; }
++#endif /* CONFIG_BC_IO_SCHED */
++#endif /* _UB_IO_PRIO_H */
+diff --git a/include/bc/kmem.h b/include/bc/kmem.h
+new file mode 100644
+index 0000000..c0ea26a
+--- /dev/null
++++ b/include/bc/kmem.h
+@@ -0,0 +1,69 @@
++/*
++ *  include/bc/kmem.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_SLAB_H_
++#define __UB_SLAB_H_
++
++#include <bc/beancounter.h>
++#include <bc/decl.h>
++
++/*
++ * UB_KMEMSIZE accounting
++ */
++
++#ifdef CONFIG_BC_DEBUG_ITEMS
++#define CHARGE_ORDER(__o)		(1 << (__o))
++#define CHARGE_SIZE(__s)		1
++#else
++#define CHARGE_ORDER(__o)		(PAGE_SIZE << (__o))
++#define CHARGE_SIZE(__s)		(__s)
++#endif
++
++#ifdef CONFIG_BEANCOUNTERS
++#define page_ub(__page)	((__page)->bc.page_ub)
++#else
++#define page_ub(__page)	NULL
++#endif
++
++struct mm_struct;
++struct page;
++struct kmem_cache;
++
++UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj))
++UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj))
++
++UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub,
++		unsigned long size, enum ub_severity strict))
++UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub,
++		unsigned long size))
++
++UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask))
++UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order))
++UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep,
++			void *objp, gfp_t flags))
++UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj))
++
++#ifdef CONFIG_BEANCOUNTERS
++static inline int should_charge(struct kmem_cache *cachep, gfp_t flags)
++{
++	if (!(cachep->flags & SLAB_UBC))
++		return 0;
++	if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC))
++		return 0;
++	return 1;
++}
++
++#define should_uncharge(cachep)	should_charge(cachep, __GFP_UBC)
++#else
++#define should_charge(cache, f)	0
++#define should_uncharge(cache)	0
++#endif
++
++#endif /* __UB_SLAB_H_ */
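should_charge() encodes a small decision table: only SLAB_UBC caches are ever charged, and a SLAB_NO_CHARGE cache is charged only when the allocation itself passes __GFP_UBC. A compilable userspace sketch of that logic (the flag values below are invented for illustration):

    #include <stdio.h>

    #define SLAB_UBC        0x1     /* assumed flag values */
    #define SLAB_NO_CHARGE  0x2
    #define GFP_UBC         0x4

    static int should_charge(unsigned cache_flags, unsigned gfp_flags)
    {
            if (!(cache_flags & SLAB_UBC))
                    return 0;       /* cache opted out entirely */
            if ((cache_flags & SLAB_NO_CHARGE) && !(gfp_flags & GFP_UBC))
                    return 0;       /* charge only on explicit request */
            return 1;
    }

    int main(void)
    {
            printf("%d\n", should_charge(SLAB_UBC, 0));                          /* 1 */
            printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, 0));         /* 0 */
            printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, GFP_UBC));   /* 1 */
            return 0;
    }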
+diff --git a/include/bc/misc.h b/include/bc/misc.h
+new file mode 100644
+index 0000000..84082b2
+--- /dev/null
++++ b/include/bc/misc.h
+@@ -0,0 +1,55 @@
++/*
++ *  include/bc/misc.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_MISC_H_
++#define __BC_MISC_H_
++
++#include <bc/decl.h>
++
++struct tty_struct;
++struct file;
++struct file_lock;
++struct sigqueue;
++
++UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
++UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
++UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
++UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
++UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q,
++			struct user_beancounter *ub))
++UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q))
++UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent,
++			struct task_struct *task))
++UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task))
++UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task))
++UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
++UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
++
++#ifdef CONFIG_BEANCOUNTERS
++#define set_flock_charged(fl)	do { (fl)->fl_charged = 1; } while (0)
++#define unset_flock_charged(fl)	do {		\
++		WARN_ON((fl)->fl_charged == 0);	\
++		(fl)->fl_charged = 0;		\
++	} while (0)
++#define set_mm_ub(mm, tsk)	do {				\
++		(mm)->mm_ub = get_beancounter(tsk != current ?	\
++			tsk->task_bc.task_ub : get_exec_ub());	\
++	} while (0)
++#define put_mm_ub(mm)		do {				\
++		put_beancounter((mm)->mm_ub);			\
++		(mm)->mm_ub = NULL;				\
++	} while (0)
++#else
++#define set_flock_charged(fl)	do { } while (0)
++#define unset_flock_charged(fl)	do { } while (0)
++#define set_mm_ub(mm, tsk)	do { } while (0)
++#define put_mm_ub(mm)		do { } while (0)
++#endif
++#endif
+diff --git a/include/bc/net.h b/include/bc/net.h
+new file mode 100644
+index 0000000..5330a88
+--- /dev/null
++++ b/include/bc/net.h
+@@ -0,0 +1,215 @@
++/*
++ *  include/bc/net.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_NET_H_
++#define __BC_NET_H_
++
++/*
++ * UB_NUMXXXSOCK, UB_XXXBUF accounting
++ */
++
++#include <bc/decl.h>
++#include <bc/sock.h>
++#include <bc/beancounter.h>
++
++#define bid2sid(__bufid) \
++	((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK)
++
++#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \
++			~(SMP_CACHE_BYTES-1)))
++#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE)
++
++static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	memset(skb_bc(skb), 0, sizeof(struct skb_beancounter));
++#endif
++	return 0;
++}
++
++static inline void ub_skb_free_bc(struct sk_buff *skb)
++{
++}
++
++#define IS_TCP_SOCK(__family, __type) \
++		(((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM)
++
++/* number of sockets */
++UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type))
++UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) 
++UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk))
++UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk))
++
++/* management of queue for send space */
++UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, 
++			unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, 
++			unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk))
++
++/* send space */
++UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid,
++			unsigned long size))
++UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid,
++			unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid,
++			unsigned long size, unsigned long ressize))
++UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk,
++			struct sk_buff *skb, enum ub_severity strict))
++UB_DECLARE_VOID_FUNC(ub_sock_tcp_unchargesend(struct sock *sk,
++			unsigned long size))
++UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk))
++UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk))
++
++UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk))
++
++/* receive space */
++UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk,
++			struct sk_buff *skb, enum ub_severity strict))
++
++/* skb destructor */
++UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb))
++
++static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size)
++{
++	return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size);
++}
++
++static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size)
++{
++	return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size);
++}
++
++UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk,
++			unsigned long size))
++
++static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size)
++{
++	return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size);
++}
++
++UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk,
++			unsigned long size, unsigned long ressize))
++
++static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size,
++		unsigned long ressize)
++{
++	ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize);
++}
++
++static inline void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz)
++{
++	ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz);
++}
++
++static inline void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)
++{
++	ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz);
++}
++
++static inline int ub_tcpsndbuf_charge(struct sock *sk,
++		struct sk_buff *skb)
++{
++	return ub_sock_tcp_chargesend(sk, skb, UB_HARD);
++}
++
++static inline int ub_tcpsndbuf_charge_forced(struct sock *sk,
++		struct sk_buff *skb)
++{
++	return ub_sock_tcp_chargesend(sk, skb, UB_FORCE);
++}
++
++static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++	return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT);
++}
++
++static inline int ub_tcprcvbuf_charge_forced(struct sock *sk,
++		struct sk_buff *skb)
++{
++	return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE);
++}
++
++/* Charge size */
++static inline unsigned long skb_charge_datalen(unsigned long chargesize)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	unsigned long slabsize;
++
++	chargesize -= sizeof(struct sk_buff);
++	slabsize = 64;
++	do { 
++		slabsize <<= 1; 
++	} while (slabsize <= chargesize);
++
++	slabsize >>= 1;
++	return (slabsize - sizeof(struct skb_shared_info)) &
++		~(SMP_CACHE_BYTES-1);
++#else
++	return 0;
++#endif
++}
++
++static inline unsigned long skb_charge_size_gen(unsigned long size)
++{ 
++#ifdef CONFIG_BEANCOUNTERS
++	unsigned int slabsize;
++
++	size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info);
++	slabsize = 32; /* min size is 64 because of skb_shared_info */
++	do { 
++		slabsize <<= 1; 
++	} while (slabsize < size);
++
++	return slabsize + sizeof(struct sk_buff);
++#else
++	return 0;
++#endif
++
++}
++	
++static inline unsigned long skb_charge_size_const(unsigned long size)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	unsigned int ret;
++	if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64)
++		ret = 64 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128)
++		ret = 128 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256)
++		ret = 256 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512)
++		ret = 512 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024)
++		ret = 1024 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048)
++		ret = 2048 + sizeof(struct sk_buff);
++	else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096)
++		ret = 4096 + sizeof(struct sk_buff);
++	else
++		ret = skb_charge_size_gen(size);
++	return ret;
++#else
++	return 0;
++#endif
++}
++
++
++#define skb_charge_size(__size)			\
++	(__builtin_constant_p(__size)	?	\
++	 skb_charge_size_const(__size)	:	\
++	 skb_charge_size_gen(__size))
++
++UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb))
++UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, 
++			struct sock *sk, unsigned long size, int res))
++
++#endif
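skb_charge_size_gen() rounds the data size plus skb_shared_info up to the next power-of-two slab bucket and then adds the sk_buff header itself; skb_charge_size_const() is merely the unrolled version for compile-time constants, and the start at 32 guarantees the 64-byte minimum named in the comment above. A userspace sketch of the rounding, with SKB_DATA_ALIGN dropped for brevity and both struct sizes replaced by assumed constants:

    #include <stdio.h>

    #define SKB_SIZE     256    /* assumed sizeof(struct sk_buff) */
    #define SHINFO_SIZE  320    /* assumed sizeof(struct skb_shared_info) */

    static unsigned long charge_size(unsigned long size)
    {
            unsigned long slabsize = 32;    /* doubles at least once: min 64 */

            size += SHINFO_SIZE;
            do {
                    slabsize <<= 1;         /* next power-of-two bucket */
            } while (slabsize < size);

            return slabsize + SKB_SIZE;
    }

    int main(void)
    {
            /* 100 + 320 = 420 -> bucket 512 -> charge 512 + 256 = 768 */
            printf("%lu\n", charge_size(100));
            return 0;
    }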
+diff --git a/include/bc/oom_kill.h b/include/bc/oom_kill.h
+new file mode 100644
+index 0000000..c07608f
+--- /dev/null
++++ b/include/bc/oom_kill.h
+@@ -0,0 +1,26 @@
++#include <bc/decl.h>
++#include <bc/task.h>
++
++UB_DECLARE_FUNC(int, ub_oom_lock(void))
++UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void))
++UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub))
++UB_DECLARE_VOID_FUNC(ub_oom_unlock(void))
++UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub))
++UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk))
++UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub,
++			struct task_struct *tsk))
++
++#ifdef CONFIG_BEANCOUNTERS
++extern int oom_generation;
++extern int oom_kill_counter;
++#define ub_oom_start() do {						\
++		current->task_bc.oom_generation = oom_generation;	\
++	} while (0)
++#define ub_oom_task_killed(p) do { 					\
++		oom_kill_counter++;					\
++		wake_up_process(p);					\
++	} while (0)
++#else
++#define ub_oom_start()			do { } while (0)
++#define ub_oom_task_killed(p)		do { } while (0)
++#endif
+diff --git a/include/bc/proc.h b/include/bc/proc.h
+new file mode 100644
+index 0000000..f244523
+--- /dev/null
++++ b/include/bc/proc.h
+@@ -0,0 +1,40 @@
++/*
++ *  include/bc/proc.h
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_PROC_H_
++#define __UB_PROC_H_
++
++#include <linux/seq_file.h>
++
++struct bc_proc_entry {
++	char *name;
++	union {
++		int (*show)(struct seq_file *, void *);
++		struct file_operations *fops;
++	} u;
++	struct bc_proc_entry *next;
++	int cookie;
++};
++
++struct user_beancounter;
++
++void bc_register_proc_entry(struct bc_proc_entry *);
++void bc_register_proc_root_entry(struct bc_proc_entry *);
++
++static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
++{
++	return (struct user_beancounter *)(f->private);
++}
++
++extern const char *bc_proc_lu_fmt;
++extern const char *bc_proc_lu_lfmt;
++extern const char *bc_proc_llu_fmt;
++extern const char *bc_proc_lu_lu_fmt;
++#endif
+diff --git a/include/bc/rss_pages.h b/include/bc/rss_pages.h
+new file mode 100644
+index 0000000..b195961
+--- /dev/null
++++ b/include/bc/rss_pages.h
+@@ -0,0 +1,57 @@
++/*
++ *  include/bc/rss_pages.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __RSS_PAGES_H_
++#define __RSS_PAGES_H_
++
++/*
++ * Page_beancounters
++ */
++
++struct page;
++struct user_beancounter;
++
++#define PB_MAGIC 0x62700001UL
++
++struct page_beancounter {
++	unsigned long pb_magic;
++	struct page *page;
++	struct user_beancounter *ub;
++	union {
++		struct page_beancounter *next_hash;
++		struct page_beancounter *page_pb_list;
++	};
++	union {
++		unsigned refcount;
++		unsigned io_debug;
++	};
++	union {
++		struct list_head page_list;
++		struct list_head io_list;
++	};
++};
++
++#define PB_REFCOUNT_BITS 24
++#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS)
++#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS))
++#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS))
++#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1))
++#define PB_COUNT_INC(c) ((c)++)
++#define PB_COUNT_DEC(c) ((c)--)
++#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c))
++
++#define page_pbc(__page)        ((__page)->bc.page_pb)
++
++extern spinlock_t pb_lock;
++
++struct address_space;
++extern int is_shmem_mapping(struct address_space *);
++
++#endif
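The PB_* macros pack two counters into one word: the low 24 bits hold a reference count, everything above holds a "shift" value, and inc/dec on either field is a plain add of the right magnitude. A small standalone sketch of the packing:

    #include <stdio.h>

    #define REFCOUNT_BITS 24
    #define COUNT_GET(c)  ((c) & ((1u << REFCOUNT_BITS) - 1))
    #define SHIFT_GET(c)  ((c) >> REFCOUNT_BITS)
    #define MAKE(s, c)    (((s) << REFCOUNT_BITS) + (c))

    int main(void)
    {
            unsigned c = MAKE(3u, 1000u);

            c++;                            /* like PB_COUNT_INC */
            c += 1u << REFCOUNT_BITS;       /* like PB_SHIFT_INC */
            printf("shift=%u count=%u\n", SHIFT_GET(c), COUNT_GET(c));
            /* prints: shift=4 count=1001 */
            return 0;
    }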
+diff --git a/include/bc/sock.h b/include/bc/sock.h
+new file mode 100644
+index 0000000..b314c9b
+--- /dev/null
++++ b/include/bc/sock.h
+@@ -0,0 +1,47 @@
++/*
++ *  include/bc/sock.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_SOCK_H_
++#define __BC_SOCK_H_
++
++#include <bc/task.h>
++
++struct sock;
++struct sk_buff;
++
++struct skb_beancounter {
++	struct user_beancounter *ub;
++	unsigned long charged:27, resource:5;
++};
++
++struct sock_beancounter {
++	struct user_beancounter *ub;
++	/*
++	 * poll_reserv accounts space already charged for future sends.
++	 * It is required to make poll agree with sendmsg.
++	 * Additionally, it makes real charges (which take the bc spinlock)
++	 * in the send path rarer, speeding networking up.
++	 * For TCP (only): changes are protected by socket lock (not bc!)
++	 * For all proto: may be read without serialization in poll.
++	 */
++	unsigned long           poll_reserv;
++	unsigned long		forw_space;
++	/* fields below are protected by bc spinlock */
++	unsigned long           ub_waitspc;     /* space waiting for */
++	unsigned long           ub_wcharged;
++	struct list_head        ub_sock_list;
++};
++
++#define sock_bc(__sk)		(&(__sk)->sk_bc)
++#define skb_bc(__skb)		(&(__skb)->skb_bc)
++#define skbc_sock(__skbc)	(container_of(__skbc, struct sock, sk_bc))
++#define sock_has_ubc(__sk)	(sock_bc(__sk)->ub != NULL)
++
++#endif
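struct skb_beancounter squeezes the charged byte count and the resource id into a single long via a 27/5 bitfield split, capping one skb's charge at 2^27 bytes (128MB) and the resource id at 31. A short sketch showing those limits (gcc permits bitfields of unsigned long, as the kernel does):

    #include <stdio.h>

    struct skb_bc_like {
            unsigned long charged:27, resource:5;
    };

    int main(void)
    {
            struct skb_bc_like bc = {
                    .charged  = (1ul << 27) - 1,    /* max 134217727 bytes */
                    .resource = 31,                 /* max id in 5 bits */
            };

            printf("max charge %lu, max resource id %lu\n",
                   (unsigned long)bc.charged, (unsigned long)bc.resource);
            return 0;
    }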
+diff --git a/include/bc/sock_orphan.h b/include/bc/sock_orphan.h
+new file mode 100644
+index 0000000..038d52b
+--- /dev/null
++++ b/include/bc/sock_orphan.h
+@@ -0,0 +1,106 @@
++/*
++ *  include/bc/sock_orphan.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_SOCK_ORPHAN_H_
++#define __BC_SOCK_ORPHAN_H_
++
++#include <net/tcp.h>
++
++#include "bc/beancounter.h"
++#include "bc/net.h"
++
++
++static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	if (sock_has_ubc(sk))
++		return &sock_bc(sk)->ub->ub_orphan_count;
++#endif
++	return sk->sk_prot->orphan_count;
++}
++
++static inline void ub_inc_orphan_count(struct sock *sk)
++{
++	atomic_inc(__ub_get_orphan_count_ptr(sk));
++}
++
++static inline void ub_dec_orphan_count(struct sock *sk)
++{
++	atomic_dec(__ub_get_orphan_count_ptr(sk));
++}
++
++static inline int ub_get_orphan_count(struct sock *sk)
++{
++	return atomic_read(__ub_get_orphan_count_ptr(sk));
++}
++
++extern int __ub_too_many_orphans(struct sock *sk, int count);
++static inline int ub_too_many_orphans(struct sock *sk, int count)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	if (__ub_too_many_orphans(sk, count))
++		return 1;
++#endif
++	return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans ||
++		(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
++		 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]));
++}
++
++#include <bc/kmem.h>
++
++struct inet_timewait_sock;
++
++static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *ub;
++
++	ub = slab_ub(tw);
++	if (ub != NULL)
++		ub->ub_tw_count += incdec;
++#endif
++}
++
++static inline int __ub_timewait_check(struct sock *sk)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *ub;
++	unsigned long mem_max, mem;
++	int tw_count;
++
++	ub = sock_bc(sk)->ub;
++	if (ub == NULL)
++		return 1;
++
++	tw_count = ub->ub_tw_count;
++	mem_max = sysctl_tcp_max_tw_kmem_fraction *
++		((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1);
++	mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab);
++	mem *= tw_count;
++	return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max;
++#else
++	return 1;
++#endif
++}
++
++#define ub_timewait_inc(tw, twdr) do {			\
++		if ((twdr)->ub_managed)			\
++			ub_timewait_mod(tw, 1);		\
++	} while (0)
++
++#define ub_timewait_dec(tw, twdr) do {			\
++		if ((twdr)->ub_managed)			\
++			ub_timewait_mod(tw, -1);	\
++	} while (0)
++
++#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \
++					__ub_timewait_check(sk))
++
++#endif
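__ub_timewait_check() limits timewait buckets by memory as well as by count: the cap is the kmem fraction sysctl times roughly the UB_KMEMSIZE limit in kilobytes, compared against per-object slab usage times the bucket count. A worked sketch with assumed numbers (the sysctl value and object size below are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
            unsigned long kmem_limit = 64ul << 20;  /* assumed 64MB limit */
            unsigned long fraction   = 384;         /* assumed sysctl value */
            unsigned long objuse     = 128;         /* assumed tw slab objuse */
            unsigned long tw_count   = 100000;

            unsigned long mem_max = fraction * ((kmem_limit >> 10) + 1);
            unsigned long mem     = objuse * tw_count;

            /* 384 * 65537 = 25166208 vs 128 * 100000 = 12800000 -> allowed */
            printf("mem_max=%lu mem=%lu allowed=%d\n",
                   mem_max, mem, mem < mem_max);
            return 0;
    }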
+diff --git a/include/bc/statd.h b/include/bc/statd.h
+new file mode 100644
+index 0000000..9dafc5e
+--- /dev/null
++++ b/include/bc/statd.h
+@@ -0,0 +1,70 @@
++/*
++ *  include/bc/statd.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_STATD_H_
++#define __BC_STATD_H_
++
++/* sys_ubstat commands list */
++#define UBSTAT_READ_ONE			0x010000
++#define UBSTAT_READ_ALL			0x020000
++#define UBSTAT_READ_FULL		0x030000
++#define UBSTAT_UBLIST			0x040000
++#define UBSTAT_UBPARMNUM		0x050000
++#define UBSTAT_GETTIME			0x060000
++
++#define UBSTAT_CMD(func)		((func) & 0xF0000)
++#define UBSTAT_PARMID(func)		((func) & 0x0FFFF)
++
++#define TIME_MAX_SEC		(LONG_MAX / HZ)
++#define TIME_MAX_JIF		(TIME_MAX_SEC * HZ)
++
++typedef unsigned long ubstattime_t;
++
++typedef struct {
++	ubstattime_t	start_time;
++	ubstattime_t	end_time;
++	ubstattime_t	cur_time;
++} ubgettime_t;
++
++typedef struct {
++	long		maxinterval;
++	int		signum;
++} ubnotifrq_t;
++
++typedef struct {
++	unsigned long	maxheld;
++	unsigned long	failcnt;
++} ubstatparm_t;
++
++typedef struct {
++	unsigned long	barrier;
++	unsigned long	limit;
++	unsigned long	held;
++	unsigned long	maxheld;
++	unsigned long	minheld;
++	unsigned long	failcnt;
++	unsigned long __unused1;
++	unsigned long __unused2;
++} ubstatparmf_t;
++
++typedef struct {
++	ubstattime_t	start_time;
++	ubstattime_t	end_time;
++	ubstatparmf_t	param[0];
++} ubstatfull_t;
++
++#ifdef __KERNEL__
++struct ub_stat_notify {
++	struct list_head	list;
++	struct task_struct	*task;
++	int			signum;
++};
++#endif
++#endif
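A sys_ubstat function code carries both the command and the parameter id in one integer: bits 16-19 select the command, the low 16 bits the parameter. A small sketch of encoding and decoding such a code:

    #include <stdio.h>

    #define UBSTAT_READ_ONE   0x010000
    #define UBSTAT_CMD(f)     ((f) & 0xF0000)
    #define UBSTAT_PARMID(f)  ((f) & 0x0FFFF)

    int main(void)
    {
            unsigned func = UBSTAT_READ_ONE | 7;    /* read parameter #7 */

            printf("cmd=%#x parmid=%u\n", UBSTAT_CMD(func), UBSTAT_PARMID(func));
            /* prints: cmd=0x10000 parmid=7 */
            return 0;
    }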
+diff --git a/include/bc/task.h b/include/bc/task.h
+new file mode 100644
+index 0000000..f5a2915
+--- /dev/null
++++ b/include/bc/task.h
+@@ -0,0 +1,69 @@
++/*
++ *  include/bc/task.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_TASK_H_
++#define __BC_TASK_H_
++
++struct user_beancounter;
++
++
++#ifdef CONFIG_BEANCOUNTERS
++struct task_beancounter {
++	struct user_beancounter	*exec_ub;
++	struct user_beancounter *saved_ub;
++	struct user_beancounter	*task_ub;
++	struct user_beancounter *fork_sub;
++	unsigned long file_precharged, file_quant, file_count;
++	unsigned long kmem_precharged;
++	char dentry_alloc, pgfault_handle;
++	void *task_fnode, *task_freserv;
++	unsigned long oom_generation;
++	unsigned long task_data[4];
++	unsigned long pgfault_allot;
++};
++
++#define get_task_ub(__task)	((__task)->task_bc.task_ub)
++
++extern struct user_beancounter ub0;
++#define get_ub0()	(&ub0)
++
++#define ub_save_context(t)	do {				\
++		t->task_bc.saved_ub = t->task_bc.exec_ub;	\
++		t->task_bc.exec_ub = get_ub0();			\
++	} while (0)
++#define ub_restore_context(t)	do {				\
++		t->task_bc.exec_ub = t->task_bc.saved_ub;	\
++	} while (0)
++
++#define get_exec_ub()		(current->task_bc.exec_ub)
++#define set_exec_ub(__newub)		\
++({					\
++	struct user_beancounter *old;	\
++	struct task_beancounter *tbc;	\
++ 					\
++	tbc = &current->task_bc;	\
++	old = tbc->exec_ub;		\
++	tbc->exec_ub = __newub;		\
++	old;				\
++})
++
++void ub_init_task_bc(struct task_beancounter *);
++
++#else /* CONFIG_BEANCOUNTERS */
++
++#define get_ub0()		(NULL)
++#define get_exec_ub()		(NULL)
++#define get_task_ub(task)	(NULL)
++#define set_exec_ub(__ub)	(NULL)
++#define ub_save_context(t)	do { } while (0)
++#define ub_restore_context(t)	do { } while (0)
++
++#endif /* CONFIG_BEANCOUNTERS */
++#endif /* __BC_TASK_H_ */
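set_exec_ub() is a gcc statement expression that installs a new execution beancounter and evaluates to the old one, so a caller can bracket a region and restore on exit; ub_save_context()/ub_restore_context() apply the same idea with ub0. A userspace sketch of the swap idiom (hypothetical names; requires the gcc statement-expression extension):

    #include <stdio.h>

    static int current_ctx = 1;     /* stands in for exec_ub */

    /* Install a new context, hand back the old one. */
    #define set_ctx(__new)                  \
    ({                                      \
            int __old = current_ctx;        \
            current_ctx = (__new);          \
            __old;                          \
    })

    int main(void)
    {
            int old = set_ctx(2);           /* enter borrowed context */

            printf("running in ctx %d\n", current_ctx);
            set_ctx(old);                   /* restore on the way out */
            printf("back in ctx %d\n", current_ctx);
            return 0;
    }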
+diff --git a/include/bc/tcp.h b/include/bc/tcp.h
+new file mode 100644
+index 0000000..d2bf748
+--- /dev/null
++++ b/include/bc/tcp.h
+@@ -0,0 +1,76 @@
++/*
++ *  include/bc/tcp.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __BC_TCP_H_
++#define __BC_TCP_H_
++
++/*
++ * UB_NUMXXXSOCK, UB_XXXBUF accounting
++ */
++
++#include <bc/sock.h>
++#include <bc/beancounter.h>
++
++static inline void ub_tcp_update_maxadvmss(struct sock *sk)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	if (!sock_has_ubc(sk))
++		return;
++	if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss)
++		return;
++
++	sock_bc(sk)->ub->ub_maxadvmss =
++		skb_charge_size(MAX_HEADER + sizeof(struct iphdr)
++				+ sizeof(struct tcphdr)	+ tcp_sk(sk)->advmss);
++#endif
++}
++
++static inline int ub_tcp_rmem_allows_expand(struct sock *sk)
++{
++	if (tcp_memory_pressure)
++		return 0;
++#ifdef CONFIG_BEANCOUNTERS
++	if (sock_has_ubc(sk)) {
++		struct user_beancounter *ub;
++
++		ub = sock_bc(sk)->ub;
++		if (ub->ub_rmem_pressure == UB_RMEM_EXPAND)
++			return 1;
++		if (ub->ub_rmem_pressure == UB_RMEM_SHRINK)
++			return 0;
++		return sk->sk_rcvbuf <= ub->ub_rmem_thres;
++	}
++#endif
++	return 1;
++}
++
++static inline int ub_tcp_memory_pressure(struct sock *sk)
++{
++	if (tcp_memory_pressure)
++		return 1;
++#ifdef CONFIG_BEANCOUNTERS
++	if (sock_has_ubc(sk))
++		return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND;
++#endif
++	return 0;
++}
++
++static inline int ub_tcp_shrink_rcvbuf(struct sock *sk)
++{
++	if (tcp_memory_pressure)
++		return 1;
++#ifdef CONFIG_BEANCOUNTERS
++	if (sock_has_ubc(sk))
++		return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK;
++#endif
++	return 0;
++}
++
++#endif
+diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h
+new file mode 100644
+index 0000000..09642e3
+--- /dev/null
++++ b/include/bc/vmpages.h
+@@ -0,0 +1,152 @@
++/*
++ *  include/bc/vmpages.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_PAGES_H_
++#define __UB_PAGES_H_
++
++#include <linux/linkage.h>
++#include <bc/beancounter.h>
++#include <bc/decl.h>
++
++/*
++ * Check whether vma has private or copy-on-write mapping.
++ * Should match checks in ub_protected_charge().
++ */
++#define VM_UB_PRIVATE(__flags, __file)					\
++		( ((__flags) & VM_WRITE) ?				\
++			(__file) == NULL || !((__flags) & VM_SHARED) :	\
++			0						\
++		)
++
++/* Mprotect charging result */
++#define PRIVVM_ERROR		-1
++#define PRIVVM_NO_CHARGE	 0 /* UB_DECLARE_FUNC retval with ubc off */
++#define PRIVVM_TO_PRIVATE	 1
++#define PRIVVM_TO_SHARED	 2
++
++UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm,
++			unsigned long size,
++			unsigned long newflags,
++			struct vm_area_struct *vma))
++
++UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm,
++			struct vm_area_struct *vma,
++			unsigned long num))
++#define ub_unused_privvm_inc(mm, vma)	ub_unused_privvm_add(mm, vma, 1)
++UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm,
++			struct vm_area_struct *vma,
++			unsigned long num))
++#define ub_unused_privvm_dec(mm, vma)	ub_unused_privvm_sub(mm, vma, 1)
++
++UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm,
++			long sz))
++
++UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
++			unsigned long size,
++			unsigned vm_flags,
++			struct file *vm_file,
++			int strict))
++UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
++			unsigned long size,
++			unsigned vm_flags,
++			struct file *vm_file))
++
++struct shmem_inode_info;
++UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i,
++			unsigned long sz))
++UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i,
++			unsigned long sz))
++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi))
++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
++			unsigned long size))
++#define ub_tmpfs_respages_dec(shi)	ub_tmpfs_respages_sub(shi, 1)
++
++#ifdef CONFIG_BEANCOUNTERS
++#define shmi_ub_set(shi, ub)	do {			\
++		(shi)->shmi_ub = get_beancounter(ub);	\
++	} while (0)
++#define shmi_ub_put(shi)	do {			\
++		put_beancounter((shi)->shmi_ub);	\
++		(shi)->shmi_ub = NULL;			\
++	} while (0)
++#else
++#define shmi_ub_set(shi, ub)	do { } while (0)
++#define shmi_ub_put(shi)	do { } while (0)
++#endif
++
++UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
++			unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
++			unsigned long size))
++UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
++			unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
++			unsigned long size))
++
++UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma,
++			unsigned long addr, unsigned long end))
++#define pages_in_vma(vma)	(pages_in_vma_range(vma, \
++			vma->vm_start, vma->vm_end))
++
++#define UB_PAGE_WEIGHT_SHIFT 24
++#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT)
++
++struct page_beancounter;
++#define PBC_COPY_SAME	((struct page_beancounter *) 1)
++
++extern void __ub_update_physpages(struct user_beancounter *ub);
++extern void __ub_update_oomguarpages(struct user_beancounter *ub);
++extern void __ub_update_privvm(struct user_beancounter *ub);
++
++#ifdef CONFIG_BC_RSS_ACCOUNTING
++#define PB_DECLARE_FUNC(ret, decl)	UB_DECLARE_FUNC(ret, decl)
++#define PB_DECLARE_VOID_FUNC(decl)	UB_DECLARE_VOID_FUNC(decl)
++#else
++#define PB_DECLARE_FUNC(ret, decl)	static inline ret decl {return (ret)0;}
++#define PB_DECLARE_VOID_FUNC(decl)	static inline void decl { }
++#endif
++
++PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc))
++PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num))
++PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page,
++			struct mm_struct *mm,
++			struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, 
++			struct mm_struct *mm, 
++			struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb))
++PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb))
++PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, 
++			struct mm_struct *mm))
++
++PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page))
++#endif
++
++#ifdef CONFIG_BC_SWAP_ACCOUNTING
++#define SWP_DECLARE_FUNC(ret, decl)	UB_DECLARE_FUNC(ret, decl)
++#define SWP_DECLARE_VOID_FUNC(decl)	UB_DECLARE_VOID_FUNC(decl)
++#else
++#define SWP_DECLARE_FUNC(ret, decl)	static inline ret decl {return (ret)0;}
++#define SWP_DECLARE_VOID_FUNC(decl)	static inline void decl { }
++#endif
++
++struct swap_info_struct;
++SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n))
++SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si))
++SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n,
++			struct user_beancounter *ub))
++SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n))
+diff --git a/include/linux/aio.h b/include/linux/aio.h
+index b51ddd2..6fb5195 100644
+--- a/include/linux/aio.h
++++ b/include/linux/aio.h
+@@ -225,4 +225,8 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
+ extern unsigned long aio_nr;
+ extern unsigned long aio_max_nr;
+ 
++void wait_for_all_aios(struct kioctx *ctx);
++extern struct kmem_cache *kioctx_cachep;
++extern void aio_kick_handler(struct work_struct *);
++
+ #endif /* __LINUX__AIO_H */
+diff --git a/include/linux/capability.h b/include/linux/capability.h
+index 0267384..ab16bc6 100644
+--- a/include/linux/capability.h
++++ b/include/linux/capability.h
+@@ -186,12 +186,9 @@ typedef struct kernel_cap_struct {
+ 
+ #define CAP_NET_BROADCAST    11
+ 
+-/* Allow interface configuration */
+ /* Allow administration of IP firewall, masquerading and accounting */
+ /* Allow setting debug option on sockets */
+ /* Allow modification of routing tables */
+-/* Allow setting arbitrary process / process group ownership on
+-   sockets */
+ /* Allow binding to any address for transparent proxying */
+ /* Allow setting TOS (type of service) */
+ /* Allow setting promiscuous mode */
+@@ -221,6 +218,7 @@ typedef struct kernel_cap_struct {
+ #define CAP_SYS_MODULE       16
+ 
+ /* Allow ioperm/iopl access */
++/* Allow O_DIRECT access */
+ /* Allow sending USB messages to any device via /proc/bus/usb */
+ 
+ #define CAP_SYS_RAWIO        17
+@@ -239,24 +237,19 @@ typedef struct kernel_cap_struct {
+ 
+ /* Allow configuration of the secure attention key */
+ /* Allow administration of the random device */
+-/* Allow examination and configuration of disk quotas */
+ /* Allow configuring the kernel's syslog (printk behaviour) */
+ /* Allow setting the domainname */
+ /* Allow setting the hostname */
+ /* Allow calling bdflush() */
+-/* Allow mount() and umount(), setting up new smb connection */
++/* Allow setting up new smb connection */
+ /* Allow some autofs root ioctls */
+ /* Allow nfsservctl */
+ /* Allow VM86_REQUEST_IRQ */
+ /* Allow to read/write pci config on alpha */
+ /* Allow irix_prctl on mips (setstacksize) */
+ /* Allow flushing all cache on m68k (sys_cacheflush) */
+-/* Allow removing semaphores */
+-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
+-   and shared memory */
+ /* Allow locking/unlocking of shared memory segment */
+ /* Allow turning swap on/off */
+-/* Allow forged pids on socket credentials passing */
+ /* Allow setting readahead and flushing buffers on block devices */
+ /* Allow setting geometry in floppy driver */
+ /* Allow turning DMA on/off in xd driver */
+@@ -329,6 +322,50 @@ typedef struct kernel_cap_struct {
+ 
+ #define CAP_SETFCAP	     31
+ 
++#ifdef __KERNEL__
++/*
++ * Important note: VZ capabilities intersect with CAP_AUDIT;
++ * this is for compatibility reasons and is harmless.
++ * Both VZ and Audit/SELinux caps are disabled in VPSs.
++ */
++
++/* Allow access to all information. Otherwise some structures are hidden
++   to ensure that different Virtual Environments on the same node do not
++   interact */
++#define CAP_SETVEID	     29
++
++#define CAP_VE_ADMIN	     30
++
++#ifdef CONFIG_VE
++
++/* Replacement for CAP_NET_ADMIN:
++   rights delegated to the Virtual Environment for its own network administration.
++   For now the following rights have been delegated:
++
++   Allow setting arbitrary process / process group ownership on sockets
++   Allow interface configuration
++ */
++#define CAP_VE_NET_ADMIN     CAP_VE_ADMIN
++
++/* Replacement for CAP_SYS_ADMIN:
++   rights delegated to the Virtual Environment for its own administration.
++   For now the following rights have been delegated:
++ */
++/* Allow mount/umount/remount */
++/* Allow examination and configuration of disk quotas */
++/* Allow removing semaphores */
++/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
++   and shared memory */
++/* Allow locking/unlocking of shared memory segment */
++/* Allow forged pids on socket credentials passing */
++
++#define CAP_VE_SYS_ADMIN     CAP_VE_ADMIN
++#else
++#define CAP_VE_NET_ADMIN     CAP_NET_ADMIN
++#define CAP_VE_SYS_ADMIN     CAP_SYS_ADMIN
++#endif
++#endif
++
+ /* Override MAC access.
+    The base kernel enforces no MAC policy.
+    An LSM may enforce a MAC policy, and if it does and it chooses
+@@ -390,7 +427,16 @@ typedef struct kernel_cap_struct {
+ #define CAP_INIT_INH_SET    CAP_EMPTY_SET
+ 
+ # define cap_clear(c)         do { (c) = __cap_empty_set; } while (0)
++#ifndef CONFIG_VE
+ # define cap_set_full(c)      do { (c) = __cap_full_set; } while (0)
++#else
++# define cap_set_full(c)      do {			\
++		if (ve_is_super(get_exec_env()))	\
++			(c) = __cap_full_set;		\
++		else					\
++			(c) = get_exec_env()->ve_cap_bset;\
++	} while (0)
++#endif
+ # define cap_set_init_eff(c)  do { (c) = __cap_init_eff_set; } while (0)
+ 
+ #define cap_raise(c, flag)  ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
+@@ -503,6 +549,10 @@ extern const kernel_cap_t __cap_init_eff_set;
+ 
+ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
+ 
++#include <linux/spinlock_types.h>
++
++extern spinlock_t task_capability_lock;
++
+ int capable(int cap);
+ int __capable(struct task_struct *t, int cap);
+ 
+diff --git a/include/linux/cfq-iosched.h b/include/linux/cfq-iosched.h
+new file mode 100644
+index 0000000..b414c4a
+--- /dev/null
++++ b/include/linux/cfq-iosched.h
+@@ -0,0 +1,148 @@
++#ifndef _LINUX_CFQ_IOSCHED_H
++#define _LINUX_CFQ_IOSCHED_H
++
++#include <linux/ioprio.h>
++#include <linux/rbtree.h>
++#include <linux/blkdev.h>
++
++extern struct kmem_cache *cfq_pool;
++
++#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
++
++/*
++ * Most of our rbtree usage is for sorting with min extraction, so
++ * if we cache the leftmost node we don't have to walk down the tree
++ * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
++ * move this into the elevator for the rq sorting as well.
++ */
++struct cfq_rb_root {
++	struct rb_root rb;
++	struct rb_node *left;
++};
++#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, }
++
++/*
++ * Per (Device, UBC) queue data
++ */
++struct cfq_bc_data {
++	/* for ub.iopriv->cfq_bc_head */
++	struct list_head	cfq_bc_list;
++	/* for cfqd->act_cfq_bc_head */
++	struct list_head	act_cfq_bc_list;
++
++	struct cfq_data		*cfqd;
++	struct ub_iopriv	*ub_iopriv;
++
++	/*
++	 * rr list of queues with requests and the count of them
++	 */
++	struct cfq_rb_root	service_tree;
++
++	int			cur_prio;
++	int			cur_end_prio;
++
++	unsigned long		rqnum;
++	unsigned long		on_dispatch;
++
++	/*
++	 * async queue for each priority case
++	 */
++	struct cfq_queue	*async_cfqq[2][CFQ_PRIO_LISTS];
++	struct cfq_queue	*async_idle_cfqq;
++};
++
++/*
++ * Per block device queue structure
++ */
++struct cfq_data {
++	struct request_queue *queue;
++
++#ifndef CONFIG_BC_IO_SCHED
++	struct cfq_bc_data cfq_bc;
++#endif
++	unsigned int busy_queues;
++
++	int rq_in_driver;
++	int sync_flight;
++	int hw_tag;
++
++	/*
++	 * idle window management
++	 */
++	struct timer_list idle_slice_timer;
++	struct work_struct unplug_work;
++
++	struct cfq_queue *active_queue;
++	struct cfq_io_context *active_cic;
++
++	sector_t last_position;
++	unsigned long last_end_request;
++
++	/*
++	 * tunables, see top of file
++	 */
++	unsigned int cfq_quantum;
++	unsigned int cfq_fifo_expire[2];
++	unsigned int cfq_back_penalty;
++	unsigned int cfq_back_max;
++	unsigned int cfq_slice[2];
++	unsigned int cfq_slice_async_rq;
++	unsigned int cfq_slice_idle;
++
++	struct list_head cic_list;
++
++	/* list of ub that have requests */
++	struct list_head act_cfq_bc_head;
++	/* ub that owns a timeslice at the moment */
++	struct cfq_bc_data *active_cfq_bc;
++	unsigned int cfq_ub_slice;
++	unsigned long slice_end;
++	int virt_mode;
++	int write_virt_mode;
++};
++
++/*
++ * Per process-grouping structure
++ */
++struct cfq_queue {
++	/* reference count */
++	atomic_t ref;
++	/* various state flags, see below */
++	unsigned int flags;
++	/* parent cfq_data */
++	struct cfq_data *cfqd;
++	/* service_tree member */
++	struct rb_node rb_node;
++	/* service_tree key */
++	unsigned long rb_key;
++	/* sorted list of pending requests */
++	struct rb_root sort_list;
++	/* if fifo isn't expired, next request to serve */
++	struct request *next_rq;
++	/* requests queued in sort_list */
++	int queued[2];
++	/* currently allocated requests */
++	int allocated[2];
++	/* fifo list of requests in sort_list */
++	struct list_head fifo;
++
++	unsigned long slice_end;
++	long slice_resid;
++
++	/* pending metadata requests */
++	int meta_pending;
++	/* number of requests that are on the dispatch list or inside driver */
++	int dispatched;
++
++	/* io prio of this group */
++	unsigned short ioprio, org_ioprio;
++	unsigned short ioprio_class, org_ioprio_class;
++
++	struct cfq_bc_data *cfq_bc;
++};
++
++static inline void cfq_init_cfq_bc(struct cfq_bc_data *cfq_bc)
++{
++	cfq_bc->service_tree = CFQ_RB_ROOT;
++}
++#endif /* _LINUX_CFQ_IOSCHED_H */
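struct cfq_rb_root caches the leftmost rbtree node so repeated min extraction avoids walking down the tree each time; the cache is refilled lazily when empty (the kernel also invalidates it when the cached node is erased, omitted here). A simplified standalone sketch of the idea, using a plain binary search tree instead of an rbtree:

    #include <stdio.h>
    #include <stddef.h>

    struct node { long key; struct node *l, *r; };
    struct root { struct node *rb; struct node *left; /* cached minimum */ };

    static struct node *subtree_min(struct node *n)
    {
            while (n && n->l)
                    n = n->l;
            return n;
    }

    static struct node *first(struct root *r)
    {
            if (!r->left)                   /* cache miss: walk down once */
                    r->left = subtree_min(r->rb);
            return r->left;                 /* O(1) on repeat lookups */
    }

    int main(void)
    {
            struct node c = { 30, NULL, NULL };
            struct node b = { 10, NULL, NULL };
            struct node a = { 20, &b, &c };
            struct root r = { &a, NULL };

            printf("min=%ld\n", first(&r)->key);    /* walks the tree */
            printf("min=%ld\n", first(&r)->key);    /* served from cache */
            return 0;
    }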
+diff --git a/include/linux/compat.h b/include/linux/compat.h
+index cf8d11c..3c778e2 100644
+--- a/include/linux/compat.h
++++ b/include/linux/compat.h
+@@ -238,6 +238,7 @@ extern int put_compat_itimerspec(struct compat_itimerspec __user *dst,
+ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
+ 
+ extern int compat_printk(const char *fmt, ...);
++extern int ve_compat_printk(int dst, const char *fmt, ...);
+ extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+ 
+ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+diff --git a/include/linux/cpt_image.h b/include/linux/cpt_image.h
+new file mode 100644
+index 0000000..ae331be
+--- /dev/null
++++ b/include/linux/cpt_image.h
+@@ -0,0 +1,1762 @@
++/*
++ *
++ *  include/linux/cpt_image.h
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __CPT_IMAGE_H_
++#define __CPT_IMAGE_H_ 1
++
++#define CPT_NULL (~0ULL)
++#define CPT_NOINDEX (~0U)
++
++/*
++ * Image file layout.
++ *
++ * - major header
++ * - sections[]
++ *
++ *	Each section is:
++ *	- section header
++ *	- array of objects
++ *
++ * All data records are arch independent, 64 bit aligned.
++ */
++
++enum _cpt_object_type
++{
++	CPT_OBJ_TASK = 0,
++	CPT_OBJ_MM,
++	CPT_OBJ_FS,
++	CPT_OBJ_FILES,
++	CPT_OBJ_FILE,
++	CPT_OBJ_SIGHAND_STRUCT,
++	CPT_OBJ_SIGNAL_STRUCT,
++	CPT_OBJ_TTY,
++	CPT_OBJ_SOCKET,
++	CPT_OBJ_SYSVSEM_UNDO,
++	CPT_OBJ_NAMESPACE,
++	CPT_OBJ_SYSV_SHM,
++	CPT_OBJ_INODE,
++	CPT_OBJ_UBC,
++	CPT_OBJ_SLM_SGREG,
++	CPT_OBJ_SLM_REGOBJ,
++	CPT_OBJ_SLM_MM,
++	CPT_OBJ_MAX,
++	/* The objects above are stored in memory while checkpointing */
++
++	CPT_OBJ_VMA = 1024,
++	CPT_OBJ_FILEDESC,
++	CPT_OBJ_SIGHANDLER,
++	CPT_OBJ_SIGINFO,
++	CPT_OBJ_LASTSIGINFO,
++	CPT_OBJ_SYSV_SEM,
++	CPT_OBJ_SKB,
++	CPT_OBJ_FLOCK,
++	CPT_OBJ_OPENREQ,
++	CPT_OBJ_VFSMOUNT,
++	CPT_OBJ_TRAILER,
++	CPT_OBJ_SYSVSEM_UNDO_REC,
++	CPT_OBJ_NET_DEVICE,
++	CPT_OBJ_NET_IFADDR,
++	CPT_OBJ_NET_ROUTE,
++	CPT_OBJ_NET_CONNTRACK,
++	CPT_OBJ_NET_CONNTRACK_EXPECT,
++	CPT_OBJ_AIO_CONTEXT,
++	CPT_OBJ_VEINFO,
++	CPT_OBJ_EPOLL,
++	CPT_OBJ_EPOLL_FILE,
++	CPT_OBJ_SKFILTER,
++	CPT_OBJ_SIGALTSTACK,
++  	CPT_OBJ_SOCK_MCADDR,
++	CPT_OBJ_BIND_MNT,
++	CPT_OBJ_SYSVMSG,
++	CPT_OBJ_SYSVMSG_MSG,
++
++	CPT_OBJ_X86_REGS = 4096,
++	CPT_OBJ_X86_64_REGS,
++	CPT_OBJ_PAGES,
++	CPT_OBJ_COPYPAGES,
++	CPT_OBJ_REMAPPAGES,
++	CPT_OBJ_LAZYPAGES,
++	CPT_OBJ_NAME,
++	CPT_OBJ_BITS,
++	CPT_OBJ_REF,
++	CPT_OBJ_ITERPAGES,
++	CPT_OBJ_ITERYOUNGPAGES,
++	CPT_OBJ_VSYSCALL,
++	CPT_OBJ_IA64_REGS,
++	CPT_OBJ_INOTIFY,
++	CPT_OBJ_INOTIFY_WATCH,
++	CPT_OBJ_INOTIFY_EVENT,
++	CPT_OBJ_TASK_AUX,
++	CPT_OBJ_NET_TUNTAP,
++	CPT_OBJ_NET_HWADDR,
++	CPT_OBJ_NET_VETH,
++	CPT_OBJ_NET_STATS,
++};
++
++#define CPT_ALIGN(n) (((n)+7)&~7)
++
++struct cpt_major_hdr
++{
++	__u8	cpt_signature[4];	/* Magic number */
++	__u16	cpt_hdrlen;		/* Length of this header */
++	__u16	cpt_image_version;	/* Format of this file */
++#define CPT_VERSION_MINOR(a)	((a) & 0xf)
++#define CPT_VERSION_8		0
++#define CPT_VERSION_9		0x100
++#define CPT_VERSION_9_1		0x101
++#define CPT_VERSION_9_2		0x102
++#define CPT_VERSION_16		0x200
++#define CPT_VERSION_18		0x300
++#define CPT_VERSION_18_1	0x301
++#define CPT_VERSION_20		0x400
++#define CPT_VERSION_24		0x500
++#define CPT_VERSION_26		0x600
++	__u16	cpt_os_arch;		/* Architecture */
++#define CPT_OS_ARCH_I386	0
++#define CPT_OS_ARCH_EMT64	1
++#define CPT_OS_ARCH_IA64	2
++	__u16	__cpt_pad1;
++	__u32	cpt_ve_features;	/* VE features */
++	__u32	cpt_ve_features2;	/* VE features */
++	__u16	cpt_pagesize;		/* Page size used by OS */
++	__u16	cpt_hz;			/* HZ used by OS */
++	__u64	cpt_start_jiffies64;	/* Jiffies */
++	__u32	cpt_start_sec;		/* Seconds */
++	__u32	cpt_start_nsec;		/* Nanoseconds */
++	__u32	cpt_cpu_caps[4];	/* CPU capabilities */
++	__u32	cpt_kernel_config[4];	/* Kernel config */
++	__u64	cpt_iptables_mask;	/* Used netfilter modules */
++} __attribute__ ((aligned (8)));
++
++#define CPT_SIGNATURE0 0x79
++#define CPT_SIGNATURE1 0x1c
++#define CPT_SIGNATURE2 0x01
++#define CPT_SIGNATURE3 0x63
++
++/* CPU capabilities */
++#define CPT_CPU_X86_CMOV	0
++#define CPT_CPU_X86_FXSR	1
++#define CPT_CPU_X86_SSE		2
++#define CPT_CPU_X86_SSE2	3
++#define CPT_CPU_X86_MMX		4
++#define CPT_CPU_X86_3DNOW	5
++#define CPT_CPU_X86_3DNOW2	6
++#define CPT_CPU_X86_SEP		7
++#define CPT_CPU_X86_EMT64	8
++#define CPT_CPU_X86_IA64	9
++#define CPT_CPU_X86_SYSCALL	10
++#define CPT_CPU_X86_SYSCALL32	11
++#define CPT_CPU_X86_SEP32	12
++
++/* Unsupported features */
++#define CPT_EXTERNAL_PROCESS	16
++#define CPT_NAMESPACES		17
++#define CPT_SCHEDULER_POLICY	18
++#define CPT_PTRACED_FROM_VE0	19
++#define CPT_UNSUPPORTED_FSTYPE	20
++#define CPT_BIND_MOUNT		21
++#define CPT_UNSUPPORTED_NETDEV	22
++#define CPT_UNSUPPORTED_MISC	23
++
++/* This mask is used to determine whether a VE
++   has some unsupported features or not */
++#define CPT_UNSUPPORTED_MASK	0xffff0000UL
++
++#define CPT_KERNEL_CONFIG_PAE	0
++
++struct cpt_section_hdr
++{
++	__u64	cpt_next;
++	__u32	cpt_section;
++	__u16	cpt_hdrlen;
++	__u16	cpt_align;
++} __attribute__ ((aligned (8)));
++
++enum
++{
++	CPT_SECT_ERROR,			/* Error section, content is string */
++	CPT_SECT_VEINFO,
++	CPT_SECT_FILES,			/* Files. Content is array of file objects */
++	CPT_SECT_TASKS,
++	CPT_SECT_MM,
++	CPT_SECT_FILES_STRUCT,
++	CPT_SECT_FS,
++	CPT_SECT_SIGHAND_STRUCT,
++	CPT_SECT_TTY,
++	CPT_SECT_SOCKET,
++	CPT_SECT_NAMESPACE,
++	CPT_SECT_SYSVSEM_UNDO,
++	CPT_SECT_INODE,			/* Inodes with i->i_nlink==0 and
++					 * deleted dentries with inodes not
++					 * referenced inside the dumped process.
++					 */
++	CPT_SECT_SYSV_SHM,
++	CPT_SECT_SYSV_SEM,
++	CPT_SECT_ORPHANS,
++	CPT_SECT_NET_DEVICE,
++	CPT_SECT_NET_IFADDR,
++	CPT_SECT_NET_ROUTE,
++	CPT_SECT_NET_IPTABLES,
++	CPT_SECT_NET_CONNTRACK,
++	CPT_SECT_NET_CONNTRACK_VE0,
++	CPT_SECT_UTSNAME,
++	CPT_SECT_TRAILER,
++	CPT_SECT_UBC,
++	CPT_SECT_SLM_SGREGS,
++	CPT_SECT_SLM_REGOBJS,
++/* Due to a silly mistake we cannot index sections beyond this value */
++#define	CPT_SECT_MAX_INDEX	(CPT_SECT_SLM_REGOBJS+1)
++	CPT_SECT_EPOLL,
++	CPT_SECT_VSYSCALL,
++	CPT_SECT_INOTIFY,
++	CPT_SECT_SYSV_MSG,
++	CPT_SECT_MAX
++};
++
++struct cpt_major_tail
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_lazypages;
++	__u32	cpt_64bit;
++	__u64	cpt_sections[CPT_SECT_MAX_INDEX];
++	__u32	cpt_nsect;
++	__u8	cpt_signature[4];	/* Magic number */
++} __attribute__ ((aligned (8)));
++
++
++/* Common object header. */
++struct cpt_object_hdr
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++} __attribute__ ((aligned (8)));
++
++enum _cpt_content_type {
++	CPT_CONTENT_VOID,
++	CPT_CONTENT_ARRAY,
++	CPT_CONTENT_DATA,
++	CPT_CONTENT_NAME,
++
++	CPT_CONTENT_STACK,
++	CPT_CONTENT_X86_FPUSTATE_OLD,
++	CPT_CONTENT_X86_FPUSTATE,
++	CPT_CONTENT_MM_CONTEXT,
++	CPT_CONTENT_SEMARRAY,
++	CPT_CONTENT_SEMUNDO,
++	CPT_CONTENT_NLMARRAY,
++	CPT_CONTENT_MAX
++};
++
++/* CPT_OBJ_BITS: encode array of bytes */ 
++struct cpt_obj_bits
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_size;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_REF: a reference to another object */ 
++struct cpt_obj_ref
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_pos;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_VEINFO: various ve specific data */
++struct cpt_veinfo_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	/* ipc ctls */
++	__u32	shm_ctl_max;
++	__u32	shm_ctl_all;
++	__u32	shm_ctl_mni;
++	__u32	msg_ctl_max;
++	__u32	msg_ctl_mni;
++	__u32	msg_ctl_mnb;
++	__u32	sem_ctl_arr[4];
++
++	/* start time */
++	__u64	start_timespec_delta;
++	__u64	start_jiffies_delta;
++
++	/* later extension */
++	__u32	last_pid;
++	__u32	pad1;
++	__u64	reserved[8];
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_FILE: one struct file */ 
++struct cpt_file_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_flags;
++	__u32	cpt_mode;
++	__u64	cpt_pos;
++	__u32	cpt_uid;
++	__u32	cpt_gid;
++
++	__u32	cpt_i_mode;
++	__u32	cpt_lflags;
++#define CPT_DENTRY_DELETED	1
++#define CPT_DENTRY_ROOT		2
++#define CPT_DENTRY_CLONING	4
++#define CPT_DENTRY_PROC		8
++#define CPT_DENTRY_EPOLL	0x10
++#define CPT_DENTRY_REPLACED	0x20
++#define CPT_DENTRY_INOTIFY	0x40
++#define CPT_DENTRY_FUTEX	0x80
++#define CPT_DENTRY_TUNTAP	0x100
++	__u64	cpt_inode;
++	__u64	cpt_priv;
++
++	__u32	cpt_fown_fd;
++	__u32	cpt_fown_pid;
++#define CPT_FOWN_STRAY_PID	0
++	__u32	cpt_fown_uid;
++	__u32	cpt_fown_euid;
++	__u32	cpt_fown_signo;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by file name, encoded as CPT_OBJ_NAME */
++
++struct cpt_epoll_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_file;
++} __attribute__ ((aligned (8)));
++/* Followed by array of struct cpt_epoll_file */
++
++struct cpt_epoll_file_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_file;
++	__u32	cpt_fd;
++	__u32	cpt_events;
++	__u64	cpt_data;
++	__u32	cpt_revents;
++	__u32	cpt_ready;
++} __attribute__ ((aligned (8)));
++
++struct cpt_inotify_wd_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_wd;
++	__u32	cpt_mask;
++} __attribute__ ((aligned (8)));
++/* Followed by cpt_file_image of inode to watch */
++
++struct cpt_inotify_ev_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_wd;
++	__u32	cpt_mask;
++	__u32	cpt_cookie;
++	__u32	cpt_namelen;
++} __attribute__ ((aligned (8)));
++/* Followed by name */
++
++struct cpt_inotify_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_file;
++	__u32	cpt_user;
++	__u32	cpt_max_events;
++	__u32	cpt_last_wd;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */
++
++
++/* CPT_OBJ_FILEDESC: one file descriptor */
++struct cpt_fd_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_fd;
++	__u32	cpt_flags;
++#define CPT_FD_FLAG_CLOSEEXEC	1
++	__u64	cpt_file;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_FILES: one files_struct */
++struct cpt_files_struct_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_index;
++	__u32	cpt_max_fds;
++	__u32	cpt_next_fd;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by array of cpt_fd_image */
++
++/* CPT_OBJ_FS: one fs_struct */
++struct cpt_fs_struct_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_umask;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */
++
++/* CPT_OBJ_INODE: one struct inode */
++struct cpt_inode_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_dev;
++	__u64	cpt_ino;
++	__u32	cpt_mode;
++	__u32	cpt_nlink;
++	__u32	cpt_uid;
++	__u32	cpt_gid;
++	__u64	cpt_rdev;
++	__u64	cpt_size;
++	__u64	cpt_blksize;
++	__u64	cpt_atime;
++	__u64	cpt_mtime;
++	__u64	cpt_ctime;
++	__u64	cpt_blocks;
++	__u32	cpt_sb;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_VFSMOUNT: one vfsmount */
++struct cpt_vfsmount_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_mntflags;
++#define CPT_MNT_BIND	0x80000000
++#define CPT_MNT_EXT	0x40000000
++	__u32	cpt_flags;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_flock_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_owner;
++	__u32	cpt_pid;
++	__u64	cpt_start;
++	__u64	cpt_end;
++	__u32	cpt_flags;
++	__u32	cpt_type;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_tty_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_flags;
++	__u32	cpt_link;
++	__u32	cpt_index;
++	__u32	cpt_drv_type;
++	__u32	cpt_drv_subtype;
++	__u32	cpt_drv_flags;
++	__u8	cpt_packet;
++	__u8	cpt_stopped;
++	__u8	cpt_hw_stopped;
++	__u8	cpt_flow_stopped;
++
++	__u32	cpt_canon_data;
++	__u32	cpt_canon_head;
++	__u32	cpt_canon_column;
++	__u32	cpt_column;
++	__u8	cpt_ctrl_status;
++	__u8	cpt_erasing;
++	__u8	cpt_lnext;
++	__u8	cpt_icanon;
++	__u8	cpt_raw;
++	__u8	cpt_real_raw;
++	__u8	cpt_closing;
++	__u8	__cpt_pad1;
++	__u16	cpt_minimum_to_wake;
++	__u16	__cpt_pad2;
++	__u32	cpt_pgrp;
++	__u32	cpt_session;
++	__u32	cpt_c_line;
++	__u8	cpt_name[64];	
++	__u16	cpt_ws_row;
++	__u16	cpt_ws_col;
++	__u16	cpt_ws_prow;
++	__u16	cpt_ws_pcol;
++	__u8	cpt_c_cc[32];
++	__u32	cpt_c_iflag;
++	__u32	cpt_c_oflag;
++	__u32	cpt_c_cflag;
++	__u32	cpt_c_lflag;
++	__u32	cpt_read_flags[4096/32];
++} __attribute__ ((aligned (8)));
++
++struct cpt_sock_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_file;
++	__u32	cpt_parent;
++	__u32	cpt_index;
++
++	__u64	cpt_ssflags;
++	__u16	cpt_type;
++	__u16	cpt_family;
++	__u8	cpt_sstate;
++	__u8	cpt_passcred;
++	__u8	cpt_state;
++	__u8	cpt_reuse;
++
++	__u8	cpt_zapped;
++	__u8	cpt_shutdown;
++	__u8	cpt_userlocks;
++	__u8	cpt_no_check;
++	__u8	cpt_debug;
++	__u8	cpt_rcvtstamp;
++	__u8	cpt_localroute;
++	__u8	cpt_protocol;
++
++	__u32	cpt_err;
++	__u32	cpt_err_soft;
++
++	__u16	cpt_max_ack_backlog;
++	__u16   __cpt_pad1;
++	__u32	cpt_priority;
++
++	__u32	cpt_rcvlowat;
++	__u32	cpt_bound_dev_if;
++
++	__u64	cpt_rcvtimeo;
++	__u64	cpt_sndtimeo;
++	__u32	cpt_rcvbuf;
++	__u32	cpt_sndbuf;
++	__u64	cpt_flags;
++	__u64	cpt_lingertime;
++	__u32	cpt_peer_pid;
++	__u32	cpt_peer_uid;
++
++	__u32	cpt_peer_gid;
++	__u32	cpt_laddrlen;
++	__u32	cpt_laddr[128/4];
++	__u32	cpt_raddrlen;
++	__u32	cpt_raddr[128/4];
++	/* AF_UNIX */
++	__u32	cpt_peer;
++
++	__u8	cpt_socketpair;
++	__u8	cpt_deleted;
++	__u16	__cpt_pad4;
++	__u32	__cpt_pad5;
++/*
++	struct sk_filter      	*sk_filter;
++ */
++
++	__u64			cpt_stamp;
++	__u32			cpt_daddr;
++	__u16			cpt_dport;
++	__u16			cpt_sport;
++
++	__u32			cpt_saddr;
++	__u32			cpt_rcv_saddr;
++
++	__u32			cpt_uc_ttl;
++	__u32			cpt_tos;
++
++	__u32			cpt_cmsg_flags;
++	__u32			cpt_mc_index;
++
++	__u32			cpt_mc_addr;
++/*
++	struct ip_options	*opt;
++ */
++	__u8			cpt_hdrincl;
++	__u8			cpt_mc_ttl;
++	__u8			cpt_mc_loop;
++	__u8			cpt_pmtudisc;
++
++	__u8			cpt_recverr;
++	__u8			cpt_freebind;
++	__u16			cpt_idcounter;
++	__u32			cpt_cork_flags;
++
++	__u32			cpt_cork_fragsize;
++	__u32			cpt_cork_length;
++	__u32			cpt_cork_addr;
++	__u32			cpt_cork_saddr;
++	__u32			cpt_cork_daddr;
++	__u32			cpt_cork_oif;
++
++	__u32			cpt_udp_pending;
++	__u32			cpt_udp_corkflag;
++	__u16			cpt_udp_encap;
++	__u16			cpt_udp_len;
++	__u32			__cpt_pad7;
++
++	__u64			cpt_saddr6[2];
++	__u64			cpt_rcv_saddr6[2];
++	__u64			cpt_daddr6[2];
++	__u32			cpt_flow_label6;
++	__u32			cpt_frag_size6;
++	__u32			cpt_hop_limit6;
++	__u32			cpt_mcast_hops6;
++
++	__u32			cpt_mcast_oif6;
++	__u8			cpt_rxopt6;
++	__u8			cpt_mc_loop6;
++	__u8			cpt_recverr6;
++	__u8			cpt_sndflow6;
++
++	__u8			cpt_pmtudisc6;
++	__u8			cpt_ipv6only6;
++	__u8			cpt_mapped;
++	__u8			__cpt_pad8;
++	__u32	cpt_pred_flags;
++
++	__u32	cpt_rcv_nxt;
++	__u32	cpt_snd_nxt;
++
++	__u32	cpt_snd_una;
++	__u32	cpt_snd_sml;
++
++	__u32	cpt_rcv_tstamp;
++	__u32	cpt_lsndtime;
++
++	__u8	cpt_tcp_header_len;
++	__u8	cpt_ack_pending;
++	__u8	cpt_quick;
++	__u8	cpt_pingpong;
++	__u8	cpt_blocked;
++	__u8	__cpt_pad9;
++	__u16	__cpt_pad10;
++
++	__u32	cpt_ato;
++	__u32	cpt_ack_timeout;
++
++	__u32	cpt_lrcvtime;
++	__u16	cpt_last_seg_size;
++	__u16	cpt_rcv_mss;
++
++	__u32	cpt_snd_wl1;
++	__u32	cpt_snd_wnd;
++
++	__u32	cpt_max_window;
++	__u32	cpt_pmtu_cookie;
++
++	__u32	cpt_mss_cache;
++	__u16	cpt_mss_cache_std;
++	__u16	cpt_mss_clamp;
++
++	__u16	cpt_ext_header_len;
++	__u16	cpt_ext2_header_len;
++	__u8	cpt_ca_state;
++	__u8	cpt_retransmits;
++	__u8	cpt_reordering;
++	__u8	cpt_frto_counter;
++
++	__u32	cpt_frto_highmark;
++	__u8	cpt_adv_cong;
++	__u8	cpt_defer_accept;
++	__u8	cpt_backoff;
++	__u8	__cpt_pad11;
++
++	__u32	cpt_srtt;
++	__u32	cpt_mdev;
++
++	__u32	cpt_mdev_max;
++	__u32	cpt_rttvar;
++
++	__u32	cpt_rtt_seq;
++	__u32	cpt_rto;
++
++	__u32	cpt_packets_out;
++	__u32	cpt_left_out;
++
++	__u32	cpt_retrans_out;
++	__u32	cpt_snd_ssthresh;
++
++	__u32	cpt_snd_cwnd;
++	__u16	cpt_snd_cwnd_cnt;
++	__u16	cpt_snd_cwnd_clamp;
++
++	__u32	cpt_snd_cwnd_used;
++	__u32	cpt_snd_cwnd_stamp;
++
++	__u32	cpt_timeout;
++	__u32	cpt_ka_timeout;
++
++	__u32	cpt_rcv_wnd;
++	__u32	cpt_rcv_wup;
++
++	__u32	cpt_write_seq;
++	__u32	cpt_pushed_seq;
++
++	__u32	cpt_copied_seq;
++	__u8	cpt_tstamp_ok;
++	__u8	cpt_wscale_ok;
++	__u8	cpt_sack_ok;
++	__u8	cpt_saw_tstamp;
++
++	__u8	cpt_snd_wscale;
++	__u8	cpt_rcv_wscale;
++	__u8	cpt_nonagle;
++	__u8	cpt_keepalive_probes;
++	__u32	cpt_rcv_tsval;
++
++	__u32	cpt_rcv_tsecr;
++	__u32	cpt_ts_recent;
++
++	__u64	cpt_ts_recent_stamp;
++	__u16	cpt_user_mss;
++	__u8	cpt_dsack;
++	__u8	cpt_eff_sacks;
++	__u32	cpt_sack_array[2*5];
++	__u32	cpt_window_clamp;
++
++	__u32	cpt_rcv_ssthresh;
++	__u8	cpt_probes_out;
++	__u8	cpt_num_sacks;
++	__u16	cpt_advmss;
++
++	__u8	cpt_syn_retries;
++	__u8	cpt_ecn_flags;
++	__u16	cpt_prior_ssthresh;
++	__u32	cpt_lost_out;
++
++	__u32   cpt_sacked_out;
++	__u32   cpt_fackets_out;
++
++	__u32   cpt_high_seq;
++	__u32	cpt_retrans_stamp;
++
++	__u32	cpt_undo_marker;
++	__u32	cpt_undo_retrans;
++
++	__u32	cpt_urg_seq;
++	__u16	cpt_urg_data;
++	__u8	cpt_pending;
++	__u8	cpt_urg_mode;
++
++	__u32	cpt_snd_up;
++	__u32	cpt_keepalive_time;
++
++	__u32   cpt_keepalive_intvl;
++	__u32   cpt_linger2;
++
++	__u32	cpt_rcvrtt_rtt;
++	__u32	cpt_rcvrtt_seq;
++
++	__u32	cpt_rcvrtt_time;
++	__u32	__cpt_pad12;
++} __attribute__ ((aligned (8)));
++
++struct cpt_sockmc_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u16	cpt_family;
++	__u16	cpt_mode;
++	__u32	cpt_ifindex;
++	__u32	cpt_mcaddr[4];
++} __attribute__ ((aligned (8)));
++/* Followed by array of source addresses, each zero padded to 16 bytes */
++
++struct cpt_openreq_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_rcv_isn;
++	__u32	cpt_snt_isn;
++
++	__u16	cpt_rmt_port;
++	__u16	cpt_mss;
++	__u8	cpt_family;
++	__u8	cpt_retrans;
++	__u8	cpt_snd_wscale;
++	__u8	cpt_rcv_wscale;
++
++	__u8	cpt_tstamp_ok;
++	__u8	cpt_sack_ok;
++	__u8	cpt_wscale_ok;
++	__u8	cpt_ecn_ok;
++	__u8	cpt_acked;
++	__u8	__cpt_pad1;
++	__u16	__cpt_pad2;
++
++	__u32	cpt_window_clamp;
++	__u32	cpt_rcv_wnd;
++	__u32	cpt_ts_recent;
++	__u32	cpt_iif;
++	__u64	cpt_expires;
++
++	__u64	cpt_loc_addr[2];
++	__u64	cpt_rmt_addr[2];
++/*
++	struct ip_options	*opt;
++ */
++	
++} __attribute__ ((aligned (8)));
++
++struct cpt_skb_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_owner;
++	__u32	cpt_queue;
++#define CPT_SKB_NQ	0
++#define CPT_SKB_RQ	1
++#define CPT_SKB_WQ	2
++#define CPT_SKB_OFOQ	3
++
++	__u64	cpt_stamp;
++	__u32	cpt_len;
++	__u32	cpt_hspace;
++	__u32	cpt_tspace;
++	__u32	cpt_h;
++	__u32	cpt_nh;
++	__u32	cpt_mac;
++	
++	__u64	cpt_cb[5];
++	__u32	cpt_mac_len;
++	__u32	cpt_csum;
++	__u8	cpt_local_df;
++	__u8	cpt_pkt_type;
++	__u8	cpt_ip_summed;
++	__u8	__cpt_pad1;
++	__u32	cpt_priority;
++	__u16	cpt_protocol;
++	__u16	cpt_security;
++	__u16	cpt_gso_segs;
++	__u16	cpt_gso_size;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_sysvshm_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_key;
++	__u64	cpt_uid;
++	__u64	cpt_gid;
++	__u64	cpt_cuid;
++	__u64	cpt_cgid;
++	__u64	cpt_mode;
++	__u64	cpt_seq;
++
++	__u32	cpt_id;
++	__u32	cpt_mlockuser;
++	__u64	cpt_segsz;
++	__u64	cpt_atime;
++	__u64	cpt_ctime;
++	__u64	cpt_dtime;
++	__u64	cpt_creator;
++	__u64	cpt_last;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_sysvsem_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_key;
++	__u64	cpt_uid;
++	__u64	cpt_gid;
++	__u64	cpt_cuid;
++	__u64	cpt_cgid;
++	__u64	cpt_mode;
++	__u64	cpt_seq;
++	__u32	cpt_id;
++	__u32	__cpt_pad1;
++
++	__u64	cpt_otime;
++	__u64	cpt_ctime;
++} __attribute__ ((aligned (8)));
++/* Content is array of pairs semval/sempid */
++
++struct cpt_sysvsem_undo_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_id;
++	__u32	cpt_nsem;
++} __attribute__ ((aligned (8)));
++
++struct cpt_sysvmsg_msg_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_type;
++	__u64	cpt_size;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_sysvmsg_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_key;
++	__u64	cpt_uid;
++	__u64	cpt_gid;
++	__u64	cpt_cuid;
++	__u64	cpt_cgid;
++	__u64	cpt_mode;
++	__u64	cpt_seq;
++	__u32	cpt_id;
++	__u32	__cpt_pad1;
++
++	__u64	cpt_stime;
++	__u64	cpt_rtime;
++	__u64	cpt_ctime;
++	__u64	cpt_last_sender;
++	__u64	cpt_last_receiver;
++	__u64	cpt_qbytes;
++} __attribute__ ((aligned (8)));
++/* Content is array of sysv msg */
++
++
++struct cpt_mm_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start_code;
++	__u64	cpt_end_code;
++	__u64	cpt_start_data;
++	__u64	cpt_end_data;
++	__u64	cpt_start_brk;
++	__u64	cpt_brk;
++	__u64	cpt_start_stack;
++	__u64	cpt_start_arg;
++	__u64	cpt_end_arg;
++	__u64	cpt_start_env;
++	__u64	cpt_end_env;
++	__u64	cpt_def_flags;
++	__u64	cpt_mmub;
++	__u8	cpt_dumpable;
++	__u8	cpt_vps_dumpable;
++	__u8	cpt_used_hugetlb;
++	__u8	__cpt_pad;
++	__u32	cpt_vdso;
++} __attribute__ ((aligned (8)));
++
++struct cpt_page_block
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++} __attribute__ ((aligned (8)));
++
++struct cpt_remappage_block
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++	__u64	cpt_pgoff;
++} __attribute__ ((aligned (8)));
++
++struct cpt_copypage_block
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++	__u64	cpt_source;
++} __attribute__ ((aligned (8)));
++
++struct cpt_lazypage_block
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++	__u64	cpt_index;
++} __attribute__ ((aligned (8)));
++
++struct cpt_iterpage_block
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++} __attribute__ ((aligned (8)));
++/* Followed by array of PFNs */
++
++struct cpt_vma_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_file;
++	__u32	cpt_type;
++#define CPT_VMA_TYPE_0		0
++#define CPT_VMA_TYPE_SHM	1
++#define CPT_VMA_VDSO		2
++	__u32	cpt_anonvma;
++	__u64	cpt_anonvmaid;
++
++	__u64	cpt_start;
++	__u64	cpt_end;
++	__u64	cpt_flags;
++	__u64	cpt_pgprot;
++	__u64	cpt_pgoff;
++} __attribute__ ((aligned (8)));
++
++struct cpt_aio_ctx_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_max_reqs;
++	__u32	cpt_ring_pages;
++	__u32	cpt_tail;
++	__u32	cpt_nr;
++	__u64	cpt_mmap_base;
++	/* Data (io_event's) and struct aio_ring are stored in user space VM */
++} __attribute__ ((aligned (8)));
++
++
++/* Format of the MM section.
++ *
++ * It is an array of MM objects (mm_struct). Each MM object is a
++ * header, encoding mm_struct, followed by an array of VMA objects.
++ * Each VMA consists of a VMA header, encoding vm_area_struct, and,
++ * if the VMA contains copied pages, the header is followed by an
++ * array of start-end tuples, each followed by data.
++ *
++ * ATTN: no block/page alignment, only 64-bit alignment. This might not be good.
++ */
++
++struct cpt_restart_block {
++	__u64	fn;
++#define CPT_RBL_0			0
++#define CPT_RBL_NANOSLEEP		1
++#define CPT_RBL_COMPAT_NANOSLEEP	2
++#define CPT_RBL_POLL			3
++#define CPT_RBL_FUTEX_WAIT		4
++	__u64	arg0;
++	__u64	arg1;
++	__u64	arg2;
++	__u64	arg3;
++} __attribute__ ((aligned (8)));
++
++struct cpt_siginfo_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_qflags;
++	__u32	cpt_signo;
++	__u32	cpt_errno;
++	__u32	cpt_code;
++
++	__u64	cpt_sigval;
++	__u32	cpt_pid;
++	__u32	cpt_uid;
++	__u64	cpt_utime;
++	__u64	cpt_stime;
++
++	__u64	cpt_user;
++} __attribute__ ((aligned (8)));
++
++/* Portable presentations for segment registers */
++
++#define CPT_SEG_ZERO		0
++#define CPT_SEG_TLS1		1
++#define CPT_SEG_TLS2		2
++#define CPT_SEG_TLS3		3
++#define CPT_SEG_USER32_DS	4
++#define CPT_SEG_USER32_CS	5
++#define CPT_SEG_USER64_DS	6
++#define CPT_SEG_USER64_CS	7
++#define CPT_SEG_LDT		256
++
++struct cpt_x86_regs
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_debugreg[8];
++	__u32	cpt_fs;
++	__u32	cpt_gs;
++
++	__u32	cpt_ebx;
++	__u32	cpt_ecx;
++	__u32	cpt_edx;
++	__u32	cpt_esi;
++	__u32	cpt_edi;
++	__u32	cpt_ebp;
++	__u32	cpt_eax;
++	__u32	cpt_xds;
++	__u32	cpt_xes;
++	__u32	cpt_orig_eax;
++	__u32	cpt_eip;
++	__u32	cpt_xcs;
++	__u32	cpt_eflags;
++	__u32	cpt_esp;
++	__u32	cpt_xss;
++	__u32	pad;
++};
++
++struct cpt_x86_64_regs
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_debugreg[8];
++
++	__u64	cpt_fsbase;
++	__u64	cpt_gsbase;
++	__u32	cpt_fsindex;
++	__u32	cpt_gsindex;
++	__u32	cpt_ds;
++	__u32	cpt_es;
++
++	__u64	cpt_r15;
++	__u64	cpt_r14;
++	__u64	cpt_r13;
++	__u64	cpt_r12;
++	__u64	cpt_rbp;
++	__u64	cpt_rbx;
++	__u64	cpt_r11;
++	__u64	cpt_r10;	
++	__u64	cpt_r9;
++	__u64	cpt_r8;
++	__u64	cpt_rax;
++	__u64	cpt_rcx;
++	__u64	cpt_rdx;
++	__u64	cpt_rsi;
++	__u64	cpt_rdi;
++	__u64	cpt_orig_rax;
++	__u64	cpt_rip;
++	__u64	cpt_cs;
++	__u64	cpt_eflags;
++	__u64	cpt_rsp;
++	__u64	cpt_ss;
++};
++
++struct cpt_ia64_regs
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	gr[128];
++	__u64	fr[256];
++	__u64	br[8];
++	__u64	nat[2];
++
++	__u64	ar_bspstore;
++	__u64	num_regs;
++	__u64	loadrs;
++	__u64	ar_bsp;
++	__u64	ar_unat;
++	__u64	ar_pfs;
++	__u64	ar_ccv;
++	__u64	ar_fpsr;
++	__u64	ar_csd;
++	__u64	ar_ssd;
++	__u64	ar_ec;
++	__u64	ar_lc;
++	__u64	ar_rsc;
++	__u64	ar_rnat;
++
++	__u64	cr_iip;
++	__u64	cr_ipsr;
++
++	__u64	cfm;
++	__u64	pr;
++
++	__u64	ibr[8];
++	__u64	dbr[8];
++};
++
++
++struct cpt_task_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_state;
++	__u64	cpt_flags;
++	__u64	cpt_ptrace;
++	__u32	cpt_prio;
++	__u32	cpt_static_prio;
++	__u32	cpt_policy;
++	__u32	cpt_rt_priority;
++
++	/* struct thread_info */
++	__u64	cpt_exec_domain;
++	__u64	cpt_thrflags;
++	__u64	cpt_thrstatus;
++	__u64	cpt_addr_limit;
++
++	__u64	cpt_personality;
++
++	__u64	cpt_mm;
++	__u64	cpt_files;
++	__u64	cpt_fs;
++	__u64	cpt_signal;
++	__u64	cpt_sighand;
++	__u64	cpt_sigblocked;
++	__u64	cpt_sigrblocked;
++	__u64	cpt_sigpending;
++	__u64	cpt_namespace;
++	__u64	cpt_sysvsem_undo;
++	__u32	cpt_pid;
++	__u32	cpt_tgid;
++	__u32	cpt_ppid;
++	__u32	cpt_rppid;
++	__u32	cpt_pgrp;
++	__u32	cpt_session;
++	__u32	cpt_old_pgrp;
++	__u32	__cpt_pad;
++	__u32	cpt_leader;
++	__u8	cpt_pn_state;
++	__u8	cpt_stopped_state;
++	__u8	cpt_sigsuspend_state;
++	__u8	cpt_64bit;
++	__u64	cpt_set_tid;
++	__u64	cpt_clear_tid;
++	__u32	cpt_exit_code;
++	__u32	cpt_exit_signal;
++	__u32	cpt_pdeath_signal;
++	__u32	cpt_user;
++	__u32	cpt_uid;
++	__u32	cpt_euid;
++	__u32	cpt_suid;
++	__u32	cpt_fsuid;
++	__u32	cpt_gid;
++	__u32	cpt_egid;
++	__u32	cpt_sgid;
++	__u32	cpt_fsgid;
++	__u32	cpt_ngids;
++	__u32	cpt_gids[32];
++	__u8	cpt_prctl_uac;
++	__u8	cpt_prctl_fpemu;
++	__u16	__cpt_pad1;
++	__u64	cpt_ecap;
++	__u64	cpt_icap;
++	__u64	cpt_pcap;
++	__u8	cpt_comm[16];
++	__u64	cpt_tls[3];
++	struct cpt_restart_block cpt_restart;
++	__u64	cpt_it_real_value;	/* V8: jiffies, V9..: nsec */
++	__u64	cpt_it_real_incr;	/* V8: jiffies, V9..: nsec */
++	__u64	cpt_it_prof_value;
++	__u64	cpt_it_prof_incr;
++	__u64	cpt_it_virt_value;
++	__u64	cpt_it_virt_incr;
++
++	__u16	cpt_used_math;
++	__u8	cpt_keepcap;
++	__u8	cpt_did_exec;
++	__u32	cpt_ptrace_message;
++
++	__u64	cpt_utime;
++	__u64	cpt_stime;
++	__u64	cpt_starttime;		/* V8: jiffies, V9...: timespec */
++	__u64	cpt_nvcsw;
++	__u64	cpt_nivcsw;
++	__u64	cpt_min_flt;
++	__u64	cpt_maj_flt;
++
++	__u64	cpt_sigsuspend_blocked;
++	__u64	cpt_cutime, cpt_cstime;
++	__u64	cpt_cnvcsw, cpt_cnivcsw;
++	__u64	cpt_cmin_flt, cpt_cmaj_flt;
++
++#define CPT_RLIM_NLIMITS 16
++	__u64	cpt_rlim_cur[CPT_RLIM_NLIMITS];
++	__u64	cpt_rlim_max[CPT_RLIM_NLIMITS];
++
++	__u64	cpt_task_ub;
++	__u64	cpt_exec_ub;
++	__u64	cpt_mm_ub;
++	__u64	cpt_fork_sub;
++} __attribute__ ((aligned (8)));
++
++struct cpt_sigaltstack_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_stack;
++	__u32	cpt_stacksize;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++struct cpt_task_aux_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_robust_list;
++	__u64	__cpt_future[16];
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_signal_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_leader;
++	__u8	cpt_pgrp_type;
++	__u8	cpt_old_pgrp_type;
++	__u8	cpt_session_type;
++#define CPT_PGRP_NORMAL		0
++#define CPT_PGRP_ORPHAN		1
++#define CPT_PGRP_STRAY		2
++	__u8	__cpt_pad1;
++	__u64	cpt_pgrp;
++	__u64	cpt_old_pgrp;
++	__u64	cpt_session;
++	__u64	cpt_sigpending;
++	__u64	cpt_ctty;
++
++	__u32	cpt_curr_target;
++	__u32	cpt_group_exit;
++	__u32	cpt_group_exit_code;
++	__u32	cpt_group_exit_task;
++	__u32	cpt_notify_count;
++	__u32	cpt_group_stop_count;
++	__u32	cpt_stop_state;
++	__u32	__cpt_pad2;
++
++	__u64	cpt_utime, cpt_stime, cpt_cutime, cpt_cstime;
++	__u64	cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw;
++	__u64	cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt;
++
++	__u64	cpt_rlim_cur[CPT_RLIM_NLIMITS];
++	__u64	cpt_rlim_max[CPT_RLIM_NLIMITS];
++} __attribute__ ((aligned (8)));
++/* Followed by list of posix timers. */
++
++struct cpt_sighand_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++} __attribute__ ((aligned (8)));
++/* Followed by a list of sighandlers. */
++
++struct cpt_sighandler_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++	
++	__u32	cpt_signo;
++	__u32	__cpt_pad1;
++	__u64	cpt_handler;
++	__u64	cpt_restorer;
++	__u64	cpt_flags;
++	__u64	cpt_mask;
++} __attribute__ ((aligned (8)));
++
++struct cpt_netdev_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_index;
++	__u32	cpt_flags;
++	__u8	cpt_name[16];
++} __attribute__ ((aligned (8)));
++
++struct cpt_tuntap_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_owner;
++	__u32	cpt_attached;
++	__u64	cpt_flags;
++	__u64	cpt_bindfile;
++	__u64	cpt_if_flags;
++	__u8	cpt_dev_addr[6];
++	__u16	cpt_pad;
++	__u32	cpt_chr_filter[2];
++	__u32	cpt_net_filter[2];
++} __attribute__ ((aligned (8)));
++
++struct cpt_veth_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_allow_mac_change;
++	__u32	__cpt_pad;
++} __attribute__ ((aligned (8)));
++
++struct cpt_hwaddr_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u8	cpt_dev_addr[32];
++} __attribute__ ((aligned (8)));
++
++struct cpt_netstats_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_rx_packets;
++	__u64	cpt_tx_packets;
++	__u64	cpt_rx_bytes;
++	__u64	cpt_tx_bytes;
++	__u64	cpt_rx_errors;
++	__u64	cpt_tx_errors;
++	__u64	cpt_rx_dropped;
++	__u64	cpt_tx_dropped;
++	__u64	cpt_multicast;
++	__u64	cpt_collisions;
++	__u64	cpt_rx_length_errors;
++	__u64	cpt_rx_over_errors;
++	__u64	cpt_rx_crc_errors;
++	__u64	cpt_rx_frame_errors;
++	__u64	cpt_rx_fifo_errors;
++	__u64	cpt_rx_missed_errors;
++	__u64	cpt_tx_aborted_errors;
++	__u64	cpt_tx_carrier_errors;
++	__u64	cpt_tx_fifo_errors;
++	__u64	cpt_tx_heartbeat_errors;
++	__u64	cpt_tx_window_errors;
++	__u64	cpt_rx_compressed;
++	__u64	cpt_tx_compressed;
++	__u64	pad[4];
++} __attribute__ ((aligned (8)));
++
++struct cpt_ifaddr_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_index;
++	__u8	cpt_family;
++	__u8	cpt_masklen;
++	__u8	cpt_flags;
++	__u8	cpt_scope;
++	__u32	cpt_address[4];
++	__u32	cpt_peer[4];
++	__u32	cpt_broadcast[4];
++	__u8	cpt_label[16];
++	__u32	cpt_valid_lft;
++	__u32	cpt_prefered_lft;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ipct_tuple
++{
++	__u32	cpt_src;
++	__u16	cpt_srcport;
++	__u16	__cpt_pad1;
++
++	__u32	cpt_dst;
++	__u16	cpt_dstport;
++	__u8	cpt_protonum;
++	__u8	cpt_dir;	/* TEMPORARY HACK TO VALIDATE CODE */
++} __attribute__ ((aligned (8)));
++
++struct cpt_nat_manip
++{
++	__u8	cpt_direction;
++	__u8	cpt_hooknum;
++	__u8	cpt_maniptype;
++	__u8	__cpt_pad1;
++
++	__u32	cpt_manip_addr;
++	__u16	cpt_manip_port;
++	__u16	__cpt_pad2;
++	__u32	__cpt_pad3;
++} __attribute__ ((aligned (8)));
++
++struct cpt_nat_seq
++{
++	__u32	cpt_correction_pos;
++	__u32	cpt_offset_before;
++	__u32	cpt_offset_after;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ip_connexpect_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_timeout;
++	__u32	cpt_sibling_conntrack;	/* Index of child conntrack */
++	__u32	cpt_seq;		/* id in 2.6.15 */
++
++	struct cpt_ipct_tuple	cpt_ct_tuple;	/* NU 2.6.15 */
++	struct cpt_ipct_tuple	cpt_tuple;
++	struct cpt_ipct_tuple	cpt_mask;
++
++	/* union ip_conntrack_expect_help. Used by ftp, irc, amanda */
++	__u32	cpt_help[3];			/* NU 2.6.15 */
++	__u16	cpt_manip_proto;
++	__u8	cpt_dir;
++	__u8	cpt_flags;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ip_conntrack_image
++{
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	struct cpt_ipct_tuple cpt_tuple[2];
++	__u64	cpt_status;
++	__u64	cpt_timeout;
++	__u32	cpt_index;
++	__u8	cpt_ct_helper;
++	__u8	cpt_nat_helper;
++	__u16	cpt_pad1;
++
++	/* union ip_conntrack_proto. Used by tcp and icmp. */
++	__u32	cpt_proto_data[12];
++
++	/* union ip_conntrack_help. Used by ftp and pptp helper.
++	 * We do not support pptp...
++	 */
++	__u32	cpt_help_data[6];
++
++	/* nat info */
++	__u32	cpt_initialized;	/* NU 2.6.15 */
++	__u32	cpt_num_manips;		/* NU 2.6.15 */
++	struct  cpt_nat_manip	cpt_nat_manips[6];	/* NU 2.6.15 */
++
++	struct	cpt_nat_seq	cpt_nat_seq[2];
++
++	__u32	cpt_masq_index;
++	__u32	cpt_id;
++	__u32	cpt_mark;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ubparm
++{
++	__u64	barrier;
++	__u64	limit;
++	__u64	held;
++	__u64	maxheld;
++	__u64	minheld;
++	__u64	failcnt;
++} __attribute__ ((aligned (8)));
++
++struct cpt_beancounter_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u64	cpt_parent;
++	__u32	cpt_id;
++	__u32	__cpt_pad;
++	struct	cpt_ubparm	cpt_parms[32 * 2];
++} __attribute__ ((aligned (8)));
++
++struct cpt_slm_sgreg_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_size;
++	__u32	__cpt_pad1;
++	__u32	cpt_id;
++	__u16	cpt_resource;
++	__u8	cpt_regname[32];
++	__u8	__cpt_pad2[2];
++} __attribute__ ((aligned (8)));
++
++struct cpt_slm_obj_image {
++	__u64	cpt_next;
++	__u32	cpt_object;
++	__u16	cpt_hdrlen;
++	__u16	cpt_content;
++
++	__u32	cpt_size;
++	__u32	__cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++#ifdef __KERNEL__
++
++static inline void __user * cpt_ptr_import(__u64 ptr)
++{
++	return (void __user *)(unsigned long)ptr;
++}
++
++static inline __u64 cpt_ptr_export(void __user *ptr)
++{
++	return (__u64)(unsigned long)ptr;
++}
++
++static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr)
++{
++	memcpy(sig, &ptr, sizeof(*sig));
++}
++
++static inline __u64 cpt_sigset_export(sigset_t *sig)
++{
++	return *(__u64*)sig;
++}
++
++static inline __u64 cpt_timespec_export(struct timespec *tv)
++{
++	return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
++}
++
++static inline void cpt_timespec_import(struct timespec *tv, __u64 val)
++{
++	tv->tv_sec = val>>32;
++	tv->tv_nsec = (val&0xFFFFFFFF);
++}
++
++static inline __u64 cpt_timeval_export(struct timeval *tv)
++{
++	return (((u64)tv->tv_sec) << 32) + tv->tv_usec;
++}
++
++static inline void cpt_timeval_import(struct timeval *tv, __u64 val)
++{
++	tv->tv_sec = val>>32;
++	tv->tv_usec = (val&0xFFFFFFFF);
++}
++
++#endif
++
++#endif /* __CPT_IMAGE_H_ */
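
Every cpt_*_image record above begins with the same four fields (cpt_next,
cpt_object, cpt_hdrlen, cpt_content), so a dump file parses as a chain of
self-describing objects. A minimal userspace sketch of walking such a chain
follows; it assumes cpt_next holds the absolute file offset of the next
record, and the struct and function names are illustrative, not part of the
patch:

	#include <stdint.h>
	#include <stdio.h>

	/* Common prefix shared by every cpt_*_image record. */
	struct cpt_obj_hdr {
		uint64_t cpt_next;	/* assumed: file offset of next object */
		uint32_t cpt_object;	/* CPT_OBJ_* type tag */
		uint16_t cpt_hdrlen;	/* length of this header */
		uint16_t cpt_content;	/* type of the payload that follows */
	};

	static void walk_objects(FILE *img, uint64_t pos, uint64_t end)
	{
		struct cpt_obj_hdr h;

		while (pos < end) {
			if (fseeko(img, (off_t)pos, SEEK_SET) != 0 ||
			    fread(&h, sizeof(h), 1, img) != 1)
				break;
			printf("obj %u, hdrlen %u, at offset %llu\n",
			       h.cpt_object, h.cpt_hdrlen,
			       (unsigned long long)pos);
			if (h.cpt_next <= pos)	/* refuse to loop forever */
				break;
			pos = h.cpt_next;
		}
	}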
+diff --git a/include/linux/cpt_ioctl.h b/include/linux/cpt_ioctl.h
+new file mode 100644
+index 0000000..b8e83cc
+--- /dev/null
++++ b/include/linux/cpt_ioctl.h
+@@ -0,0 +1,43 @@
++/*
++ *
++ *  include/linux/cpt_ioctl.h
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _CPT_IOCTL_H_
++#define _CPT_IOCTL_H_ 1
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#define CPTCTLTYPE '-'
++#define CPT_SET_DUMPFD	_IOW(CPTCTLTYPE, 1, int)
++#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int)
++#define CPT_SET_LOCKFD	_IOW(CPTCTLTYPE, 3, int)
++#define CPT_SET_VEID	_IOW(CPTCTLTYPE, 4, int)
++#define CPT_SUSPEND	_IO(CPTCTLTYPE, 5)
++#define CPT_DUMP	_IO(CPTCTLTYPE, 6)
++#define CPT_UNDUMP	_IO(CPTCTLTYPE, 7)
++#define CPT_RESUME	_IO(CPTCTLTYPE, 8)
++#define CPT_KILL	_IO(CPTCTLTYPE, 9)
++#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10)
++#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int)
++#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12)
++#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int)
++#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int)
++#define CPT_PAGEIND	_IO(CPTCTLTYPE, 15)
++#define CPT_VMPREP	_IOW(CPTCTLTYPE, 16, int)
++#define CPT_SET_LAZY	_IOW(CPTCTLTYPE, 17, int)
++#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int)
++#define CPT_TEST_CAPS	_IOW(CPTCTLTYPE, 19, unsigned int)
++#define CPT_TEST_VECAPS	_IOW(CPTCTLTYPE, 20, unsigned int)
++#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int)
++
++#define CPT_ITER	_IOW(CPTCTLTYPE, 23, int)
++
++#endif
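
The ioctl numbers above define the checkpoint control interface that a tool
such as vzctl drives. A hedged userspace sketch of the suspend/dump sequence
implied by the names; the control-file path and the exact call ordering are
assumptions, not confirmed by this hunk:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/cpt_ioctl.h>

	/* Bind the control file to a VE and a dump fd, then suspend,
	 * dump and resume.  Error unwinding is elided. */
	static int checkpoint_ve(int veid, int dump_fd)
	{
		int ctl = open("/proc/cpt", O_RDWR);	/* path is an assumption */

		if (ctl < 0)
			return -1;
		if (ioctl(ctl, CPT_SET_VEID, veid) ||
		    ioctl(ctl, CPT_SET_DUMPFD, dump_fd) ||
		    ioctl(ctl, CPT_SUSPEND, 0) ||
		    ioctl(ctl, CPT_DUMP, 0) ||
		    ioctl(ctl, CPT_RESUME, 0)) {
			close(ctl);
			return -1;
		}
		close(ctl);
		return 0;
	}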
+diff --git a/include/linux/dcache.h b/include/linux/dcache.h
+index d982eb8..2fceb53 100644
+--- a/include/linux/dcache.h
++++ b/include/linux/dcache.h
+@@ -7,6 +7,8 @@
+ #include <linux/cache.h>
+ #include <linux/rcupdate.h>
+ 
++#include <bc/dcache.h>
++
+ struct nameidata;
+ struct path;
+ struct vfsmount;
+@@ -110,6 +112,9 @@ struct dentry {
+ 	struct dcookie_struct *d_cookie; /* cookie, if any */
+ #endif
+ 	int d_mounted;
++#ifdef CONFIG_BEANCOUNTERS
++	struct dentry_beancounter dentry_bc;
++#endif
+ 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
+ };
+ 
+@@ -173,9 +178,13 @@ d_iput:		no		no		no       yes
+ 
+ #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED		0x0010	
++#define DCACHE_VIRTUAL		0x0100	/* ve accessible */
++
++extern void mark_tree_virtual(struct path *path);
+ 
+ #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+ 
++extern struct kmem_cache *dentry_cache;
+ extern spinlock_t dcache_lock;
+ extern seqlock_t rename_lock;
+ 
+@@ -302,6 +311,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
+ extern char *__d_path(const struct path *path, struct path *root, char *, int);
+ extern char *d_path(const struct path *, char *, int);
+ extern char *dentry_path(struct dentry *, char *, int);
++extern int d_root_check(struct path *path);
+ 
+ /* Allocation counts.. */
+ 
+@@ -321,6 +331,12 @@ extern char *dentry_path(struct dentry *, char *, int);
+ static inline struct dentry *dget(struct dentry *dentry)
+ {
+ 	if (dentry) {
++#ifdef CONFIG_BEANCOUNTERS
++		preempt_disable();
++		if (ub_dentry_on && ub_dget_testone(dentry))
++			BUG();
++		preempt_enable_no_resched();
++#endif
+ 		BUG_ON(!atomic_read(&dentry->d_count));
+ 		atomic_inc(&dentry->d_count);
+ 	}
+@@ -363,4 +379,5 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
+ 
+ extern int sysctl_vfs_cache_pressure;
+ 
++extern int check_area_access_ve(struct path *);
+ #endif	/* __LINUX_DCACHE_H */
+diff --git a/include/linux/device.h b/include/linux/device.h
+index 6a2d04c..72a6aa3 100644
+--- a/include/linux/device.h
++++ b/include/linux/device.h
+@@ -234,6 +234,15 @@ extern void class_interface_unregister(struct class_interface *);
+ extern struct class *class_create(struct module *owner, const char *name);
+ extern void class_destroy(struct class *cls);
+ 
++extern struct class net_class;
++extern struct kset *class_kset;
++
++int classes_init(void);
++void classes_fini(void);
++
++int devices_init(void);
++void devices_fini(void);
++
+ /*
+  * The type of device, "struct device" is embedded in. A class
+  * or bus can contain devices of different types
+diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
+index 154769c..ee767ed 100644
+--- a/include/linux/devpts_fs.h
++++ b/include/linux/devpts_fs.h
+@@ -23,6 +23,16 @@ int devpts_pty_new(struct tty_struct *tty);      /* mknod in devpts */
+ struct tty_struct *devpts_get_tty(int number);	 /* get tty structure */
+ void devpts_pty_kill(int number);		 /* unlink */
+ 
++struct devpts_config {
++	int setuid;
++	int setgid;
++	uid_t   uid;
++	gid_t   gid;
++	umode_t mode;
++};
++
++extern struct devpts_config devpts_config;
++extern struct file_system_type devpts_fs_type;
+ #else
+ 
+ /* Dummy stubs in the no-pty case */
+diff --git a/include/linux/elevator.h b/include/linux/elevator.h
+index 639624b..be231eb 100644
+--- a/include/linux/elevator.h
++++ b/include/linux/elevator.h
+@@ -56,6 +56,11 @@ struct elevator_ops
+ 	elevator_init_fn *elevator_init_fn;
+ 	elevator_exit_fn *elevator_exit_fn;
+ 	void (*trim)(struct io_context *);
++	/* In the original CFQ design, a task holds a cfqq refcount and puts
++	 * it on exit via the io context. Now async cfqqs are held by UB,
++	 * so we need some way to put these queues. Use this function.
++	 */
++	void (*put_queue)(struct cfq_queue *);
+ };
+ 
+ #define ELV_NAME_MAX	(16)
+diff --git a/include/linux/elf.h b/include/linux/elf.h
+index ff9fbed..f7f8507 100644
+--- a/include/linux/elf.h
++++ b/include/linux/elf.h
+@@ -403,4 +403,6 @@ extern int elf_coredump_extra_notes_size(void);
+ extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset);
+ #endif
+ 
++extern int sysctl_at_vsyscall;
++
+ #endif /* _LINUX_ELF_H */
+diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
+index cf79853..919ebfd 100644
+--- a/include/linux/eventpoll.h
++++ b/include/linux/eventpoll.h
+@@ -15,6 +15,7 @@
+ #define _LINUX_EVENTPOLL_H
+ 
+ #include <linux/types.h>
++#include <linux/fs.h>
+ 
+ 
+ /* Valid opcodes to issue to sys_epoll_ctl() */
+@@ -60,6 +61,88 @@ static inline void eventpoll_init_file(struct file *file)
+ 	spin_lock_init(&file->f_ep_lock);
+ }
+ 
++struct epoll_filefd {
++	struct file *file;
++	int fd;
++};
++
++/*
++ * This structure is stored inside the "private_data" member of the file
++ * structure and represents the main data structure for the eventpoll
++ * interface.
++ */
++struct eventpoll {
++	/* Protects access to this structure */
++	spinlock_t lock;
++
++	/*
++	 * This mutex is used to ensure that files are not removed
++	 * while epoll is using them. This is held during the event
++	 * collection loop, the file cleanup path, the epoll file exit
++	 * code and the ctl operations.
++	 */
++	struct mutex mtx;
++
++	/* Wait queue used by sys_epoll_wait() */
++	wait_queue_head_t wq;
++
++	/* Wait queue used by file->poll() */
++	wait_queue_head_t poll_wait;
++
++	/* List of ready file descriptors */
++	struct list_head rdllist;
++
++	/* RB tree root used to store monitored fd structs */
++	struct rb_root rbr;
++
++	/*
++	 * This is a singly linked list that chains all the "struct epitem" that
++	 * happened while transferring ready events to userspace without
++	 * holding ->lock.
++	 */
++	struct epitem *ovflist;
++};
++
++/*
++ * Each file descriptor added to the eventpoll interface will
++ * have an entry of this type linked to the "rbr" RB tree.
++ */
++struct epitem {
++	/* RB tree node used to link this structure to the eventpoll RB tree */
++	struct rb_node rbn;
++
++	/* List header used to link this structure to the eventpoll ready list */
++	struct list_head rdllink;
++
++	/*
++	 * Works together with "struct eventpoll"->ovflist in keeping the
++	 * singly linked chain of items.
++	 */
++	struct epitem *next;
++
++	/* The file descriptor information this item refers to */
++	struct epoll_filefd ffd;
++
++	/* Number of active wait queues attached to poll operations */
++	int nwait;
++
++	/* List containing poll wait queues */
++	struct list_head pwqlist;
++
++	/* The "container" of this item */
++	struct eventpoll *ep;
++
++	/* List header used to link this item to the "struct file" items list */
++	struct list_head fllink;
++
++	/* The structure that describes the interested events and the source fd */
++	struct epoll_event event;
++};
++
++extern struct semaphore epsem;
++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
++int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++		     struct file *tfile, int fd);
+ 
+ /* Used to release the epoll bits inside the "struct file" */
+ void eventpoll_release_file(struct file *file);
+@@ -92,6 +175,8 @@ static inline void eventpoll_release(struct file *file)
+ 	eventpoll_release_file(file);
+ }
+ 
++extern struct mutex epmutex;
++
+ #else
+ 
+ static inline void eventpoll_init_file(struct file *file) {}
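
The eventpoll hunk exports internals (struct eventpoll, struct epitem,
ep_find(), ep_insert()) that mainline keeps private to fs/eventpoll.c,
presumably so checkpoint code can enumerate and re-create watched
descriptors. A hedged in-kernel sketch of what a restore path could do with
these exports; locking (ep->mtx) and error handling are elided, and the
function name is illustrative:

	/* Re-create a single epoll entry on restore. */
	static int restore_epitem(struct eventpoll *ep, struct file *tfile,
				  int fd, struct epoll_event *ev)
	{
		if (ep_find(ep, tfile, fd))
			return 0;	/* entry already present */
		return ep_insert(ep, ev, tfile, fd);
	}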
+diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h
+new file mode 100644
+index 0000000..e08c84d
+--- /dev/null
++++ b/include/linux/fairsched.h
+@@ -0,0 +1,86 @@
++/*
++ * Fair Scheduler
++ *
++ * Copyright (C) 2000-2008  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __LINUX_FAIRSCHED_H__
++#define __LINUX_FAIRSCHED_H__
++
++#define FAIRSCHED_SET_RATE      0
++#define FAIRSCHED_DROP_RATE     1
++#define FAIRSCHED_GET_RATE      2
++
++#ifdef __KERNEL__
++
++/* refcnt changes are protected by the tasklist write lock */
++struct fairsched_node {
++	struct task_group *tg;
++	int refcnt;
++	unsigned id;
++	struct list_head nodelist;
++
++	unsigned weight;
++	unsigned char rate_limited;
++	unsigned rate;
++#ifdef CONFIG_VE
++	struct ve_struct *owner_env;
++#endif
++};
++
++#ifdef CONFIG_VZ_FAIRSCHED
++
++#define FAIRSCHED_INIT_NODE_ID		INT_MAX
++
++extern struct fairsched_node fairsched_init_node;
++
++void fairsched_init_early(void);
++void fairsched_init_late(void);
++
++static inline int task_fairsched_node_id(struct task_struct *p)
++{
++	return p->fsched_node->id;
++}
++
++/* must be called with the tasklist write lock held */
++static inline void get_task_fairsched_node(struct task_struct *p)
++{
++	p->fsched_node->refcnt++;
++}
++static inline void put_task_fairsched_node(struct task_struct *p)
++{
++	p->fsched_node->refcnt--;
++}
++
++#define	INIT_VZ_FAIRSCHED		.fsched_node = &fairsched_init_node,
++
++#define FSCHWEIGHT_MAX                  ((1 << 16) - 1)
++#define FSCHRATE_SHIFT                  10
++#define FSCH_TIMESLICE                  16
++
++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
++		unsigned int newid);
++asmlinkage int sys_fairsched_rmnod(unsigned int id);
++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid);
++asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus);
++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight);
++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate);
++
++#else /* CONFIG_VZ_FAIRSCHED */
++
++static inline void fairsched_init_early(void) { }
++static inline void fairsched_init_late(void) { }
++static inline int task_fairsched_node_id(struct task_struct *p) { return 0; }
++static inline void get_task_fairsched_node(struct task_struct *p) { }
++static inline void put_task_fairsched_node(struct task_struct *p) { }
++
++#define	INIT_VZ_FAIRSCHED
++
++#endif /* CONFIG_VZ_FAIRSCHED */
++#endif /* __KERNEL__ */
++
++#endif /* __LINUX_FAIRSCHED_H__ */
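
The FAIRSCHED_SET_RATE / FAIRSCHED_DROP_RATE / FAIRSCHED_GET_RATE constants
select the operation for sys_fairsched_rate(). The syscall numbers are
arch-specific and not part of this hunk, so the wrapper below is purely
illustrative; __NR_fairsched_rate is assumed to be wired up elsewhere:

	#include <unistd.h>
	#include <sys/syscall.h>

	/* Hypothetical userspace wrapper around the rate interface. */
	static long fairsched_rate(unsigned int id, int op, unsigned int rate)
	{
		return syscall(__NR_fairsched_rate, id, op, rate);
	}

	/* e.g. cap the CPU rate of node 'id', then read it back:
	 *   fairsched_rate(id, FAIRSCHED_SET_RATE, rate);
	 *   fairsched_rate(id, FAIRSCHED_GET_RATE, 0);
	 */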
+diff --git a/include/linux/faudit.h b/include/linux/faudit.h
+new file mode 100644
+index 0000000..631c42e
+--- /dev/null
++++ b/include/linux/faudit.h
+@@ -0,0 +1,45 @@
++/*
++ *  include/linux/faudit.h
++ *
++ *  Copyright (C) 2005  SWSoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __FAUDIT_H_
++#define __FAUDIT_H_
++
++#include <linux/virtinfo.h>
++
++struct vfsmount;
++struct dentry;
++struct super_block;
++struct kstatfs;
++struct kstat;
++struct pt_regs;
++
++struct faudit_regs_arg {
++	int err;
++	struct pt_regs *regs;
++};
++
++struct faudit_stat_arg {
++	int err;
++	struct vfsmount *mnt;
++	struct dentry *dentry;
++	struct kstat *stat;
++};
++
++struct faudit_statfs_arg {
++	int err;
++	struct super_block *sb;
++	struct kstatfs *stat;
++};
++
++#define VIRTINFO_FAUDIT			(0)
++#define VIRTINFO_FAUDIT_STAT		(VIRTINFO_FAUDIT + 0)
++#define VIRTINFO_FAUDIT_STATFS		(VIRTINFO_FAUDIT + 1)
++
++#endif
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index d8e2762..f2c30f6 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -50,6 +50,7 @@ extern struct inodes_stat_t inodes_stat;
+ 
+ extern int leases_enable, lease_break_time;
+ 
++extern int odirect_enable;
+ #ifdef CONFIG_DNOTIFY
+ extern int dir_notify_enable;
+ #endif
+@@ -60,6 +61,7 @@ extern int dir_notify_enable;
+ #define MAY_WRITE 2
+ #define MAY_READ 4
+ #define MAY_APPEND 8
++#define MAY_QUOTACTL 16 /* for devgroup-vs-openvz only */
+ 
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
+@@ -68,6 +70,7 @@ extern int dir_notify_enable;
+ #define FMODE_LSEEK	4
+ #define FMODE_PREAD	8
+ #define FMODE_PWRITE	FMODE_PREAD	/* These go hand in hand */
++#define FMODE_QUOTACTL	4
+ 
+ /* File is being opened for execution. Primary users of this flag are
+    distributed filesystems that can use it to achieve correct ETXTBUSY
+@@ -94,6 +97,8 @@ extern int dir_notify_enable;
+ #define FS_REQUIRES_DEV 1 
+ #define FS_BINARY_MOUNTDATA 2
+ #define FS_HAS_SUBTYPE 4
++#define FS_VIRTUALIZED	64	/* Can mount this fstype inside ve */
++#define FS_MANGLE_PROC	128	/* hide some /proc/mounts info inside VE */
+ #define FS_REVAL_DOT	16384	/* Check the paths ".", ".." for staleness */
+ #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move()
+ 					 * during rename() internally.
+@@ -366,6 +371,9 @@ struct iattr {
+  * Includes for diskquotas.
+  */
+ #include <linux/quota.h>
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++#include <linux/vzquota_qlnk.h>
++#endif
+ 
+ /** 
+  * enum positive_aop_returns - aop return codes with specific semantics
+@@ -625,6 +633,9 @@ struct inode {
+ #ifdef CONFIG_QUOTA
+ 	struct dquot		*i_dquot[MAXQUOTAS];
+ #endif
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++	struct vz_quota_ilink	i_qlnk;
++#endif
+ 	struct list_head	i_devices;
+ 	union {
+ 		struct pipe_inode_info	*i_pipe;
+@@ -680,6 +691,8 @@ enum inode_i_mutex_lock_class
+ extern void inode_double_lock(struct inode *inode1, struct inode *inode2);
+ extern void inode_double_unlock(struct inode *inode1, struct inode *inode2);
+ 
++extern struct kmem_cache *inode_cachep;
++
+ /*
+  * NOTE: in a 32bit arch with a preemptable kernel and
+  * an UP compile the i_size_read/write must be atomic
+@@ -799,6 +812,7 @@ struct file {
+ 	struct fown_struct	f_owner;
+ 	unsigned int		f_uid, f_gid;
+ 	struct file_ra_state	f_ra;
++	struct user_beancounter	*f_ub;
+ 
+ 	u64			f_version;
+ #ifdef CONFIG_SECURITY
+@@ -816,6 +830,7 @@ struct file {
+ #ifdef CONFIG_DEBUG_WRITECOUNT
+ 	unsigned long f_mnt_write_state;
+ #endif
++	struct ve_struct	*owner_env;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -924,6 +939,9 @@ struct file_lock {
+ 	struct file *fl_file;
+ 	unsigned char fl_flags;
+ 	unsigned char fl_type;
++#ifdef CONFIG_BEANCOUNTERS
++	unsigned char fl_charged;
++#endif
+ 	loff_t fl_start;
+ 	loff_t fl_end;
+ 
+@@ -1245,6 +1263,7 @@ struct file_operations {
+ 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
+ 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+ 	int (*setlease)(struct file *, long, struct file_lock **);
++	struct file * (*get_host)(struct file *);
+ };
+ 
+ struct inode_operations {
+@@ -1311,6 +1330,7 @@ struct super_operations {
+ #ifdef CONFIG_QUOTA
+ 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
++	struct inode *(*get_quota_root)(struct super_block *);
+ #endif
+ };
+ 
+@@ -1487,8 +1507,14 @@ struct file_system_type {
+ 	struct lock_class_key i_mutex_key;
+ 	struct lock_class_key i_mutex_dir_key;
+ 	struct lock_class_key i_alloc_sem_key;
++
++	struct file_system_type *proto;
++	struct ve_struct *owner_env;
+ };
+ 
++void get_filesystem(struct file_system_type *fs);
++void put_filesystem(struct file_system_type *fs);
++
+ extern int get_sb_bdev(struct file_system_type *fs_type,
+ 	int flags, const char *dev_name, void *data,
+ 	int (*fill_super)(struct super_block *, void *, int),
+@@ -1528,6 +1554,11 @@ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
+ #define kern_mount(type) kern_mount_data(type, NULL)
++extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *,
++		struct file_system_type **, struct vfsmount **);
++extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *);
++extern void umount_ve_fs_type(struct file_system_type *local_fs_type);
++#define kern_umount mntput
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+@@ -1535,6 +1566,7 @@ extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
+ extern void drop_collected_mounts(struct vfsmount *);
+ 
+ extern int vfs_statfs(struct dentry *, struct kstatfs *);
++extern int faudit_statfs(struct super_block *, struct kstatfs *);
+ 
+ /* /sys/fs */
+ extern struct kobject *fs_kobj;
+@@ -1707,7 +1739,8 @@ extern int check_disk_change(struct block_device *);
+ extern int __invalidate_device(struct block_device *);
+ extern int invalidate_partition(struct gendisk *, int);
+ #endif
+-extern int invalidate_inodes(struct super_block *);
++extern int invalidate_inodes_check(struct super_block *, int check);
++#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0)
+ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ 					pgoff_t start, pgoff_t end,
+ 					bool be_atomic);
+@@ -2128,6 +2161,17 @@ static inline void free_secdata(void *secdata)
+ { }
+ #endif	/* CONFIG_SECURITY */
+ 
++static inline void *file_private(struct file *file)
++{
++	struct file *host = file;
++
++	while (host->f_op->get_host) {
++		host = host->f_op->get_host(host);
++		BUG_ON(host->f_mapping != file->f_mapping);
++	}
++	return host->private_data;
++}
++
+ struct ctl_table;
+ int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos);
+diff --git a/include/linux/futex.h b/include/linux/futex.h
+index 586ab56..9bf4c37 100644
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -124,7 +124,7 @@ struct robust_list_head {
+ #ifdef __KERNEL__
+ long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
+ 	      u32 __user *uaddr2, u32 val2, u32 val3);
+-
++long futex_wait_restart(struct restart_block *restart);
+ extern int
+ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
+ 
+diff --git a/include/linux/gfp.h b/include/linux/gfp.h
+index b414be3..fb1ad5a 100644
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -50,20 +50,25 @@ struct vm_area_struct;
+ #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
+ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
+ #define __GFP_MOVABLE	((__force gfp_t)0x100000u)  /* Page is movable */
++#define __GFP_UBC	((__force gfp_t)0x200000u)/* charge kmem in buddy and slab */
++#define __GFP_SOFT_UBC	((__force gfp_t)0x400000u)/* use soft charging */
+ 
+-#define __GFP_BITS_SHIFT 21	/* Room for 21 __GFP_FOO bits */
++#define __GFP_BITS_SHIFT 23	/* Room for __GFP_FOO bits */
+ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+ 
+ /* This equals 0, but use constants in case they ever change */
+ #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
+ /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
+ #define GFP_ATOMIC	(__GFP_HIGH)
++#define GFP_ATOMIC_UBC	(__GFP_HIGH | __GFP_UBC)
+ #define GFP_NOIO	(__GFP_WAIT)
+ #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
+ #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
++#define GFP_KERNEL_UBC	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC)
+ #define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
+ 			 __GFP_RECLAIMABLE)
+ #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
++#define GFP_USER_UBC	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC)
+ #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
+ 			 __GFP_HIGHMEM)
+ #define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
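
The new __GFP_UBC / __GFP_SOFT_UBC bits and the *_UBC composites presumably
mark allocations that should be charged against the current user beancounter
instead of going unaccounted. Usage would look like the ordinary GFP flags,
as in this kernel-side sketch:

	#include <linux/slab.h>

	/* Allocate a buffer charged to the caller's beancounter. */
	static void *alloc_charged(size_t size)
	{
		return kmalloc(size, GFP_KERNEL_UBC);
	}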
+diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
+index 181006c..5d48dcb 100644
+--- a/include/linux/hardirq.h
++++ b/include/linux/hardirq.h
+@@ -7,6 +7,9 @@
+ #include <asm/hardirq.h>
+ #include <asm/system.h>
+ 
++#include <bc/task.h>
++#include <linux/ve_task.h>
++
+ /*
+  * We put the hardirq and softirq counter into the preemption
+  * counter. The bitmask has the following meaning:
+@@ -126,6 +129,24 @@ extern void rcu_irq_exit(void);
+ # define rcu_irq_exit() do { } while (0)
+ #endif /* CONFIG_PREEMPT_RCU */
+ 
++#define save_context()		do {				\
++		struct task_struct *tsk;			\
++		if (hardirq_count() == HARDIRQ_OFFSET) {	\
++			tsk = current;				\
++			ve_save_context(tsk);			\
++			ub_save_context(tsk);			\
++		}						\
++	} while (0)
++
++#define restore_context()		do {			\
++		struct task_struct *tsk;			\
++		if (hardirq_count() == HARDIRQ_OFFSET) {	\
++			tsk = current;				\
++			ve_restore_context(tsk);		\
++			ub_restore_context(tsk);		\
++		}						\
++	} while (0)
++
+ /*
+  * It is safe to do non-atomic ops on ->hardirq_context,
+  * because NMI handlers may not preempt and the ops are
+@@ -137,6 +158,7 @@ extern void rcu_irq_exit(void);
+ 		rcu_irq_enter();			\
+ 		account_system_vtime(current);		\
+ 		add_preempt_count(HARDIRQ_OFFSET);	\
++		save_context();				\
+ 		trace_hardirq_enter();			\
+ 	} while (0)
+ 
+@@ -152,6 +174,7 @@ extern void irq_enter(void);
+ 	do {						\
+ 		trace_hardirq_exit();			\
+ 		account_system_vtime(current);		\
++		restore_context();			\
+ 		sub_preempt_count(HARDIRQ_OFFSET);	\
+ 		rcu_irq_exit();				\
+ 	} while (0)
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 6d93dce..b2c6f88 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -342,6 +342,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp,
+ 			      const enum hrtimer_mode mode,
+ 			      const clockid_t clockid);
+ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
++#ifdef CONFIG_COMPAT
++long compat_nanosleep_restart(struct restart_block *restart);
++#endif
+ 
+ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ 				 struct task_struct *tsk);
+diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
+index 950e13d..0ca89e7 100644
+--- a/include/linux/if_bridge.h
++++ b/include/linux/if_bridge.h
+@@ -44,6 +44,7 @@
+ #define BRCTL_SET_PORT_PRIORITY 16
+ #define BRCTL_SET_PATH_COST 17
+ #define BRCTL_GET_FDB_ENTRIES 18
++#define BRCTL_SET_VIA_ORIG_DEV 19
+ 
+ #define BR_STATE_DISABLED 0
+ #define BR_STATE_LISTENING 1
+@@ -72,6 +73,7 @@ struct __bridge_info
+ 	__u32 tcn_timer_value;
+ 	__u32 topology_change_timer_value;
+ 	__u32 gc_timer_value;
++	__u8 via_phys_dev;
+ };
+ 
+ struct __port_info
+@@ -106,9 +108,12 @@ struct __fdb_entry
+ 
+ #include <linux/netdevice.h>
+ 
++#define BR_ALREADY_SEEN 1
++
+ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
+ extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
+ 					       struct sk_buff *skb);
++extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
+ extern int (*br_should_route_hook)(struct sk_buff *skb);
+ 
+ #endif
+diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
+index 8c71fe2..df256e8 100644
+--- a/include/linux/if_tun.h
++++ b/include/linux/if_tun.h
+@@ -19,6 +19,7 @@
+ #define __IF_TUN_H
+ 
+ #include <linux/types.h>
++#include <linux/if_ether.h>
+ 
+ /* Read queue size */
+ #define TUN_READQ_SIZE	500
+@@ -55,4 +56,40 @@ struct tun_pi {
+ };
+ #define TUN_PKT_STRIP	0x0001
+ 
++struct sk_buff_head;
++struct tun_struct {
++	struct list_head        list;
++	unsigned long 		flags;
++	int			attached;
++	void			*bind_file;
++	uid_t			owner;
++	gid_t			group;
++
++	wait_queue_head_t	read_wait;
++	struct sk_buff_head	readq;
++
++	struct net_device	*dev;
++
++	struct fasync_struct    *fasync;
++
++	unsigned long if_flags;
++	u8 dev_addr[ETH_ALEN];
++	u32 chr_filter[2];
++	u32 net_filter[2];
++
++#ifdef TUN_DEBUG
++	int debug;
++#endif
++};
++
++struct tun_net {
++	struct list_head dev_list;
++};
++
++extern int tun_net_open(struct net_device *dev);
++extern int tun_chr_open(struct inode *inode, struct file * file);
++extern void tun_net_init(struct net_device *dev);
++extern void tun_setup(struct net_device *dev);
++extern struct list_head tun_dev_list;
++
+ #endif /* __IF_TUN_H */
+diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
+index 15ace02..ea02489 100644
+--- a/include/linux/if_vlan.h
++++ b/include/linux/if_vlan.h
+@@ -88,6 +88,9 @@ struct vlan_group {
+ 	struct hlist_node	hlist;	/* linked list */
+ 	struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS];
+ 	struct rcu_head		rcu;
++#ifdef CONFIG_VE
++	struct ve_struct	*owner;
++#endif
+ };
+ 
+ static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
+diff --git a/include/linux/init_task.h b/include/linux/init_task.h
+index 9927a88..f235468 100644
+--- a/include/linux/init_task.h
++++ b/include/linux/init_task.h
+@@ -10,6 +10,7 @@
+ #include <linux/user_namespace.h>
+ #include <linux/securebits.h>
+ #include <net/net_namespace.h>
++#include <linux/fairsched.h>
+ 
+ extern struct files_struct init_files;
+ 
+@@ -49,10 +50,17 @@ extern struct files_struct init_files;
+ 	.rlim		= INIT_RLIMITS,					\
+ }
+ 
++#ifdef CONFIG_VE
++/* one for ve0, one for init_task */
++#define INIT_NSPROXY_COUNT	ATOMIC_INIT(2)
++#else
++#define INIT_NSPROXY_COUNT	ATOMIC_INIT(1)
++#endif
++
+ extern struct nsproxy init_nsproxy;
+ #define INIT_NSPROXY(nsproxy) {						\
+ 	.pid_ns		= &init_pid_ns,					\
+-	.count		= ATOMIC_INIT(1),				\
++	.count		= INIT_NSPROXY_COUNT,				\
+ 	.uts_ns		= &init_uts_ns,					\
+ 	.mnt_ns		= NULL,						\
+ 	INIT_NET_NS(net_ns)                                             \
+@@ -179,6 +187,7 @@ extern struct group_info init_groups;
+ 	INIT_IDS							\
+ 	INIT_TRACE_IRQFLAGS						\
+ 	INIT_LOCKDEP							\
++	INIT_VZ_FAIRSCHED						\
+ }
+ 
+ 
+diff --git a/include/linux/inotify.h b/include/linux/inotify.h
+index 742b917..a935bb3 100644
+--- a/include/linux/inotify.h
++++ b/include/linux/inotify.h
+@@ -67,6 +67,7 @@ struct inotify_event {
+ 
+ #include <linux/dcache.h>
+ #include <linux/fs.h>
++#include <linux/idr.h>
+ 
+ /*
+  * struct inotify_watch - represents a watch request on a specific inode
+@@ -84,6 +85,7 @@ struct inotify_watch {
+ 	struct list_head	i_list;	/* entry in inode's list */
+ 	atomic_t		count;	/* reference count */
+ 	struct inotify_handle	*ih;	/* associated inotify handle */
++	struct path		path;
+ 	struct inode		*inode;	/* associated inode */
+ 	__s32			wd;	/* watch descriptor */
+ 	__u32			mask;	/* event mask for this watch */
+@@ -120,6 +122,8 @@ extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *,
+ 				       u32);
+ extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *,
+ 			       struct inode *, __u32);
++extern __s32 inotify_add_watch_dget(struct inotify_handle *, struct inotify_watch *,
++			       struct path *, __u32);
+ extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *);
+ extern void inotify_evict_watch(struct inotify_watch *);
+ extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *);
+@@ -129,6 +133,66 @@ extern void inotify_remove_watch_locked(struct inotify_handle *,
+ extern void get_inotify_watch(struct inotify_watch *);
+ extern void put_inotify_watch(struct inotify_watch *);
+ 
++/*
++ * struct inotify_handle - represents an inotify instance
++ *
++ * This structure is protected by the mutex 'mutex'.
++ */
++struct inotify_handle {
++	struct idr		idr;		/* idr mapping wd -> watch */
++	struct mutex		mutex;		/* protects this bad boy */
++	struct list_head	watches;	/* list of watches */
++	atomic_t		count;		/* reference count */
++	u32			last_wd;	/* the last wd allocated */
++	const struct inotify_operations *in_ops; /* inotify caller operations */
++};
++
++
++/*
++ * struct inotify_device - represents an inotify instance
++ *
++ * This structure is protected by the mutex 'mutex'.
++ */
++struct inotify_device {
++	wait_queue_head_t 	wq;		/* wait queue for i/o */
++	struct mutex		ev_mutex;	/* protects event queue */
++	struct mutex		up_mutex;	/* synchronizes watch updates */
++	struct list_head 	events;		/* list of queued events */
++	atomic_t		count;		/* reference count */
++	struct user_struct	*user;		/* user who opened this dev */
++	struct inotify_handle	*ih;		/* inotify handle */
++	struct fasync_struct    *fa;            /* async notification */
++	unsigned int		queue_size;	/* size of the queue (bytes) */
++	unsigned int		event_count;	/* number of pending events */
++	unsigned int		max_events;	/* maximum number of events */
++};
++
++/*
++ * struct inotify_kernel_event - An inotify event, originating from a watch and
++ * queued for user-space.  A list of these is attached to each instance of the
++ * device.  In read(), this list is walked and all events that can fit in the
++ * buffer are returned.
++ *
++ * Protected by dev->ev_mutex of the device in which we are queued.
++ */
++struct inotify_kernel_event {
++	struct inotify_event	event;	/* the user-space event */
++	struct list_head        list;	/* entry in inotify_device's list */
++	char			*name;	/* filename, if any */
++};
++
++/*
++ * struct inotify_user_watch - our version of an inotify_watch, we add
++ * a reference to the associated inotify_device.
++ */
++struct inotify_user_watch {
++	struct inotify_device	*dev;	/* associated device */
++	struct inotify_watch	wdata;	/* inotify watch data */
++};
++
++int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask);
++
++
+ #else
+ 
+ static inline void inotify_d_instantiate(struct dentry *dentry,
+@@ -198,6 +262,13 @@ static inline __s32 inotify_add_watch(struct inotify_handle *ih,
+ 	return -EOPNOTSUPP;
+ }
+ 
++static inline __s32 inotify_add_watch_dget(struct inotify_handle *h,
++					   struct inotify_watch *w,
++					   struct path *p, __u32 mask)
++{
++	return -EOPNOTSUPP;
++}
++
+ static inline int inotify_rm_watch(struct inotify_handle *ih,
+ 				   struct inotify_watch *watch)
+ {
+diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
+index f98a656..2d86ade 100644
+--- a/include/linux/ioprio.h
++++ b/include/linux/ioprio.h
+@@ -39,6 +39,7 @@ enum {
+ 	IOPRIO_WHO_PROCESS = 1,
+ 	IOPRIO_WHO_PGRP,
+ 	IOPRIO_WHO_USER,
++	IOPRIO_WHO_UBC = 1000,
+ };
+ 
+ /*
+diff --git a/include/linux/ipc.h b/include/linux/ipc.h
+index b882610..67d186c 100644
+--- a/include/linux/ipc.h
++++ b/include/linux/ipc.h
+@@ -81,6 +81,7 @@ struct ipc_kludge {
+ 
+ #include <linux/kref.h>
+ #include <linux/spinlock.h>
++#include <linux/rcupdate.h>
+ 
+ #define IPCMNI 32768  /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
+ 
+@@ -100,6 +101,15 @@ struct kern_ipc_perm
+ 	void		*security;
+ };
+ 
++struct ipc_ids;
++
++struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
++static inline void ipc_unlock(struct kern_ipc_perm *perm)
++{
++	spin_unlock(&perm->lock);
++	rcu_read_unlock();
++}
++
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_IPC_H */
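
The exported ipc_lock() takes rcu_read_lock() plus the per-object spinlock,
and the new inline ipc_unlock() drops both in reverse order. A minimal sketch
of the pairing a caller is expected to follow, with ids being whatever
ipc_ids table the caller owns:

        struct kern_ipc_perm *perm;

        perm = ipc_lock(ids, id);
        if (IS_ERR(perm))
                return PTR_ERR(perm);
        /* ... inspect or modify the object ... */
        ipc_unlock(perm);
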
+diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h
+index 2dacab8..91783a7 100644
+--- a/include/linux/kdev_t.h
++++ b/include/linux/kdev_t.h
+@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 dev)
+ 	return dev & 0x3ffff;
+ }
+ 
++#define UNNAMED_MAJOR_COUNT	16
++
++#if UNNAMED_MAJOR_COUNT > 1
++
++extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT];
++
++static inline dev_t make_unnamed_dev(int idx)
++{
++	/*
++	 * Move bits 8..8+log2(UNNAMED_MAJOR_COUNT)-1 of the unnamed
++	 * device index into the major number.
++	 */
++	return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)],
++		     idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8));
++}
++
++static inline int unnamed_dev_idx(dev_t dev)
++{
++	int i;
++	for (i = 0; i < UNNAMED_MAJOR_COUNT &&
++				MAJOR(dev) != unnamed_dev_majors[i]; i++);
++	return MINOR(dev) | (i << 8);
++}
++
++static inline int is_unnamed_dev(dev_t dev)
++{
++	int i;
++	for (i = 0; i < UNNAMED_MAJOR_COUNT &&
++				MAJOR(dev) != unnamed_dev_majors[i]; i++);
++	return i < UNNAMED_MAJOR_COUNT;
++}
++
++#else /* UNNAMED_MAJOR_COUNT */
++
++static inline dev_t make_unnamed_dev(int idx)
++{
++	return MKDEV(0, idx);
++}
++
++static inline int unnamed_dev_idx(dev_t dev)
++{
++	return MINOR(dev);
++}
++
++static inline int is_unnamed_dev(dev_t dev)
++{
++	return MAJOR(dev) == 0;
++}
++
++#endif /* UNNAMED_MAJOR_COUNT */
++
+ #else /* __KERNEL__ */
+ 
+ /*
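
A worked example of the encoding, assuming UNNAMED_MAJOR_COUNT == 16: bits
8..11 of the unnamed device index select one of the extra majors, the
remaining bits stay in the minor, and the round trip is lossless:

        dev_t dev = make_unnamed_dev(0x1234);

        /* (0x1234 >> 8) & 15 == 2, so MAJOR(dev) == unnamed_dev_majors[2];
         * MINOR(dev) == (0x1234 & ~0x0f00) == 0x1034 */
        BUG_ON(!is_unnamed_dev(dev));
        BUG_ON(unnamed_dev_idx(dev) != 0x1234);
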
+diff --git a/include/linux/kernel.h b/include/linux/kernel.h
+index 2e70006..5112a04 100644
+--- a/include/linux/kernel.h
++++ b/include/linux/kernel.h
+@@ -191,6 +191,12 @@ extern int log_buf_get_len(void);
+ extern int log_buf_read(int idx);
+ extern int log_buf_copy(char *dest, int idx, int len);
+ 
++asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
++	__attribute__ ((format (printf, 2, 0)));
++asmlinkage int ve_printk(int, const char * fmt, ...)
++	__attribute__ ((format (printf, 2, 3)));
++void prepare_printk(void);
++
+ extern int printk_ratelimit_jiffies;
+ extern int printk_ratelimit_burst;
+ extern int printk_ratelimit(void);
+@@ -208,6 +214,15 @@ static inline int __cold printk(const char *s, ...) { return 0; }
+ static inline int log_buf_get_len(void) { return 0; }
+ static inline int log_buf_read(int idx) { return 0; }
+ static inline int log_buf_copy(char *dest, int idx, int len) { return 0; }
++static inline int ve_printk(int d, const char *s, ...)
++	__attribute__ ((format (printf, 2, 3)));
++static inline int ve_printk(int d, const char *s, ...)
++{
++	return 0;
++}
++static inline void prepare_printk(void)
++{
++}
+ static inline int printk_ratelimit(void) { return 0; }
+ static inline int __printk_ratelimit(int ratelimit_jiffies, \
+ 				     int ratelimit_burst) { return 0; }
+@@ -216,14 +231,23 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
+ 		{ return false; }
+ #endif
+ 
++#define VE0_LOG		1
++#define VE_LOG		2
++#define VE_LOG_BOTH	(VE0_LOG | VE_LOG)
++
+ extern void __attribute__((format(printf, 1, 2)))
+ 	early_printk(const char *fmt, ...);
+ 
+ unsigned long int_sqrt(unsigned long);
+ 
++extern int console_silence_loglevel;
++
+ static inline void console_silent(void)
+ {
+-	console_loglevel = 0;
++	if (console_loglevel > console_silence_loglevel) {
++		printk(KERN_EMERG "console shuts up ...\n");
++		console_loglevel = 0;
++	}
+ }
+ 
+ static inline void console_verbose(void)
+@@ -237,6 +261,7 @@ extern void wake_up_klogd(void);
+ extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
+ extern int panic_timeout;
+ extern int panic_on_oops;
++extern int decode_call_traces;
+ extern int panic_on_unrecovered_nmi;
+ extern int tainted;
+ extern const char *print_tainted(void);
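
ve_printk() splits the kernel log per container: VE0_LOG targets the host
ring buffer, VE_LOG the calling container's own buffer, VE_LOG_BOTH both. A
call-site sketch with illustrative messages:

        /* visible only inside the container that triggered it */
        ve_printk(VE_LOG, KERN_WARNING "neighbour table overflow\n");

        /* visible both on the host and inside the container */
        ve_printk(VE_LOG_BOTH, KERN_ERR "fatal resource shortage\n");
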
+diff --git a/include/linux/kobject.h b/include/linux/kobject.h
+index 39e709f..74ee66d 100644
+--- a/include/linux/kobject.h
++++ b/include/linux/kobject.h
+@@ -52,6 +52,8 @@ enum kobject_action {
+ 	KOBJ_REMOVE,
+ 	KOBJ_CHANGE,
+ 	KOBJ_MOVE,
++	KOBJ_START,
++	KOBJ_STOP,
+ 	KOBJ_ONLINE,
+ 	KOBJ_OFFLINE,
+ 	KOBJ_MAX
+diff --git a/include/linux/major.h b/include/linux/major.h
+index 0cb9805..93c234c 100644
+--- a/include/linux/major.h
++++ b/include/linux/major.h
+@@ -170,4 +170,7 @@
+ 
+ #define VIOTAPE_MAJOR		230
+ 
++#define UNNAMED_EXTRA_MAJOR		130
++#define UNNAMED_EXTRA_MAJOR_COUNT	120
++
+ #endif
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 586a943..484bd0b 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -702,15 +702,7 @@ static inline int page_mapped(struct page *page)
+ 
+ extern void show_free_areas(void);
+ 
+-#ifdef CONFIG_SHMEM
+-int shmem_lock(struct file *file, int lock, struct user_struct *user);
+-#else
+-static inline int shmem_lock(struct file *file, int lock,
+-			     struct user_struct *user)
+-{
+-	return 0;
+-}
+-#endif
++#define shmem_nopage filemap_nopage
+ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
+ 
+ int shmem_zero_setup(struct vm_area_struct *);
+@@ -776,7 +768,9 @@ void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
+ 		unsigned long floor, unsigned long ceiling);
+ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+-			struct vm_area_struct *vma);
++		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
++int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma,
++		      unsigned long addr, size_t size);
+ void unmap_mapping_range(struct address_space *mapping,
+ 		loff_t const holebegin, loff_t const holelen, int even_cows);
+ 
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 02a27ae..b9c983e 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -94,6 +94,14 @@ struct page {
+ #ifdef CONFIG_CGROUP_MEM_RES_CTLR
+ 	unsigned long page_cgroup;
+ #endif
++#ifdef CONFIG_BEANCOUNTERS
++	/* FIXME: switch to mainline memcgroup */
++	union {
++		struct user_beancounter *page_ub;
++		struct page_beancounter *page_pb;
++		struct user_beancounter **slub_ubs;
++	} bc;
++#endif
+ };
+ 
+ /*
+@@ -219,12 +227,18 @@ struct mm_struct {
+ 
+ 	unsigned long flags; /* Must use atomic bitops to access the bits */
+ 
++	unsigned int vps_dumpable:2;
++	unsigned int oom_killed:1;
++
+ 	/* coredumping support */
+ 	struct completion *core_startup_done, core_done;
+ 
+ 	/* aio bits */
+ 	rwlock_t		ioctx_list_lock;	/* aio lock */
+ 	struct kioctx		*ioctx_list;
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *mm_ub;
++#endif
+ #ifdef CONFIG_MM_OWNER
+ 	/*
+ 	 * "owner" points to a task that is regarded as the canonical
+diff --git a/include/linux/mman.h b/include/linux/mman.h
+index dab8892..a8528e1 100644
+--- a/include/linux/mman.h
++++ b/include/linux/mman.h
+@@ -61,6 +61,9 @@ static inline unsigned long
+ calc_vm_flag_bits(unsigned long flags)
+ {
+ 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
++#ifdef MAP_GROWSUP
++	       _calc_vm_trans(flags, MAP_GROWSUP,    VM_GROWSUP ) |
++#endif
+ 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
+ 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+ 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
+index 830bbcd..fdc1225 100644
+--- a/include/linux/mnt_namespace.h
++++ b/include/linux/mnt_namespace.h
+@@ -24,6 +24,8 @@ struct proc_mounts {
+ 
+ extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
+ 		struct fs_struct *);
++extern struct rw_semaphore namespace_sem;
++
+ extern void __put_mnt_ns(struct mnt_namespace *ns);
+ 
+ static inline void put_mnt_ns(struct mnt_namespace *ns)
+diff --git a/include/linux/mount.h b/include/linux/mount.h
+index 4374d1a..af1d137 100644
+--- a/include/linux/mount.h
++++ b/include/linux/mount.h
+@@ -71,6 +71,7 @@ struct vfsmount {
+ 	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+ 	 */
+ 	atomic_t __mnt_writers;
++	unsigned owner;
+ };
+ 
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
+diff --git a/include/linux/msg.h b/include/linux/msg.h
+index 56abf15..050f740 100644
+--- a/include/linux/msg.h
++++ b/include/linux/msg.h
+@@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext,
+ extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
+ 			size_t msgsz, long msgtyp, int msgflg);
+ 
++int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg);
++int sysvipc_setup_msg(key_t key, int msqid, int msgflg);
++int sysv_msg_store(struct msg_msg *msg,
++		   int (*store)(void * src, int len, int offset, void * data),
++		   int len, void * data);
++struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
++					  void * data), int len, void * data);
++
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_MSG_H */
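
sysvipc_walk_msg() is the enumerator the checkpointing code needs for message
queues. A minimal sketch of a walker; the counting callback is illustrative,
and the abort-on-nonzero return convention is an assumption the prototype
does not spell out:

        static int msq_count(int id, struct msg_queue *msq, void *arg)
        {
                (*(int *)arg)++;
                return 0;
        }

        static int nr_msg_queues(void)
        {
                int n = 0;

                sysvipc_walk_msg(msq_count, &n);
                return n;
        }
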
+diff --git a/include/linux/namei.h b/include/linux/namei.h
+index 24d88e9..5ac5b00 100644
+--- a/include/linux/namei.h
++++ b/include/linux/namei.h
+@@ -56,6 +56,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+ #define LOOKUP_CREATE		(0x0200)
+ #define LOOKUP_ACCESS		(0x0400)
+ #define LOOKUP_CHDIR		(0x0800)
++#define LOOKUP_NOAREACHECK	(0x1000)	/* no area check on lookup */
++#define LOOKUP_STRICT		(0x2000)	/* no symlinks or other filesystems */
+ 
+ extern int __user_walk(const char __user *, unsigned, struct nameidata *);
+ extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *);
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index 25f8710..bacc0a0 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -291,6 +291,11 @@ enum netdev_state_t
+ 	__LINK_STATE_QDISC_RUNNING,
+ };
+ 
++struct netdev_bc {
++	struct user_beancounter *exec_ub, *owner_ub;
++};
++
++#define netdev_bc(dev)		(&(dev)->dev_bc)
+ 
+ /*
+  * This structure holds at boot time configured netdevice settings. They
+@@ -527,6 +532,10 @@ struct net_device
+ #define NETIF_F_GSO_ROBUST	(SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
+ #define NETIF_F_TSO_ECN		(SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
+ #define NETIF_F_TSO6		(SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
++/* device is venet device */
++#define NETIF_F_VENET		(1 << (NETIF_F_GSO_SHIFT - 1))
++/* can be registered inside VE */
++#define NETIF_F_VIRTUAL		(1 << (NETIF_F_GSO_SHIFT - 2))
+ 
+ 	/* List of features with software fallbacks. */
+ #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
+@@ -741,6 +750,9 @@ struct net_device
+ 	/* macvlan */
+ 	struct macvlan_port	*macvlan_port;
+ 
++	struct ve_struct	*owner_env; /* Owner VE of the interface */
++	struct netdev_bc	dev_bc;
++
+ 	/* class/net/name entry */
+ 	struct device		dev;
+ 	/* space for optional statistics and wireless sysfs groups */
+@@ -762,6 +774,20 @@ struct net_device
+ };
+ #define to_net_dev(d) container_of(d, struct net_device, dev)
+ 
++#define NETDEV_HASHBITS	8
++#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
++
++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
++{
++	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
++	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
++}
++
++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
++{
++	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
++}
++
+ #define	NETDEV_ALIGN		32
+ #define	NETDEV_ALIGN_CONST	(NETDEV_ALIGN - 1)
+ 
+@@ -1148,6 +1174,9 @@ extern int		dev_ethtool(struct net *net, struct ifreq *);
+ extern unsigned		dev_get_flags(const struct net_device *);
+ extern int		dev_change_flags(struct net_device *, unsigned);
+ extern int		dev_change_name(struct net_device *, char *);
++int __dev_change_net_namespace(struct net_device *, struct net *, const char *,
++			struct ve_struct *src_ve, struct ve_struct *dst_ve,
++			struct user_beancounter *exec_ub);
+ extern int		dev_change_net_namespace(struct net_device *,
+ 						 struct net *, const char *);
+ extern int		dev_set_mtu(struct net_device *, int);
+@@ -1513,6 +1542,18 @@ extern void linkwatch_run_queue(void);
+ 
+ extern int netdev_compute_features(unsigned long all, unsigned long one);
+ 
++#if defined(CONFIG_VE) && defined(CONFIG_NET)
++static inline int ve_is_dev_movable(struct net_device *dev)
++{
++	return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL));
++}
++#else
++static inline int ve_is_dev_movable(struct net_device *dev)
++{
++	return 0;
++}
++#endif
++
+ static inline int net_gso_ok(int features, int gso_type)
+ {
+ 	int feature = gso_type << NETIF_F_GSO_SHIFT;
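
ve_is_dev_movable() is the filter deciding which interfaces may follow a
container: venet/veth-style NETIF_F_VIRTUAL devices and NETIF_F_NETNS_LOCAL
ones stay put. A sketch of the selection loop, with the actual move left out:

        struct net_device *dev;

        for_each_netdev(net, dev) {
                if (!ve_is_dev_movable(dev))
                        continue;
                /* candidate for __dev_change_net_namespace() */
        }
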
+diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
+index 0c5eb7e..8d41ea4 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -394,5 +394,24 @@ static inline struct net *nf_post_routing_net(const struct net_device *in,
+ #endif
+ }
+ 
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/vziptable_defs.h>
++
++#define net_ipt_module_permitted(netns, ipt)				\
++	(VE_IPT_CMP((netns)->owner_ve->ipt_mask, ipt) &&		\
++	 VE_IPT_CMP((netns)->owner_ve->_iptables_modules,		\
++		(ipt) & ~(ipt##_MOD)))
++
++#define net_ipt_module_set(netns, ipt)	({				\
++		(netns)->owner_ve->_iptables_modules |= ipt##_MOD;	\
++		})
++#define net_is_ipt_module_set(netns, ipt)     (	\
++		(netns)->owner_ve->_iptables_modules & (ipt##_MOD))
++#else
++#define net_ipt_module_permitted(netns, ipt)  (1)
++#define net_ipt_module_set(netns, ipt)
++#define net_is_ipt_module_set(netns, ipt)     (1)
++#endif
++
+ #endif /*__KERNEL__*/
+ #endif /*__LINUX_NETFILTER_H*/
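
A hedged sketch of how a per-netns module init might gate itself on the VE's
iptables mask; VE_IP_CONNTRACK and its VE_IP_CONNTRACK_MOD companion are
assumed to come from <linux/vziptable_defs.h>:

        if (!net_ipt_module_permitted(net, VE_IP_CONNTRACK))
                return 0;       /* quietly skip in restricted containers */
        net_ipt_module_set(net, VE_IP_CONNTRACK);
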
+diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
+index 2326296..7a66377 100644
+--- a/include/linux/netfilter/x_tables.h
++++ b/include/linux/netfilter/x_tables.h
+@@ -302,6 +302,7 @@ struct xt_table_info
+ {
+ 	/* Size per table */
+ 	unsigned int size;
++	unsigned int alloc_size;
+ 	/* Number of entries: FIXME. --RR */
+ 	unsigned int number;
+ 	/* Initial number of entries. Needed for module usage count */
+diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h
+index 51b18d8..439da56 100644
+--- a/include/linux/netfilter/xt_hashlimit.h
++++ b/include/linux/netfilter/xt_hashlimit.h
+@@ -63,4 +63,11 @@ struct xt_hashlimit_mtinfo1 {
+ 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+ };
+ 
++#ifdef __KERNEL__
++struct ve_xt_hashlimit {
++	struct hlist_head	hashlimit_htables;
++	struct proc_dir_entry	*hashlimit_procdir4;
++	struct proc_dir_entry	*hashlimit_procdir6;
++};
++#endif
+ #endif /*_XT_HASHLIMIT_H*/
+diff --git a/include/linux/netfilter_ipv4/ipt_recent.h b/include/linux/netfilter_ipv4/ipt_recent.h
+index 6508a45..3b9a1e8 100644
+--- a/include/linux/netfilter_ipv4/ipt_recent.h
++++ b/include/linux/netfilter_ipv4/ipt_recent.h
+@@ -24,4 +24,12 @@ struct ipt_recent_info {
+ 	u_int8_t    side;
+ };
+ 
++#ifdef __KERNEL__
++struct ve_ipt_recent {
++	struct list_head	tables;
++#ifdef CONFIG_PROC_FS
++	struct proc_dir_entry	*proc_dir;
++#endif
++};
++#endif
+ #endif /*_IPT_RECENT_H*/
+diff --git a/include/linux/nfcalls.h b/include/linux/nfcalls.h
+new file mode 100644
+index 0000000..f968054
+--- /dev/null
++++ b/include/linux/nfcalls.h
+@@ -0,0 +1,172 @@
++/*
++ *  include/linux/nfcalls.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_NFCALLS_H
++#define _LINUX_NFCALLS_H
++
++#include <linux/rcupdate.h>
++
++#ifdef CONFIG_MODULES
++extern struct module no_module;
++
++#define DECL_KSYM_MODULE(name)				\
++	extern struct module *vz_mod_##name
++
++#define INIT_KSYM_MODULE(name)				\
++	struct module *vz_mod_##name = &no_module;	\
++	EXPORT_SYMBOL(vz_mod_##name)
++
++static inline void __vzksym_modresolve(struct module **modp, struct module *mod)
++{
++	/*
++	 * we want to be sure that pointer updates are visible first:
++	 * 1. wmb() is here only to be on the safe side
++	 *    (note, no rmb() in KSYMSAFECALL)
++	 * 2. synchronize_sched() guarantees that updates are visible
++	 *    on all cpus and allows us to remove rmb() in KSYMSAFECALL
++	 */
++	wmb(); synchronize_sched();
++	/* make sure our changes become visible as soon as possible */
++	/* just to be sure, our changes are visible as soon as possible */
++	wmb(); synchronize_sched();
++}
++
++static inline void __vzksym_modunresolve(struct module **modp)
++{
++	/*
++	 * try_module_get() in KSYMSAFECALL should fail at this moment since
++	 * THIS_MODULE is in unloading state (we should be called from fini),
++	 * so there is no need to synchronize pointer/ve_module updates.
++	 */
++	*modp = &no_module;
++	/*
++	 * synchronize_sched() guarantees here that we see
++	 * updated module pointer before the module really gets away
++	 */
++	synchronize_sched();
++}
++
++static inline int __vzksym_module_get(struct module *mod)
++{
++	/*
++	 * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE
++	 * and smp_read_barrier_depends() here...
++	 */
++	smp_read_barrier_depends(); /* for module loading */
++	if (!try_module_get(mod))
++		return -EBUSY;
++
++	return 0;
++}
++
++static inline void __vzksym_module_put(struct module *mod)
++{
++	module_put(mod);
++}
++#else
++#define DECL_KSYM_MODULE(name)
++#define INIT_KSYM_MODULE(name)
++#define __vzksym_modresolve(modp, mod)
++#define __vzksym_modunresolve(modp)
++#define __vzksym_module_get(mod)	0
++#define __vzksym_module_put(mod)
++#endif
++
++#define __KSYMERRCALL(err, type, mod, name, args)	\
++({							\
++	type ret = (type)err;				\
++	if (!__vzksym_module_get(vz_mod_##mod)) {	\
++		if (vz_##name)				\
++			ret = ((*vz_##name)args);	\
++		__vzksym_module_put(vz_mod_##mod);	\
++	}						\
++	ret;						\
++})
++
++#define __KSYMSAFECALL_VOID(mod, name, args)			\
++	do {							\
++		if (!__vzksym_module_get(vz_mod_##mod)) {	\
++			if (vz_##name)				\
++				((*vz_##name)args);		\
++			__vzksym_module_put(vz_mod_##mod);	\
++		}						\
++	} while (0)
++
++#define DECL_KSYM_CALL(type, name, args)               \
++	extern type (*vz_##name) args
++#define INIT_KSYM_CALL(type, name, args)               \
++	type (*vz_##name) args;                         \
++EXPORT_SYMBOL(vz_##name)
++
++#define KSYMERRCALL(err, mod, name, args)              \
++	__KSYMERRCALL(err, int, mod, name, args)
++#define KSYMSAFECALL(type, mod, name, args)            \
++	__KSYMERRCALL(0, type, mod, name, args)
++#define KSYMSAFECALL_VOID(mod, name, args)             \
++	__KSYMSAFECALL_VOID(mod, name, args)
++#define KSYMREF(name)                                  vz_##name
++
++/* should be called _after_ KSYMRESOLVE's */
++#define KSYMMODRESOLVE(name)                           \
++	__vzksym_modresolve(&vz_mod_##name, THIS_MODULE)
++#define KSYMMODUNRESOLVE(name)                         \
++	__vzksym_modunresolve(&vz_mod_##name)
++
++#define KSYMRESOLVE(name)                              \
++	vz_##name = &name
++#define KSYMUNRESOLVE(name)                            \
++	vz_##name = NULL
++
++#if defined(CONFIG_VE)
++DECL_KSYM_MODULE(ip_tables);
++DECL_KSYM_MODULE(ip6_tables);
++DECL_KSYM_MODULE(iptable_filter);
++DECL_KSYM_MODULE(ip6table_filter);
++DECL_KSYM_MODULE(iptable_mangle);
++DECL_KSYM_MODULE(ip6table_mangle);
++DECL_KSYM_MODULE(ip_conntrack);
++DECL_KSYM_MODULE(nf_conntrack);
++DECL_KSYM_MODULE(nf_conntrack_ipv4);
++DECL_KSYM_MODULE(nf_conntrack_ipv6);
++DECL_KSYM_MODULE(xt_conntrack);
++DECL_KSYM_MODULE(ip_nat);
++DECL_KSYM_MODULE(nf_nat);
++DECL_KSYM_MODULE(iptable_nat);
++
++struct sk_buff;
++
++DECL_KSYM_CALL(int, init_iptable_conntrack, (void));
++DECL_KSYM_CALL(int, nf_conntrack_init_ve, (void));
++DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void));
++DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void));
++DECL_KSYM_CALL(int, nf_nat_init, (void));
++DECL_KSYM_CALL(int, init_nftable_nat, (void));
++DECL_KSYM_CALL(void, fini_nftable_nat, (void));
++DECL_KSYM_CALL(void, nf_nat_cleanup, (void));
++DECL_KSYM_CALL(void, fini_iptable_conntrack, (void));
++DECL_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void));
++DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void));
++DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void));
++
++#include <linux/netfilter/x_tables.h>
++#endif
++
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++DECL_KSYM_MODULE(vzethdev);
++DECL_KSYM_CALL(int, veth_open, (struct net_device *dev));
++#endif
++
++#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE)
++DECL_KSYM_MODULE(vzmon);
++DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env));
++#endif
++
++#endif /* _LINUX_NFCALLS_H */
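
Putting the pieces together, the intended lifecycle for the vzethdev hooks
declared above looks roughly as follows; this is assembled from the macro
definitions, not copied from the patch:

        /* module init: publish the entry point, then the module pointer
         * (KSYMMODRESOLVE must come after all KSYMRESOLVEs, see above) */
        KSYMRESOLVE(veth_open);
        KSYMMODRESOLVE(vzethdev);

        /* module exit: reverse order */
        KSYMMODUNRESOLVE(vzethdev);
        KSYMUNRESOLVE(veth_open);

        /* call site in the core kernel: evaluates to -ENODEV
         * when vzethdev is not loaded */
        err = KSYMERRCALL(-ENODEV, vzethdev, veth_open, (dev));
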
+diff --git a/include/linux/notifier.h b/include/linux/notifier.h
+index 0ff6224..1e22bad 100644
+--- a/include/linux/notifier.h
++++ b/include/linux/notifier.h
+@@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ 
+ #define NOTIFY_DONE		0x0000		/* Don't care */
+ #define NOTIFY_OK		0x0001		/* Suits me */
++#define NOTIFY_FAIL		0x0002		/* Reject */
+ #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
+-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
++#define NOTIFY_BAD		(NOTIFY_STOP_MASK|NOTIFY_FAIL)
+ 						/* Bad/Veto action */
+ /*
+  * Clean way to return from the notifier and stop further calls.
+diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
+index 0e66b57..dd6d50f 100644
+--- a/include/linux/nsproxy.h
++++ b/include/linux/nsproxy.h
+@@ -66,6 +66,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+ void exit_task_namespaces(struct task_struct *tsk);
+ void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
+ void free_nsproxy(struct nsproxy *ns);
++struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk);
+ int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
+ 	struct fs_struct *);
+ 
+@@ -76,9 +77,10 @@ static inline void put_nsproxy(struct nsproxy *ns)
+ 	}
+ }
+ 
+-static inline void get_nsproxy(struct nsproxy *ns)
++static inline struct nsproxy *get_nsproxy(struct nsproxy *ns)
+ {
+ 	atomic_inc(&ns->count);
++	return ns;
+ }
+ 
+ #ifdef CONFIG_CGROUP_NS
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index f31debf..5b44dd6 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+ __PAGEFLAG(Slab, slab)
+ PAGEFLAG(Checked, owner_priv_1)		/* Used by some filesystems */
+ PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */
++PAGEFLAG(Checkpointed, owner_priv_1)
+ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
+ PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
+ 	__SETPAGEFLAG(Private, private)
+diff --git a/include/linux/percpu.h b/include/linux/percpu.h
+index 4cdd393..5ac97e1 100644
+--- a/include/linux/percpu.h
++++ b/include/linux/percpu.h
+@@ -74,6 +74,13 @@ struct percpu_data {
+         (__typeof__(ptr))__p->ptrs[(cpu)];	          \
+ })
+ 
++#define static_percpu_ptr(sptr, sptrs) ({		\
++		int i;					\
++		for (i = 0; i < NR_CPUS; i++)		\
++			(sptr)->ptrs[i] = &(sptrs)[i];	\
++		(__typeof__(&sptrs[0]))__percpu_disguise(sptr);\
++	})
++
+ extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu);
+ extern void percpu_depopulate(void *__pdata, int cpu);
+ extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+@@ -85,6 +92,7 @@ extern void percpu_free(void *__pdata);
+ #else /* CONFIG_SMP */
+ 
+ #define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
++#define static_percpu_ptr(sptr, sptrs)	(&sptrs[0])
+ 
+ static inline void percpu_depopulate(void *__pdata, int cpu)
+ {
+diff --git a/include/linux/pid.h b/include/linux/pid.h
+index c21c7e8..4331c6b 100644
+--- a/include/linux/pid.h
++++ b/include/linux/pid.h
+@@ -59,6 +59,9 @@ struct pid
+ 	atomic_t count;
+ 	/* lists of tasks that use this pid */
+ 	struct hlist_head tasks[PIDTYPE_MAX];
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *ub;
++#endif
+ 	struct rcu_head rcu;
+ 	unsigned int level;
+ 	struct upid numbers[1];
+@@ -96,6 +99,11 @@ extern void change_pid(struct task_struct *task, enum pid_type,
+ 			struct pid *pid);
+ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
+ 			 enum pid_type);
++extern void reattach_pid(struct task_struct *, enum pid_type, struct pid *);
++extern int alloc_pidmap(struct pid_namespace *pid_ns);
++extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid);
++
++extern spinlock_t pidmap_lock;
+ 
+ struct pid_namespace;
+ extern struct pid_namespace init_pid_ns;
+@@ -121,8 +129,11 @@ extern struct pid *find_get_pid(int nr);
+ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
+ int next_pidmap(struct pid_namespace *pid_ns, int last);
+ 
+-extern struct pid *alloc_pid(struct pid_namespace *ns);
++extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid);
+ extern void free_pid(struct pid *pid);
++extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *);
++extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *);
++pid_t pid_to_vpid(pid_t nr);
+ 
+ /*
+  * the helpers to get the pid's id seen from different namespaces
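
The vpid argument to alloc_pid() is the checkpoint/restore hook: a non-zero
value requests exactly that numeric id, presumably via the set_pidmap()
helper exported next to it, while zero keeps the usual "next free pid"
behaviour. A hedged restore-path sketch:

        struct pid *pid;

        /* vpid comes from the checkpoint image */
        pid = alloc_pid(task_active_pid_ns(current), vpid);
        if (!pid)
                return -ENOMEM; /* out of memory, or vpid already in use */
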
+diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
+index caff528..82514f2 100644
+--- a/include/linux/pid_namespace.h
++++ b/include/linux/pid_namespace.h
+@@ -14,6 +14,14 @@ struct pidmap {
+ 
+ #define PIDMAP_ENTRIES         ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
+ 
++/* pid namespace flags */
++
++/* if set newly created pid ns got PID_NS_HIDE_CHILD flag */
++#define PID_NS_HIDE_CHILD	0x00000001
++
++/* if set newly created processes invisible from parent ns*/
++#define PID_NS_HIDDEN		0x00000002
++
+ struct pid_namespace {
+ 	struct kref kref;
+ 	struct pidmap pidmap[PIDMAP_ENTRIES];
+@@ -22,6 +30,7 @@ struct pid_namespace {
+ 	struct kmem_cache *pid_cachep;
+ 	unsigned int level;
+ 	struct pid_namespace *parent;
++	unsigned flags;
+ #ifdef CONFIG_PROC_FS
+ 	struct vfsmount *proc_mnt;
+ #endif
+diff --git a/include/linux/poll.h b/include/linux/poll.h
+index ef45382..c1bf82a 100644
+--- a/include/linux/poll.h
++++ b/include/linux/poll.h
+@@ -119,6 +119,7 @@ extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
+ 		       s64 *timeout);
+ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+ 			   fd_set __user *exp, s64 *timeout);
++long do_restart_poll(struct restart_block *restart_block);
+ 
+ #endif /* KERNEL */
+ 
+diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
+index fff1d27..62e16d4 100644
+--- a/include/linux/proc_fs.h
++++ b/include/linux/proc_fs.h
+@@ -125,7 +125,10 @@ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
+ extern struct vfsmount *proc_mnt;
+ struct pid_namespace;
+ extern int proc_fill_super(struct super_block *);
+-extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
++extern struct inode *proc_get_inode(struct super_block *, unsigned int,
++		struct proc_dir_entry *glob, struct proc_dir_entry *loc);
++
++extern struct file_system_type proc_fs_type;
+ 
+ /*
+  * These are generic /proc routines that use the internal
+@@ -174,6 +177,8 @@ extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *);
+ extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
+ 			struct proc_dir_entry *parent);
+ 
++extern struct proc_dir_entry glob_proc_root;
++
+ static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode,
+ 	struct proc_dir_entry *parent, const struct file_operations *proc_fops)
+ {
+@@ -287,6 +292,9 @@ struct proc_inode {
+ 	int fd;
+ 	union proc_op op;
+ 	struct proc_dir_entry *pde;
++#ifdef CONFIG_VE
++	struct proc_dir_entry *lpde;
++#endif
+ 	struct inode vfs_inode;
+ };
+ 
+@@ -300,6 +308,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
+ 	return PROC_I(inode)->pde;
+ }
+ 
++static inline struct proc_dir_entry *LPDE(const struct inode *inode)
++{
++#ifdef CONFIG_VE
++	return PROC_I(inode)->lpde;
++#else
++	return NULL;
++#endif
++}
++
+ static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+ {
+ 	return pde->parent->data;
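
proc_get_inode() now carries a global/local entry pair and LPDE() exposes the
per-VE (local) entry next to the global PDE(). A sketch of the precedence a
lookup might apply; the fallback rule is an assumption:

        struct proc_dir_entry *pde;

        pde = LPDE(inode);              /* per-VE entry, NULL without CONFIG_VE */
        if (!pde)
                pde = PDE(inode);       /* fall back to the global entry */
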
+diff --git a/include/linux/quota.h b/include/linux/quota.h
+index dcddfb2..97dacfd 100644
+--- a/include/linux/quota.h
++++ b/include/linux/quota.h
+@@ -166,6 +166,10 @@ enum {
+ #include <linux/spinlock.h>
+ #include <linux/wait.h>
+ 
++#include <linux/spinlock.h>
++
++extern spinlock_t dq_data_lock;
++
+ #include <linux/dqblk_xfs.h>
+ #include <linux/dqblk_v1.h>
+ #include <linux/dqblk_v2.h>
+@@ -282,6 +286,8 @@ struct quota_format_ops {
+ 	int (*release_dqblk)(struct dquot *dquot);	/* Called when last reference to dquot is being dropped */
+ };
+ 
++struct inode;
++struct iattr;
+ /* Operations working with dquots */
+ struct dquot_operations {
+ 	int (*initialize) (struct inode *, int);
+@@ -296,9 +302,11 @@ struct dquot_operations {
+ 	int (*release_dquot) (struct dquot *);		/* Quota is going to be deleted from disk */
+ 	int (*mark_dirty) (struct dquot *);		/* Dquot is marked dirty */
+ 	int (*write_info) (struct super_block *, int);	/* Write of quota "superblock" */
++	int (*rename) (struct inode *, struct inode *, struct inode *);
+ };
+ 
+ /* Operations handling requests from userspace */
++struct v2_disk_dqblk;
+ struct quotactl_ops {
+ 	int (*quota_on)(struct super_block *, int, int, char *, int);
+ 	int (*quota_off)(struct super_block *, int, int);
+@@ -311,6 +319,10 @@ struct quotactl_ops {
+ 	int (*set_xstate)(struct super_block *, unsigned int, int);
+ 	int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
+ 	int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
++#ifdef CONFIG_QUOTA_COMPAT
++	int (*get_quoti)(struct super_block *, int, unsigned int,
++			struct v2_disk_dqblk __user *);
++#endif
+ };
+ 
+ struct quota_format_type {
+@@ -335,6 +347,10 @@ struct quota_info {
+ 	struct inode *files[MAXQUOTAS];		/* inodes of quotafiles */
+ 	struct mem_dqinfo info[MAXQUOTAS];	/* Information for each quota type */
+ 	struct quota_format_ops *ops[MAXQUOTAS];	/* Operations for each type */
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++	struct vz_quota_master *vzdq_master;
++	int vzdq_count;
++#endif
+ };
+ 
+ #define sb_has_quota_enabled(sb, type) ((type)==USRQUOTA ? \
+diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
+index f867020..a1bcf67 100644
+--- a/include/linux/quotaops.h
++++ b/include/linux/quotaops.h
+@@ -170,6 +170,19 @@ static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr)
+ 	return 0;
+ }
+ 
++static __inline__ int DQUOT_RENAME(struct inode *inode,
++		struct inode *old_dir, struct inode *new_dir)
++{
++	struct dquot_operations *q_op;
++
++	q_op = inode->i_sb->dq_op;
++	if (q_op && q_op->rename) {
++		if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA)
++			return 1;
++	}
++	return 0;
++}
++
+ /* The following two functions cannot be called inside a transaction */
+ static inline void DQUOT_SYNC(struct super_block *sb)
+ {
+@@ -244,6 +257,12 @@ static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr)
+ 	return 0;
+ }
+ 
++static inline int DQUOT_RENAME(struct inode *inode, struct inode *old_dir,
++		struct inode *new_dir)
++{
++	return 0;
++}
++
+ static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
+ {
+ 	inode_add_bytes(inode, nr);
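
DQUOT_RENAME() mirrors DQUOT_TRANSFER(): it returns 1 when the filesystem's
->rename hook answers NO_QUOTA, so a caller on the VFS rename path can refuse
to move an inode between directories (and thus between quota masters). A
sketch; the errno choice is illustrative:

        if (DQUOT_RENAME(inode, old_dir, new_dir))
                return -EDQUOT;
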
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h
+index 1383692..be68e7a 100644
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -74,6 +74,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+ void page_add_file_rmap(struct page *);
+ void page_remove_rmap(struct page *, struct vm_area_struct *);
++struct anon_vma *page_lock_anon_vma(struct page *page);
++void page_unlock_anon_vma(struct anon_vma *anon_vma);
+ 
+ #ifdef CONFIG_DEBUG_VM
+ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index c5d3f84..272da80 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -29,6 +29,10 @@
+ #define CLONE_NEWNET		0x40000000	/* New network namespace */
+ #define CLONE_IO		0x80000000	/* Clone io context */
+ 
++/* mask of clones which are disabled in OpenVZ VEs */
++#define CLONE_NAMESPACES_MASK	(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | \
++				 CLONE_NEWPID | CLONE_NEWNET)
++
+ /*
+  * Scheduling policies
+  */
+@@ -90,6 +94,8 @@ struct sched_param {
+ 
+ #include <asm/processor.h>
+ 
++#include <bc/task.h>
++
+ struct mem_cgroup;
+ struct exec_domain;
+ struct futex_pi_state;
+@@ -126,15 +132,38 @@ extern unsigned long avenrun[];		/* Load averages */
+ 	load += n*(FIXED_1-exp); \
+ 	load >>= FSHIFT;
+ 
++#define LOAD_INT(x) ((x) >> FSHIFT)
++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
++
+ extern unsigned long total_forks;
+ extern int nr_threads;
+ DECLARE_PER_CPU(unsigned long, process_counts);
+ extern int nr_processes(void);
+ extern unsigned long nr_running(void);
++extern unsigned long nr_sleeping(void);
++extern unsigned long nr_stopped(void);
+ extern unsigned long nr_uninterruptible(void);
+ extern unsigned long nr_active(void);
+ extern unsigned long nr_iowait(void);
+ extern unsigned long weighted_cpuload(const int cpu);
++extern atomic_t nr_dead;
++extern unsigned long nr_zombie;
++
++#ifdef CONFIG_VE
++struct ve_struct;
++extern unsigned long nr_running_ve(struct ve_struct *);
++extern unsigned long nr_iowait_ve(struct ve_struct *);
++extern unsigned long nr_uninterruptible_ve(struct ve_struct *);
++extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu);
++extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu);
++void ve_sched_attach(struct ve_struct *envid);
++#else
++#define nr_running_ve(ve)			0
++#define nr_iowait_ve(ve)			0
++#define nr_uninterruptible_ve(ve)		0
++#define ve_sched_get_idle_time(ve, cpu)		0
++#define ve_sched_get_iowait_time(ve, cpu)	0
++#endif
+ 
+ struct seq_file;
+ struct cfs_rq;
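
LOAD_INT()/LOAD_FRAC() lift the classic /proc/loadavg fixed-point formatting
into the header so per-VE code can reuse it. The usual idiom:

        printk(KERN_INFO "load: %lu.%02lu\n",
               LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]));
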
+@@ -269,6 +298,7 @@ static inline void show_state(void)
+ }
+ 
+ extern void show_regs(struct pt_regs *);
++extern void smp_show_regs(struct pt_regs *, void *);
+ 
+ /*
+  * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+@@ -423,6 +453,9 @@ struct pacct_struct {
+ 	unsigned long		ac_minflt, ac_majflt;
+ };
+ 
++#include <linux/ve.h>
++#include <linux/ve_task.h>
++
+ /*
+  * NOTE! "signal_struct" does not have it's own
+  * locking, because a shared signal_struct always
+@@ -1092,6 +1125,7 @@ struct task_struct {
+ 	/* ??? */
+ 	unsigned int personality;
+ 	unsigned did_exec:1;
++	unsigned did_ve_enter:1;
+ 	pid_t pid;
+ 	pid_t tgid;
+ 
+@@ -1289,6 +1323,14 @@ struct task_struct {
+ 	struct rcu_head rcu;
+ 
+ 	/*
++	 * state tracking for suspend
++	 * FIXME - ptrace is completely rewritten in this kernel
++	 * so set_pn_state() is not set in many places correctyl
++	 * so set_pn_state() is not called correctly in many places
++	__u8	 pn_state;
++	__u8	 stopped_state:1;
++
++	/*
+ 	 * cache last used pipe for splice
+ 	 */
+ 	struct pipe_inode_info *splice_pipe;
+@@ -1303,6 +1345,19 @@ struct task_struct {
+ 	int latency_record_count;
+ 	struct latency_record latency_record[LT_SAVECOUNT];
+ #endif
++#ifdef CONFIG_BEANCOUNTERS
++	struct task_beancounter task_bc;
++#endif
++#ifdef CONFIG_VE
++	struct ve_task_info ve_task_info;
++#endif
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++	unsigned long	magic;
++	struct inode	*ino;
++#endif
++#ifdef CONFIG_VZ_FAIRSCHED
++	struct fairsched_node *fsched_node;
++#endif
+ };
+ 
+ /*
+@@ -1477,6 +1532,43 @@ static inline void put_task_struct(struct task_struct *t)
+ 		__put_task_struct(t);
+ }
+ 
++#ifndef CONFIG_VE
++#define set_pn_state(tsk, state)	do { } while(0)
++#define clear_pn_state(tsk)		do { } while(0)
++#define set_stop_state(tsk)		do { } while(0)
++#define clear_stop_state(tsk)		do { } while(0)
++#else
++#define PN_STOP_TF	1	/* was not in 2.6.8 */
++#define PN_STOP_TF_RT	2	/* was not in 2.6.8 */ 
++#define PN_STOP_ENTRY	3
++#define PN_STOP_FORK	4
++#define PN_STOP_VFORK	5
++#define PN_STOP_SIGNAL	6
++#define PN_STOP_EXIT	7
++#define PN_STOP_EXEC	8
++#define PN_STOP_LEAVE	9
++
++static inline void set_pn_state(struct task_struct *tsk, int state)
++{
++	tsk->pn_state = state;
++}
++
++static inline void clear_pn_state(struct task_struct *tsk)
++{
++	tsk->pn_state = 0;
++}
++
++static inline void set_stop_state(struct task_struct *tsk)
++{
++	tsk->stopped_state = 1;
++}
++
++static inline void clear_stop_state(struct task_struct *tsk)
++{
++	tsk->stopped_state = 0;
++}
++#endif
++
+ /*
+  * Per process flags
+  */
+@@ -1493,6 +1585,7 @@ static inline void put_task_struct(struct task_struct *t)
+ #define PF_MEMALLOC	0x00000800	/* Allocating memory */
+ #define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
+ #define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
++#define PF_EXIT_RESTART	0x00004000	/* do_exit() restarted, see do_exit() */
+ #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
+ #define PF_FROZEN	0x00010000	/* frozen for system suspend */
+ #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
+@@ -1590,6 +1683,21 @@ extern unsigned long long cpu_clock(int cpu);
+ extern unsigned long long
+ task_sched_runtime(struct task_struct *task);
+ 
++static inline unsigned long cycles_to_clocks(cycles_t cycles)
++{
++	extern unsigned long cycles_per_clock;
++	do_div(cycles, cycles_per_clock);
++	return cycles;
++}
++
++static inline u64 cycles_to_jiffies(cycles_t cycles)
++{
++	extern unsigned long cycles_per_jiffy;
++	do_div(cycles, cycles_per_jiffy);
++	return cycles;
++}
++
++
+ /* sched_exec is called by processes performing an exec */
+ #ifdef CONFIG_SMP
+ extern void sched_exec(void);
+@@ -1727,6 +1835,7 @@ static inline struct user_struct *get_uid(struct user_struct *u)
+ extern void free_uid(struct user_struct *);
+ extern void switch_uid(struct user_struct *);
+ extern void release_uids(struct user_namespace *ns);
++extern int set_user(uid_t uid, int dumpclear);
+ 
+ #include <asm/current.h>
+ 
+@@ -1859,6 +1968,13 @@ extern int disallow_signal(int);
+ 
+ extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
+ extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
++extern long do_fork_pid(unsigned long clone_flags,
++			unsigned long stack_start,
++			struct pt_regs *regs,
++			unsigned long stack_size,
++			int __user *parent_tidptr,
++			int __user *child_tidptr,
++			long pid0);
+ struct task_struct *fork_idle(int);
+ 
+ extern void set_task_comm(struct task_struct *tsk, char *from);
+@@ -1873,19 +1989,19 @@ extern void wait_task_inactive(struct task_struct * p);
+ #define remove_parent(p)	list_del_init(&(p)->sibling)
+ #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
+ 
+-#define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
++#define next_task_all(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+ 
+-#define for_each_process(p) \
+-	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
++#define for_each_process_all(p) \
++	for (p = &init_task ; (p = next_task_all(p)) != &init_task ; )
+ 
+ /*
+  * Careful: do_each_thread/while_each_thread is a double loop so
+  *          'break' will not work as expected - use goto instead.
+  */
+-#define do_each_thread(g, t) \
+-	for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
++#define do_each_thread_all(g, t) \
++	for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do
+ 
+-#define while_each_thread(g, t) \
++#define while_each_thread_all(g, t) \
+ 	while ((t = next_thread(t)) != g)
+ 
+ /* de_thread depends on thread_group_leader not being a pid based check */
+@@ -1910,8 +2026,15 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
+ 
+ static inline struct task_struct *next_thread(const struct task_struct *p)
+ {
+-	return list_entry(rcu_dereference(p->thread_group.next),
++	struct task_struct *tsk;
++
++	tsk = list_entry(rcu_dereference(p->thread_group.next),
+ 			  struct task_struct, thread_group);
++#ifdef CONFIG_VE
++	/* all threads should belong to ONE ve! */
++	BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env);
++#endif
++	return tsk;
+ }
+ 
+ static inline int thread_group_empty(struct task_struct *p)
+@@ -1951,6 +2074,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
+ 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+ }
+ 
++#ifndef CONFIG_VE
++
++#define for_each_process_ve(p)		for_each_process_all(p)
++#define do_each_thread_ve(g, t)		do_each_thread_all(g, t)
++#define while_each_thread_ve(g, t)	while_each_thread_all(g, t)
++#define first_task_ve()			next_task_ve(&init_task)
++#define __first_task_ve(owner)		next_task_ve(&init_task)
++#define __next_task_ve(owner, p)	next_task_ve(p)
++#define next_task_ve(p)			\
++	(next_task_all(p) != &init_task ? next_task_all(p) : NULL)
++
++#define ve_is_super(env)				1
++#define ve_accessible(target, owner)			1
++#define ve_accessible_strict(target, owner)		1
++#define ve_accessible_veid(target, owner)		1
++#define ve_accessible_strict_veid(target, owner)	1
++
++#define VEID(ve)					0
++
++#else	/* CONFIG_VE */
++
++#include <linux/ve.h>
++
++#define ve_is_super(env)			((env) == get_ve0())
++
++#define ve_accessible_strict(target, owner)	((target) == (owner))
++static inline int ve_accessible(struct ve_struct *target,
++		struct ve_struct *owner)
++{
++	return ve_is_super(owner) || ve_accessible_strict(target, owner);
++}
++
++#define ve_accessible_strict_veid(target, owner) ((target) == (owner))
++static inline int ve_accessible_veid(envid_t target, envid_t owner)
++{
++	return get_ve0()->veid == owner ||
++		ve_accessible_strict_veid(target, owner);
++}
++
++#define VEID(ve)	(ve->veid)
++
++static inline struct task_struct *ve_lh2task(struct ve_struct *ve,
++		struct list_head *lh)
++{
++	return lh == &ve->vetask_lh ? NULL :
++		list_entry(lh, struct task_struct, ve_task_info.vetask_list);
++}
++
++static inline struct task_struct *__first_task_ve(struct ve_struct *ve)
++{
++	struct task_struct *tsk;
++
++	if (unlikely(ve_is_super(ve))) {
++		tsk = next_task_all(&init_task);
++		if (tsk == &init_task)
++			tsk = NULL;
++	} else {
++		tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next));
++	}
++	return tsk;
++}
++
++static inline struct task_struct *__next_task_ve(struct ve_struct *ve,
++		struct task_struct *tsk)
++{
++	if (unlikely(ve_is_super(ve))) {
++		tsk = next_task_all(tsk);
++		if (tsk == &init_task)
++			tsk = NULL;
++	} else {
++		BUG_ON(tsk->ve_task_info.owner_env != ve);
++		tsk = ve_lh2task(ve, rcu_dereference(tsk->
++					ve_task_info.vetask_list.next));
++	}
++	return tsk;
++}
++
++#define first_task_ve()	__first_task_ve(get_exec_env())
++#define next_task_ve(p)	__next_task_ve(get_exec_env(), p)
++/* no one uses prev_task_ve(), copy next_task_ve() if needed */
++
++#define for_each_process_ve(p) \
++	for (p = first_task_ve(); p != NULL ; p = next_task_ve(p))
++
++#define do_each_thread_ve(g, t) \
++	for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do
++
++#define while_each_thread_ve(g, t) \
++	while ((t = next_thread(t)) != g)
++
++#endif	/* CONFIG_VE */
++
+ #ifndef __HAVE_THREAD_FUNCTIONS
+ 
+ #define task_thread_info(task)	((struct thread_info *)(task)->stack)
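
With the _all/_ve split above, code that must not look outside the caller's
container iterates with the _ve variants; in VE0 they degrade to the global
walk. A sketch using the classic tasklist_lock read side:

        struct task_struct *p;

        read_lock(&tasklist_lock);
        for_each_process_ve(p)
                printk(KERN_INFO "%d %s\n", p->pid, p->comm);
        read_unlock(&tasklist_lock);
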
+diff --git a/include/linux/sem.h b/include/linux/sem.h
+index c8eaad9..380e1d1 100644
+--- a/include/linux/sem.h
++++ b/include/linux/sem.h
+@@ -154,6 +154,9 @@ static inline void exit_sem(struct task_struct *tsk)
+ }
+ #endif
+ 
++int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg);
++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg);
++
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_SEM_H */
+diff --git a/include/linux/shm.h b/include/linux/shm.h
+index eca6235..c2b3bb5 100644
+--- a/include/linux/shm.h
++++ b/include/linux/shm.h
+@@ -83,6 +83,22 @@ struct shm_info {
+ };
+ 
+ #ifdef __KERNEL__
++
++#include <linux/ipc_namespace.h>
++
++#define IPC_SEM_IDS	0
++#define IPC_MSG_IDS	1
++#define IPC_SHM_IDS	2
++
++struct shm_file_data {
++	int id;
++	struct ipc_namespace *ns;
++	struct file *file;
++	const struct vm_operations_struct *vm_ops;
++};
++#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
++#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
++
+ struct shmid_kernel /* private to the kernel */
+ {	
+ 	struct kern_ipc_perm	shm_perm;
+@@ -97,6 +113,23 @@ struct shmid_kernel /* private to the kernel */
+ 	struct user_struct	*mlock_user;
+ };
+ 
++/*
++ * shm_lock_(check_) routines are called in the paths where the rw_mutex
++ * is not held.
++ */
++static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
++{
++	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
++
++	if (IS_ERR(ipcp))
++		return (struct shmid_kernel *)ipcp;
++
++	return container_of(ipcp, struct shmid_kernel, shm_perm);
++}
++
++#define shm_unlock(shp)			\
++	ipc_unlock(&(shp)->shm_perm)
++
+ /* shm_mode upper byte flags */
+ #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
+ #define SHM_LOCKED      02000   /* segment will not be swapped */
+@@ -118,6 +151,12 @@ static inline int is_file_shm_hugepages(struct file *file)
+ }
+ #endif
+ 
++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg);
++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg);
++extern const struct file_operations shmem_file_operations;
++extern const struct file_operations shm_file_operations;
++
++extern struct file_system_type tmpfs_fs_type;
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_SHM_H_ */
+diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
+index f2d12d5..c4d6482 100644
+--- a/include/linux/shmem_fs.h
++++ b/include/linux/shmem_fs.h
+@@ -23,6 +23,9 @@ struct shmem_inode_info {
+ 	struct posix_acl	*i_acl;
+ 	struct posix_acl	*i_default_acl;
+ #endif
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter	*shmi_ub;
++#endif
+ };
+ 
+ struct shmem_sb_info {
+@@ -62,4 +65,7 @@ static inline void shmem_acl_destroy_inode(struct inode *inode)
+ }
+ #endif  /* CONFIG_TMPFS_POSIX_ACL */
+ 
++int shmem_insertpage(struct inode * inode, unsigned long index,
++		     swp_entry_t swap);
++
+ #endif
+diff --git a/include/linux/signal.h b/include/linux/signal.h
+index 84f997f..5adb84b 100644
+--- a/include/linux/signal.h
++++ b/include/linux/signal.h
+@@ -6,6 +6,8 @@
+ 
+ #ifdef __KERNEL__
+ #include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/slab.h>
+ 
+ /*
+  * Real Time signals may be queued.
+@@ -16,6 +18,9 @@ struct sigqueue {
+ 	int flags;
+ 	siginfo_t info;
+ 	struct user_struct *user;
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *sig_ub;
++#endif
+ };
+ 
+ /* flags values. */
+@@ -372,6 +377,8 @@ int unhandled_signal(struct task_struct *tsk, int sig);
+ 
+ void signals_init(void);
+ 
++extern struct kmem_cache *sigqueue_cachep;
++
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_SIGNAL_H */
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index 299ec4b..a05f088 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -248,6 +248,8 @@ typedef unsigned char *sk_buff_data_t;
+  *	@secmark: security marking
+  */
+ 
++#include <bc/sock.h>
++
+ struct sk_buff {
+ 	/* These two members must be first. */
+ 	struct sk_buff		*next;
+@@ -294,7 +296,13 @@ struct sk_buff {
+ 				peeked:1,
+ 				nf_trace:1;
+ 	__be16			protocol;
+-
++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
++	__u8			brmark;
++#endif
++#ifdef CONFIG_VE
++	unsigned int		accounted:1;
++	unsigned int		redirected:1;
++#endif
+ 	void			(*destructor)(struct sk_buff *skb);
+ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ 	struct nf_conntrack	*nfct;
+@@ -338,6 +346,8 @@ struct sk_buff {
+ 				*data;
+ 	unsigned int		truesize;
+ 	atomic_t		users;
++	struct skb_beancounter	skb_bc;
++	struct ve_struct	*owner_env;
+ };
+ 
+ #ifdef __KERNEL__
+@@ -345,6 +355,7 @@ struct sk_buff {
+  *	Handling routines are only of interest to the kernel
+  */
+ #include <linux/slab.h>
++#include <bc/net.h>
+ 
+ #include <asm/system.h>
+ 
+@@ -1171,6 +1182,8 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
+  */
+ static inline void skb_orphan(struct sk_buff *skb)
+ {
++	ub_skb_uncharge(skb);
++
+ 	if (skb->destructor)
+ 		skb->destructor(skb);
+ 	skb->destructor = NULL;
+@@ -1669,6 +1682,26 @@ static inline void skb_init_secmark(struct sk_buff *skb)
+ { }
+ #endif
+ 
++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
++static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
++{
++	to->brmark = from->brmark;
++}
++
++static inline void skb_init_brmark(struct sk_buff *skb)
++{
++	skb->brmark = 0;
++}
++#else
++static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
++{
++}
++
++static inline void skb_init_brmark(struct sk_buff *skb)
++{
++}
++#endif
++
+ static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
+ {
+ #ifdef CONFIG_NETDEVICES_MULTIQUEUE
+diff --git a/include/linux/slab.h b/include/linux/slab.h
+index 9aa90a6..588eea8 100644
+--- a/include/linux/slab.h
++++ b/include/linux/slab.h
+@@ -51,6 +51,26 @@
+ 				(unsigned long)ZERO_SIZE_PTR)
+ 
+ /*
++ * allocation rules:                            __GFP_UBC       0
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *  cache (SLAB_UBC)				charge		charge
++ *				      (usual caches: mm, vma, task_struct, ...)
++ *
++ *  cache (SLAB_UBC | SLAB_NO_CHARGE)		charge		---
++ *					     (ub_kmalloc)    (kmalloc)
++ *
++ *  cache (no UB flags)				BUG()		---
++ *							(nonub caches, mempools)
++ *
++ *  pages					charge		---
++ *					   (ub_vmalloc,	      (vmalloc,
++ *				        poll, fdsets, ...)  non-ub allocs)
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++#define SLAB_UBC		0x10000000UL	/* alloc space for ubs ... */
++#define SLAB_NO_CHARGE		0x20000000UL	/* ... but don't charge */
++
++/*
+  * struct kmem_cache related prototypes
+  */
+ void __init kmem_cache_init(void);
+@@ -65,7 +85,20 @@ void kmem_cache_free(struct kmem_cache *, void *);
+ unsigned int kmem_cache_size(struct kmem_cache *);
+ const char *kmem_cache_name(struct kmem_cache *);
+ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
++extern void show_slab_info(void);
++int kmem_cache_objuse(struct kmem_cache *cachep);
++int kmem_obj_objuse(void *obj);
++int kmem_dname_objuse(void *obj);
++unsigned long ub_cache_growth(struct kmem_cache *cachep);
+ 
++#ifdef CONFIG_BEANCOUNTERS
++void kmem_mark_nocharge(struct kmem_cache *cachep);
++struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj);
++struct user_beancounter *slab_ub(void *obj);
++#else
++static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { }
++static inline struct user_beancounter *slab_ub(void *obj) { return NULL; }
++#endif
+ /*
+  * Please use this macro to create slab caches. Simply specify the
+  * name of the structure and maybe some flags that are listed above.
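
Per the table above, the common pattern is SLAB_UBC | SLAB_NO_CHARGE: the
cache is able to account to beancounters, but only allocations passing
__GFP_UBC are actually charged. A sketch; the object type is illustrative:

        struct kmem_cache *c;
        void *obj1, *obj2;

        c = kmem_cache_create("my_objs", sizeof(struct my_obj), 0,
                              SLAB_UBC | SLAB_NO_CHARGE, NULL);
        obj1 = kmem_cache_alloc(c, GFP_KERNEL | __GFP_UBC);     /* charged */
        obj2 = kmem_cache_alloc(c, GFP_KERNEL);                 /* not charged */
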
+diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
+index 39c3a5e..6be00b2 100644
+--- a/include/linux/slab_def.h
++++ b/include/linux/slab_def.h
+@@ -15,6 +15,111 @@
+ #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
+ #include <linux/compiler.h>
+ 
++/*
++ * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
++ *		  0 for faster, smaller code (especially in the critical paths).
++ *
++ * STATS	- 1 to collect stats for /proc/slabinfo.
++ *		  0 for faster, smaller code (especially in the critical paths).
++ *
++ * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
++ */
++
++#ifdef CONFIG_DEBUG_SLAB
++#define	SLAB_DEBUG		1
++#define	SLAB_STATS		1
++#define SLAB_FORCED_DEBUG	1
++#else
++#define	SLAB_DEBUG		0
++#define	SLAB_STATS		0
++#define SLAB_FORCED_DEBUG	0
++#endif
++
++/*
++ * struct kmem_cache
++ *
++ * manages a cache.
++ */
++
++struct kmem_cache {
++/* 1) per-cpu data, touched during every alloc/free */
++	struct array_cache *array[NR_CPUS];
++/* 2) Cache tunables. Protected by cache_chain_mutex */
++	unsigned int batchcount;
++	unsigned int limit;
++	unsigned int shared;
++
++	unsigned int buffer_size;
++	u32 reciprocal_buffer_size;
++/* 3) touched by every alloc & free from the backend */
++
++	unsigned int flags;		/* constant flags */
++	unsigned int num;		/* # of objs per slab */
++
++/* 4) cache_grow/shrink */
++	/* order of pgs per slab (2^n) */
++	unsigned int gfporder;
++
++	/* force GFP flags, e.g. GFP_DMA */
++	gfp_t gfpflags;
++
++	size_t colour;			/* cache colouring range */
++	unsigned int colour_off;	/* colour offset */
++	struct kmem_cache *slabp_cache;
++	unsigned int slab_size;
++	unsigned int dflags;		/* dynamic flags */
++
++	/* constructor func */
++	void (*ctor) (struct kmem_cache *, void *);
++
++/* 5) cache creation/removal */
++	const char *name;
++	struct list_head next;
++
++/* 6) statistics */
++	unsigned long grown;
++	unsigned long reaped;
++	unsigned long shrunk;
++#if SLAB_STATS
++	unsigned long num_active;
++	unsigned long num_allocations;
++	unsigned long high_mark;
++	unsigned long errors;
++	unsigned long max_freeable;
++	unsigned long node_allocs;
++	unsigned long node_frees;
++	unsigned long node_overflow;
++	atomic_t allochit;
++	atomic_t allocmiss;
++	atomic_t freehit;
++	atomic_t freemiss;
++#endif
++#if SLAB_DEBUG
++	/*
++	 * If debugging is enabled, then the allocator can add additional
++	 * fields and/or padding to every object. buffer_size contains the total
++	 * object size including these internal fields, the following two
++	 * variables contain the offset to the user object and its size.
++	 */
++	int obj_offset;
++	int obj_size;
++#endif
++#ifdef CONFIG_BEANCOUNTERS
++	int objuse;
++#endif
++	/*
++	 * We put nodelists[] at the end of kmem_cache, because we want to size
++	 * this array to nr_node_ids slots instead of MAX_NUMNODES
++	 * (see kmem_cache_init())
++	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
++	 * is statically defined, so we reserve the max number of nodes.
++	 */
++	struct kmem_list3 *nodelists[MAX_NUMNODES];
++	/*
++	 * Do not add fields after nodelists[]
++	 */
++};
++
+ /* Size description struct for general caches. */
+ struct cache_sizes {
+ 	size_t		 	cs_size;
+@@ -24,6 +129,7 @@ struct cache_sizes {
+ #endif
+ };
+ extern struct cache_sizes malloc_sizes[];
++extern int malloc_cache_num;
+ 
+ void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+ void *__kmalloc(size_t size, gfp_t flags);
+@@ -48,6 +154,8 @@ static inline void *kmalloc(size_t size, gfp_t flags)
+ 			__you_cannot_kmalloc_that_much();
+ 		}
+ found:
++		if (flags & __GFP_UBC)
++			i += malloc_cache_num;
+ #ifdef CONFIG_ZONE_DMA
+ 		if (flags & GFP_DMA)
+ 			return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep,
+diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
+index d117ea2..c6187b4 100644
+--- a/include/linux/slub_def.h
++++ b/include/linux/slub_def.h
+@@ -94,6 +94,10 @@ struct kmem_cache {
+ 	struct kobject kobj;	/* For sysfs */
+ #endif
+ 
++#ifdef CONFIG_BEANCOUNTERS
++	atomic_t grown;
++	int objuse;
++#endif
+ #ifdef CONFIG_NUMA
+ 	/*
+ 	 * Defragmentation by allocating from a remote node.
+@@ -125,6 +129,19 @@ struct kmem_cache {
+  */
+ extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1];
+ 
++#ifdef CONFIG_BEANCOUNTERS
++extern struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
++static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx)
++{
++	return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx];
++}
++#else
++static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx)
++{
++	return &kmalloc_caches[idx];
++}
++#endif
++
+ /*
+  * Sorry that the following has to be that ugly but some versions of GCC
+  * have trouble with constant propagation and loops.
+@@ -183,14 +200,14 @@ static __always_inline int kmalloc_index(size_t size)
+  * This ought to end up with a global pointer to the right cache
+  * in kmalloc_caches.
+  */
+-static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
++static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+ {
+ 	int index = kmalloc_index(size);
+ 
+ 	if (index == 0)
+ 		return NULL;
+ 
+-	return &kmalloc_caches[index];
++	return __kmalloc_cache(flags, index);
+ }
+ 
+ #ifdef CONFIG_ZONE_DMA
+@@ -215,7 +232,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
+ 			return kmalloc_large(size, flags);
+ 
+ 		if (!(flags & SLUB_DMA)) {
+-			struct kmem_cache *s = kmalloc_slab(size);
++			struct kmem_cache *s = kmalloc_slab(size, flags);
+ 
+ 			if (!s)
+ 				return ZERO_SIZE_PTR;
+@@ -234,7 +251,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+ {
+ 	if (__builtin_constant_p(size) &&
+ 		size <= PAGE_SIZE && !(flags & SLUB_DMA)) {
+-			struct kmem_cache *s = kmalloc_slab(size);
++			struct kmem_cache *s = kmalloc_slab(size, flags);
+ 
+ 		if (!s)
+ 			return ZERO_SIZE_PTR;
+diff --git a/include/linux/smp.h b/include/linux/smp.h
+index 55232cc..8491af0 100644
+--- a/include/linux/smp.h
++++ b/include/linux/smp.h
+@@ -10,6 +10,9 @@
+ 
+ extern void cpu_idle(void);
+ 
++struct pt_regs;
++typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info);
++
+ #ifdef CONFIG_SMP
+ 
+ #include <linux/preempt.h>
+@@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum);
+  */
+ extern void smp_cpus_done(unsigned int max_cpus);
+ 
++extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait);
++
+ /*
+  * Call a function on all other processors
+  */
+@@ -113,6 +118,12 @@ static inline void smp_send_reschedule(int cpu) { }
+ #define smp_call_function_mask(mask, func, info, wait) \
+ 			(up_smp_call_function(func, info))
+ 
++static inline int smp_nmi_call_function(smp_nmi_function func,
++					 void *info, int wait)
++{
++	return 0;
++}
++
+ #endif /* !SMP */
+ 
+ /*
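
smp_nmi_call_function() is a new cross-call primitive that reaches the other CPUs via NMI, useful when a target CPU may have normal interrupts disabled; on UP kernels it collapses to a no-op returning 0. A kernel-context sketch of a caller, illustrative only (the handler runs in NMI context, so it must stay lock-free and minimal):

static atomic_t cpus_seen = ATOMIC_INIT(0);

static void nmi_roll_call_handler(struct pt_regs *regs, void *info)
{
	atomic_inc((atomic_t *)info);	/* runs on every other CPU, in NMI */
}

static void nmi_roll_call(void)
{
	atomic_set(&cpus_seen, 0);
	/* wait != 0: return only after all CPUs ran the handler */
	smp_nmi_call_function(nmi_roll_call_handler, &cpus_seen, 1);
	printk(KERN_INFO "NMI roll call: %d CPUs responded\n",
			atomic_read(&cpus_seen));
}
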
+diff --git a/include/linux/socket.h b/include/linux/socket.h
+index bd2b30a..675ee51 100644
+--- a/include/linux/socket.h
++++ b/include/linux/socket.h
+@@ -298,6 +298,16 @@ struct ucred {
+ #define IPX_TYPE	1
+ 
+ #ifdef __KERNEL__
++
++#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
++					   16 for IP, 16 for IPX,
++					   24 for IPv6,
++					   about 80 for AX.25
++					   must be at least one bigger than
++					   the AF_UNIX size (see net/unix/af_unix.c
++					   :unix_mkname()).
++					 */
++
+ extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
+ extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, 
+ 				int offset, int len);
+@@ -311,6 +321,8 @@ extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
+ extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen);
+ extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr);
+ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
++extern int vz_security_family_check(int family);
++extern int vz_security_protocol_check(int protocol);
+ 
+ #endif
+ #endif /* not kernel and not glibc */
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 0b33776..7e449b8 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -18,6 +18,7 @@ struct bio;
+ #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
+ #define SWAP_FLAG_PRIO_MASK	0x7fff
+ #define SWAP_FLAG_PRIO_SHIFT	0
++#define SWAP_FLAG_READONLY	0x40000000      /* set if swap is read-only */
+ 
+ static inline int current_is_kswapd(void)
+ {
+@@ -93,6 +94,7 @@ struct address_space;
+ struct sysinfo;
+ struct writeback_control;
+ struct zone;
++struct user_beancounter;
+ 
+ /*
+  * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
+@@ -122,6 +124,7 @@ enum {
+ 	SWP_ACTIVE	= (SWP_USED | SWP_WRITEOK),
+ 					/* add others here before... */
+ 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
++	SWP_READONLY	= (1 << 2),
+ };
+ 
+ #define SWAP_CLUSTER_MAX 32
+@@ -132,6 +135,7 @@ enum {
+ /*
+  * The in-memory structure used to track swap areas.
+  */
++struct user_beancounter;
+ struct swap_info_struct {
+ 	unsigned int flags;
+ 	int prio;			/* swap priority */
+@@ -149,6 +153,9 @@ struct swap_info_struct {
+ 	unsigned int max;
+ 	unsigned int inuse_pages;
+ 	int next;			/* next entry on swap list */
++#ifdef CONFIG_BC_SWAP_ACCOUNTING
++	struct user_beancounter **swap_ubs;
++#endif
+ };
+ 
+ struct swap_list_t {
+@@ -156,9 +163,21 @@ struct swap_list_t {
+ 	int next;	/* swapfile to be used next */
+ };
+ 
++extern struct swap_list_t swap_list;
++extern struct swap_info_struct swap_info[MAX_SWAPFILES];
++
+ /* Swap 50% full? Release swapcache more aggressively.. */
+ #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
+ 
++/* linux/mm/oom_kill.c */
++extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
++extern int register_oom_notifier(struct notifier_block *nb);
++extern int unregister_oom_notifier(struct notifier_block *nb);
++extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
++			    struct mem_cgroup *mem, const char *message);
++extern struct task_struct *select_bad_process(struct user_beancounter *ub,
++		struct mem_cgroup *memcg);
++
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+@@ -226,6 +245,8 @@ extern void show_swap_cache_info(void);
+ extern int add_to_swap(struct page *, gfp_t);
+ extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
+ extern void __delete_from_swap_cache(struct page *);
++extern int __add_to_swap_cache(struct page *page,
++			       swp_entry_t entry, gfp_t gfp_mask);
+ extern void delete_from_swap_cache(struct page *);
+ extern void free_page_and_swap_cache(struct page *);
+ extern void free_pages_and_swap_cache(struct page **, int);
+@@ -239,7 +260,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
+ extern long total_swap_pages;
+ extern unsigned int nr_swapfiles;
+ extern void si_swapinfo(struct sysinfo *);
+-extern swp_entry_t get_swap_page(void);
++extern swp_entry_t get_swap_page(struct user_beancounter *);
+ extern swp_entry_t get_swap_page_of_type(int);
+ extern int swap_duplicate(swp_entry_t);
+ extern int valid_swaphandles(swp_entry_t, unsigned long *);
+@@ -252,6 +273,7 @@ extern sector_t swapdev_block(int, pgoff_t);
+ extern struct swap_info_struct *get_swap_info_struct(unsigned);
+ extern int can_share_swap_page(struct page *);
+ extern int remove_exclusive_swap_page(struct page *);
++extern int try_to_remove_exclusive_swap_page(struct page *);
+ struct backing_dev_info;
+ 
+ extern spinlock_t swap_lock;
+@@ -342,7 +364,7 @@ static inline int remove_exclusive_swap_page(struct page *p)
+ 	return 0;
+ }
+ 
+-static inline swp_entry_t get_swap_page(void)
++static inline swp_entry_t get_swap_page(struct user_beancounter *ub)
+ {
+ 	swp_entry_t entry;
+ 	entry.val = 0;
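
Threading a user_beancounter through get_swap_page() is what makes swap usage attributable per container: under CONFIG_BC_SWAP_ACCOUNTING each swap_info_struct gains a swap_ubs array with one owner pointer per slot. A hedged sketch of the bookkeeping this enables (the helper and the indexing are illustrative, not the patch's actual code):

#ifdef CONFIG_BC_SWAP_ACCOUNTING
/* Record which beancounter owns a just-allocated swap slot. */
static void swap_slot_set_owner(swp_entry_t entry, struct user_beancounter *ub)
{
	struct swap_info_struct *si;

	si = get_swap_info_struct(swp_type(entry));
	si->swap_ubs[swp_offset(entry)] = ub;
}
#endif
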
+diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
+index 24141b4..7caace5 100644
+--- a/include/linux/sysctl.h
++++ b/include/linux/sysctl.h
+@@ -1081,10 +1081,15 @@ struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
+ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ 						struct ctl_table *table);
++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int);
++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *,
++						struct ctl_table *, int);
+ 
+ void unregister_sysctl_table(struct ctl_table_header * table);
+ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table);
+ 
++extern int ve_allow_kthreads;
++
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_SYSCTL_H */
+diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
+index 7858eac..3887cd6 100644
+--- a/include/linux/sysfs.h
++++ b/include/linux/sysfs.h
+@@ -19,6 +19,7 @@
+ 
+ struct kobject;
+ struct module;
++struct sysfs_open_dirent;
+ 
+ /* FIXME
+  * The *owner field is no longer used, but leave around
+@@ -78,6 +79,66 @@ struct sysfs_ops {
+ 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
+ };
+ 
++/* type-specific structures for sysfs_dirent->s_* union members */
++struct sysfs_elem_dir {
++	struct kobject		*kobj;
++	/* children list starts here and goes through sd->s_sibling */
++	struct sysfs_dirent	*children;
++};
++
++struct sysfs_elem_symlink {
++	struct sysfs_dirent	*target_sd;
++};
++
++struct sysfs_elem_attr {
++	struct attribute	*attr;
++	struct sysfs_open_dirent *open;
++};
++
++struct sysfs_elem_bin_attr {
++	struct bin_attribute	*bin_attr;
++};
++
++/*
++ * sysfs_dirent - the building block of sysfs hierarchy.  Each and
++ * every sysfs node is represented by a single sysfs_dirent.
++ *
++ * As long as s_count reference is held, the sysfs_dirent itself is
++ * accessible.  Dereferencing s_elem or any other outer entity
++ * requires s_active reference.
++ */
++struct sysfs_dirent {
++	atomic_t		s_count;
++	atomic_t		s_active;
++	struct sysfs_dirent	*s_parent;
++	struct sysfs_dirent	*s_sibling;
++	const char		*s_name;
++
++	union {
++		struct sysfs_elem_dir		s_dir;
++		struct sysfs_elem_symlink	s_symlink;
++		struct sysfs_elem_attr		s_attr;
++		struct sysfs_elem_bin_attr	s_bin_attr;
++	};
++
++	unsigned int		s_flags;
++	ino_t			s_ino;
++	umode_t			s_mode;
++	struct iattr		*s_iattr;
++};
++
++#define SD_DEACTIVATED_BIAS		INT_MIN
++
++#define SYSFS_TYPE_MASK			0x00ff
++#define SYSFS_DIR			0x0001
++#define SYSFS_KOBJ_ATTR			0x0002
++#define SYSFS_KOBJ_BIN_ATTR		0x0004
++#define SYSFS_KOBJ_LINK			0x0008
++#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
++
++#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
++#define SYSFS_FLAG_REMOVED		0x0200
++
+ #ifdef CONFIG_SYSFS
+ 
+ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
+@@ -118,6 +179,8 @@ void sysfs_notify(struct kobject *kobj, char *dir, char *attr);
+ 
+ extern int __must_check sysfs_init(void);
+ 
++extern struct file_system_type sysfs_fs_type;
++
+ #else /* CONFIG_SYSFS */
+ 
+ static inline int sysfs_schedule_callback(struct kobject *kobj,
+diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
+index ff46c6f..205f82e 100644
+--- a/include/linux/task_io_accounting_ops.h
++++ b/include/linux/task_io_accounting_ops.h
+@@ -5,10 +5,12 @@
+ #define __TASK_IO_ACCOUNTING_OPS_INCLUDED
+ 
+ #include <linux/sched.h>
++#include <bc/io_acct.h>
+ 
+ #ifdef CONFIG_TASK_IO_ACCOUNTING
+ static inline void task_io_account_read(size_t bytes)
+ {
++	ub_io_account_read(bytes);
+ 	current->ioac.read_bytes += bytes;
+ }
+ 
+@@ -21,8 +23,14 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
+ 	return p->ioac.read_bytes >> 9;
+ }
+ 
+-static inline void task_io_account_write(size_t bytes)
++static inline void task_io_account_write(struct page *page, size_t bytes,
++		int sync)
+ {
++	if (sync)
++		ub_io_account_write(bytes);
++	else
++		ub_io_account_dirty(page, bytes);
++
+ 	current->ioac.write_bytes += bytes;
+ }
+ 
+@@ -37,6 +45,7 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p)
+ 
+ static inline void task_io_account_cancelled_write(size_t bytes)
+ {
++	ub_io_account_write_cancelled(bytes);
+ 	current->ioac.cancelled_write_bytes += bytes;
+ }
+ 
+@@ -56,7 +65,8 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
+ 	return 0;
+ }
+ 
+-static inline void task_io_account_write(size_t bytes)
++static inline void task_io_account_write(struct page *page, size_t bytes,
++		int sync)
+ {
+ }
+ 
+diff --git a/include/linux/tty.h b/include/linux/tty.h
+index 324a3b2..fb4d996 100644
+--- a/include/linux/tty.h
++++ b/include/linux/tty.h
+@@ -241,6 +241,7 @@ struct tty_struct {
+ 	spinlock_t read_lock;
+ 	/* If the tty has a pending do_SAK, queue it here - akpm */
+ 	struct work_struct SAK_work;
++	struct ve_struct *owner_env;
+ };
+ 
+ /* tty magic number */
+@@ -270,6 +271,7 @@ struct tty_struct {
+ #define TTY_HUPPED 		18	/* Post driver->hangup() */
+ #define TTY_FLUSHING		19	/* Flushing to ldisc in progress */
+ #define TTY_FLUSHPENDING	20	/* Queued buffer flush pending */
++#define TTY_CHARGED		21	/* Charged as ub resource */
+ 
+ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
+ 
+diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
+index d2a0035..fe1f025 100644
+--- a/include/linux/tty_driver.h
++++ b/include/linux/tty_driver.h
+@@ -242,8 +242,19 @@ struct tty_driver {
+ 
+ 	const struct tty_operations *ops;
+ 	struct list_head tty_drivers;
++	struct ve_struct *owner_env;
+ };
+ 
++#ifdef CONFIG_UNIX98_PTYS
++extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
++extern struct tty_driver *pts_driver;	/* Unix98 pty slaves;  for /dev/ptmx */
++#endif
++
++#ifdef CONFIG_LEGACY_PTYS
++extern struct tty_driver *pty_driver;
++extern struct tty_driver *pty_slave_driver;
++#endif
++
+ extern struct list_head tty_drivers;
+ 
+ struct tty_driver *alloc_tty_driver(int lines);
+@@ -252,6 +263,9 @@ void tty_set_operations(struct tty_driver *driver,
+ 			const struct tty_operations *op);
+ extern struct tty_driver *tty_find_polling_driver(char *name, int *line);
+ 
++int init_ve_tty_class(void);
++void fini_ve_tty_class(void);
++
+ /* tty driver magic number */
+ #define TTY_DRIVER_MAGIC		0x5402
+ 
+diff --git a/include/linux/types.h b/include/linux/types.h
+index d4a9ce6..dcdaf75 100644
+--- a/include/linux/types.h
++++ b/include/linux/types.h
+@@ -29,6 +29,11 @@ typedef __kernel_timer_t	timer_t;
+ typedef __kernel_clockid_t	clockid_t;
+ typedef __kernel_mqd_t		mqd_t;
+ 
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
+ #ifdef __KERNEL__
+ typedef _Bool			bool;
+ 
+diff --git a/include/linux/utsname.h b/include/linux/utsname.h
+index 1123267..ec24d89 100644
+--- a/include/linux/utsname.h
++++ b/include/linux/utsname.h
+@@ -43,6 +43,7 @@ struct uts_namespace {
+ 	struct new_utsname name;
+ };
+ extern struct uts_namespace init_uts_ns;
++extern struct new_utsname virt_utsname;
+ 
+ #ifdef CONFIG_UTS_NS
+ static inline void get_uts_ns(struct uts_namespace *ns)
+diff --git a/include/linux/ve.h b/include/linux/ve.h
+new file mode 100644
+index 0000000..7025716
+--- /dev/null
++++ b/include/linux/ve.h
+@@ -0,0 +1,344 @@
++/*
++ *  include/linux/ve.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VE_H
++#define _LINUX_VE_H
++
++#include <linux/types.h>
++#include <linux/capability.h>
++#include <linux/sysctl.h>
++#include <linux/net.h>
++#include <linux/vzstat.h>
++#include <linux/kobject.h>
++#include <linux/pid.h>
++#include <linux/socket.h>
++#include <net/inet_frag.h>
++
++#ifdef VZMON_DEBUG
++#  define VZTRACE(fmt,args...) \
++	printk(KERN_DEBUG fmt, ##args)
++#else
++#  define VZTRACE(fmt,args...)
++#endif /* VZMON_DEBUG */
++
++struct tty_driver;
++struct devpts_config;
++struct task_struct;
++struct new_utsname;
++struct file_system_type;
++struct icmp_mib;
++struct ip_mib;
++struct tcp_mib;
++struct udp_mib;
++struct linux_mib;
++struct fib_info;
++struct fib_rule;
++struct veip_struct;
++struct ve_monitor;
++struct nsproxy;
++
++#if defined(CONFIG_VE) && defined(CONFIG_INET)
++struct fib_table;
++#ifdef CONFIG_VE_IPTABLES
++struct xt_table;
++struct nf_conn;
++
++#define FRAG6Q_HASHSZ   64
++
++struct ve_nf_conntrack {
++	struct hlist_head		*_bysource;
++	struct nf_nat_protocol		**_nf_nat_protos;
++	int				_nf_nat_vmalloced;
++	struct xt_table			*_nf_nat_table;
++	struct nf_conntrack_l3proto	*_nf_nat_l3proto;
++	atomic_t			_nf_conntrack_count;
++	int				_nf_conntrack_max;
++	struct hlist_head		*_nf_conntrack_hash;
++	int				_nf_conntrack_checksum;
++	int				_nf_conntrack_vmalloc;
++	struct hlist_head		_unconfirmed;
++	struct hlist_head		*_nf_ct_expect_hash;
++	unsigned int			_nf_ct_expect_vmalloc;
++	unsigned int			_nf_ct_expect_count;
++	unsigned int			_nf_ct_expect_max;
++	struct hlist_head		*_nf_ct_helper_hash;
++	unsigned int			_nf_ct_helper_vmalloc;
++#ifdef CONFIG_SYSCTL
++	/* l4 stuff: */
++	unsigned long			_nf_ct_icmp_timeout;
++	unsigned long			_nf_ct_icmpv6_timeout;
++	unsigned int			_nf_ct_udp_timeout;
++	unsigned int			_nf_ct_udp_timeout_stream;
++	unsigned int			_nf_ct_generic_timeout;
++	unsigned int			_nf_ct_log_invalid;
++	unsigned int			_nf_ct_tcp_timeout_max_retrans;
++	int				_nf_ct_tcp_be_liberal;
++	int				_nf_ct_tcp_loose;
++	int				_nf_ct_tcp_max_retrans;
++	unsigned int			_nf_ct_tcp_timeouts[10];
++	struct ctl_table_header		*_icmp_sysctl_header;
++	unsigned int			_tcp_sysctl_table_users;
++	struct ctl_table_header		*_tcp_sysctl_header;
++	unsigned int			_udp_sysctl_table_users;
++	struct ctl_table_header		*_udp_sysctl_header;
++	struct ctl_table_header		*_icmpv6_sysctl_header;
++	struct ctl_table_header		*_generic_sysctl_header;
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	struct ctl_table_header		*_icmp_compat_sysctl_header;
++	struct ctl_table_header		*_tcp_compat_sysctl_header;
++	struct ctl_table_header		*_udp_compat_sysctl_header;
++	struct ctl_table_header		*_generic_compat_sysctl_header;
++#endif
++	/* l4 protocols sysctl tables: */
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_icmp;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_tcp4;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_icmpv6;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_tcp6;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_udp4;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_udp6;
++	struct nf_conntrack_l4proto	*_nf_conntrack_l4proto_generic;
++	struct nf_conntrack_l4proto	**_nf_ct_protos[PF_MAX];
++	/* l3 protocols sysctl tables: */
++	struct nf_conntrack_l3proto	*_nf_conntrack_l3proto_ipv4;
++	struct nf_conntrack_l3proto	*_nf_conntrack_l3proto_ipv6;
++	struct nf_conntrack_l3proto	*_nf_ct_l3protos[AF_MAX];
++	/* sysctl standalone stuff: */
++	struct ctl_table_header		*_nf_ct_sysctl_header;
++	ctl_table			*_nf_ct_sysctl_table;
++	ctl_table			*_nf_ct_netfilter_table;
++	ctl_table			*_nf_ct_net_table;
++	ctl_table			*_ip_ct_netfilter_table;
++	struct ctl_table_header		*_ip_ct_sysctl_header;
++	int				_nf_ct_log_invalid_proto_min;
++	int				_nf_ct_log_invalid_proto_max;
++#endif /* CONFIG_SYSCTL */
++};
++#endif
++#endif
++
++struct ve_cpu_stats {
++	cycles_t	idle_time;
++	cycles_t	iowait_time;
++	cycles_t	strt_idle_time;
++	cycles_t	used_time;
++	seqcount_t	stat_lock;
++	int		nr_running;
++	int		nr_unint;
++	int		nr_iowait;
++	cputime64_t	user;
++	cputime64_t	nice;
++	cputime64_t	system;
++} ____cacheline_aligned;
++
++struct ve_ipt_recent;
++struct ve_xt_hashlimit;
++
++struct cgroup;
++struct css_set;
++
++struct ve_struct {
++	struct list_head	ve_list;
++
++	envid_t			veid;
++	struct list_head	vetask_lh;
++	/* capability bounding set */
++	kernel_cap_t		ve_cap_bset;
++	atomic_t		pcounter;
++	/* ref counter to ve from ipc */
++	atomic_t		counter;
++	unsigned int		class_id;
++	struct rw_semaphore	op_sem;
++	int			is_running;
++	int			is_locked;
++	atomic_t		suspend;
++	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
++	__u64			features;
++
++/* VE's root */
++	struct path		root_path;
++
++	struct file_system_type *proc_fstype;
++	struct vfsmount		*proc_mnt;
++	struct proc_dir_entry	*proc_root;
++
++/* BSD pty's */
++#ifdef CONFIG_LEGACY_PTYS
++	struct tty_driver       *pty_driver;
++	struct tty_driver       *pty_slave_driver;
++#endif
++#ifdef CONFIG_UNIX98_PTYS
++	struct tty_driver	*ptm_driver;
++	struct tty_driver	*pts_driver;
++	struct idr		*allocated_ptys;
++	struct file_system_type *devpts_fstype;
++	struct vfsmount		*devpts_mnt;
++	struct dentry		*devpts_root;
++	struct devpts_config	*devpts_config;
++#endif
++
++	struct file_system_type *shmem_fstype;
++	struct vfsmount		*shmem_mnt;
++#ifdef CONFIG_SYSFS
++	struct file_system_type *sysfs_fstype;
++	struct vfsmount		*sysfs_mnt;
++	struct super_block	*sysfs_sb;
++	struct sysfs_dirent	*_sysfs_root;
++#endif
++#ifndef CONFIG_SYSFS_DEPRECATED
++	struct kobject		*_virtual_dir;
++#endif
++	struct kset		*class_kset;
++	struct kset		*devices_kset;
++	struct class		*tty_class;
++	struct class		*mem_class;
++
++#ifdef CONFIG_NET
++	struct class		*net_class;
++#ifdef CONFIG_INET
++ 	unsigned long		rt_flush_required;
++#endif
++#endif
++#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE)
++	struct veip_struct	*veip;
++	struct net_device	*_venet_dev;
++#endif
++
++/* per VE CPU stats*/
++	struct timespec		start_timespec;
++	u64			start_jiffies;	/* Deprecated */
++	cycles_t 		start_cycles;
++	unsigned long		avenrun[3];	/* loadavg data */
++
++	cycles_t 		cpu_used_ve;
++	struct kstat_lat_pcpu_struct	sched_lat_ve;
++
++#ifdef CONFIG_INET
++	struct icmp_mib		*_icmp_statistics[2];
++	struct icmpmsg_mib	*_icmpmsg_statistics[2];
++	struct ipstats_mib	*_ip_statistics[2];
++	struct tcp_mib		*_tcp_statistics[2];
++	struct udp_mib		*_udp_statistics[2];
++	struct udp_mib		*_udplite_statistics[2];
++	struct linux_mib	*_net_statistics[2];
++	struct venet_stat       *stat;
++#ifdef CONFIG_VE_IPTABLES
++/* core/netfilter.c virtualization */
++	struct xt_table		*_ve_ipt_filter_pf; /* packet_filter struct */
++	struct xt_table		*_ve_ip6t_filter_pf;
++	struct xt_table		*_ipt_mangle_table;
++	struct xt_table		*_ip6t_mangle_table;
++	struct list_head	_xt_tables[NPROTO];
++
++	__u64			ipt_mask;
++	__u64			_iptables_modules;
++	struct ve_nf_conntrack	*_nf_conntrack;
++	struct ve_ipt_recent	*_ipt_recent;
++	struct ve_xt_hashlimit	*_xt_hashlimit;
++#endif /* CONFIG_VE_IPTABLES */
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	struct ipstats_mib	*_ipv6_statistics[2];
++	struct icmpv6_mib	*_icmpv6_statistics[2];
++	struct icmpv6msg_mib	*_icmpv6msg_statistics[2];
++	struct udp_mib		*_udp_stats_in6[2];
++	struct udp_mib		*_udplite_stats_in6[2];
++#endif
++#endif
++	wait_queue_head_t	*_log_wait;
++	unsigned		*_log_start;
++	unsigned		*_log_end;
++	unsigned		*_logged_chars;
++	char			*log_buf;
++#define VE_DEFAULT_LOG_BUF_LEN	4096
++
++	struct ve_cpu_stats	*cpu_stats;
++	unsigned long		down_at;
++	struct list_head	cleanup_list;
++#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
++	struct list_head	_fuse_conn_list;
++	struct super_block	*_fuse_control_sb;
++
++	struct file_system_type	*fuse_fs_type;
++	struct file_system_type	*fuse_ctl_fs_type;
++#endif
++	unsigned long		jiffies_fixup;
++	unsigned char		disable_net;
++	struct ve_monitor	*monitor;
++	struct proc_dir_entry	*monitor_proc;
++	unsigned long		meminfo_val;
++
++	struct nsproxy		*ve_ns;
++	struct net		*ve_netns;
++	struct cgroup		*ve_cgroup;
++	struct css_set		*ve_css_set;
++};
++
++int init_ve_cgroups(struct ve_struct *ve);
++void fini_ve_cgroups(struct ve_struct *ve);
++
++#define VE_CPU_STATS(ve, cpu)	(per_cpu_ptr((ve)->cpu_stats, cpu))
++
++extern int nr_ve;
++extern struct proc_dir_entry *proc_vz_dir;
++extern struct proc_dir_entry *glob_proc_vz_dir;
++
++#ifdef CONFIG_VE
++
++void do_update_load_avg_ve(void);
++void do_env_free(struct ve_struct *ptr);
++
++static inline struct ve_struct *get_ve(struct ve_struct *ptr)
++{
++	if (ptr != NULL)
++		atomic_inc(&ptr->counter);
++	return ptr;
++}
++
++static inline void put_ve(struct ve_struct *ptr)
++{
++	if (ptr && atomic_dec_and_test(&ptr->counter)) {
++		BUG_ON(atomic_read(&ptr->pcounter) > 0);
++		BUG_ON(ptr->is_running);
++		do_env_free(ptr);
++	}
++}
++
++static inline void pget_ve(struct ve_struct *ptr)
++{
++	atomic_inc(&ptr->pcounter);
++}
++
++void ve_cleanup_schedule(struct ve_struct *);
++static inline void pput_ve(struct ve_struct *ptr)
++{
++	if (unlikely(atomic_dec_and_test(&ptr->pcounter)))
++		ve_cleanup_schedule(ptr);
++}
++
++extern spinlock_t ve_cleanup_lock;
++extern struct list_head ve_cleanup_list;
++extern struct task_struct *ve_cleanup_thread;
++
++extern unsigned long long ve_relative_clock(struct timespec * ts);
++
++#ifdef CONFIG_FAIRSCHED
++#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask)
++#else
++#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0)
++#endif
++#else	/* CONFIG_VE */
++#define ve_utsname	system_utsname
++#define get_ve(ve)	(NULL)
++#define put_ve(ve)	do { } while (0)
++#define pget_ve(ve)	do { } while (0)
++#define pput_ve(ve)	do { } while (0)
++#endif	/* CONFIG_VE */
++
++#endif /* _LINUX_VE_H */
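
A ve_struct thus carries two reference counts: counter pins the object itself, while pcounter tracks the tasks running inside the VE, with the last pput_ve() scheduling cleanup. A hedged sketch of how the two pair up in a caller (VE lookup and locking are elided, and the function is illustrative):

static void run_inside_ve(struct ve_struct *ve)
{
	get_ve(ve);	/* object reference: keeps the ve_struct allocated */
	pget_ve(ve);	/* process reference: one more task inside the VE  */

	/* ... do work on behalf of the VE ... */

	pput_ve(ve);	/* last process ref schedules VE cleanup */
	put_ve(ve);	/* last object ref frees via do_env_free() */
}
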
+diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
+new file mode 100644
+index 0000000..26ca897
+--- /dev/null
++++ b/include/linux/ve_proto.h
+@@ -0,0 +1,89 @@
++/*
++ *  include/linux/ve_proto.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_H__
++#define __VE_H__
++
++#ifdef CONFIG_VE
++
++struct ve_struct;
++
++#ifdef CONFIG_INET
++void tcp_v4_kill_ve_sockets(struct ve_struct *envid);
++#ifdef CONFIG_VE_NETDEV
++int venet_init(void);
++#endif
++#endif
++
++extern struct list_head ve_list_head;
++#define for_each_ve(ve)	list_for_each_entry((ve), &ve_list_head, ve_list)
++extern rwlock_t ve_list_lock;
++extern struct ve_struct *get_ve_by_id(envid_t);
++extern struct ve_struct *__find_ve_by_id(envid_t);
++
++struct env_create_param3;
++extern int real_env_create(envid_t veid, unsigned flags, u32 class_id,
++			   struct env_create_param3 *data, int datalen);
++extern void ve_move_task(struct task_struct *, struct ve_struct *);
++
++int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned);
++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode);
++int devperms_seq_show(struct seq_file *m, void *v);
++
++enum {
++	VE_SS_CHAIN,
++
++	VE_MAX_CHAINS
++};
++
++typedef int ve_hook_init_fn(void *data);
++typedef void ve_hook_fini_fn(void *data);
++
++struct ve_hook
++{
++	ve_hook_init_fn *init;
++	ve_hook_fini_fn *fini;
++	struct module *owner;
++
++	/* Functions are called in ascending priority */
++	int priority;
++
++	/* Private part */
++	struct list_head list;
++};
++
++enum {
++	HOOK_PRIO_DEFAULT = 0,
++
++	HOOK_PRIO_FS = HOOK_PRIO_DEFAULT,
++
++	HOOK_PRIO_NET_PRE,
++	HOOK_PRIO_NET,
++	HOOK_PRIO_NET_POST,
++
++	HOOK_PRIO_AFTERALL = INT_MAX
++};
++
++void *ve_seq_start(struct seq_file *m, loff_t *pos);
++void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos);
++void ve_seq_stop(struct seq_file *m, void *v);
++
++extern int ve_hook_iterate_init(int chain, void *data);
++extern void ve_hook_iterate_fini(int chain, void *data);
++
++extern void ve_hook_register(int chain, struct ve_hook *vh);
++extern void ve_hook_unregister(struct ve_hook *vh);
++#else /* CONFIG_VE */
++#define ve_hook_register(ch, vh)	do { } while (0)
++#define ve_hook_unregister(ve)		do { } while (0)
++
++#define get_device_perms_ve(t, d, a)	(0)
++#endif /* CONFIG_VE */
++#endif
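
VE lifecycle hooks are registered per chain and run in ascending priority on init. A sketch of a typical registration; the init/fini bodies are placeholders, and the chain's data argument is assumed here to be the VE being started or stopped:

static int my_feature_init(void *data)
{
	/* allocate per-VE state; a nonzero return aborts VE start */
	return 0;
}

static void my_feature_fini(void *data)
{
	/* release per-VE state */
}

static struct ve_hook my_feature_hook = {
	.init		= my_feature_init,
	.fini		= my_feature_fini,
	.owner		= THIS_MODULE,
	.priority	= HOOK_PRIO_DEFAULT,
};

/* on module load:   ve_hook_register(VE_SS_CHAIN, &my_feature_hook); */
/* on module unload: ve_hook_unregister(&my_feature_hook);            */
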
+diff --git a/include/linux/ve_task.h b/include/linux/ve_task.h
+new file mode 100644
+index 0000000..4b7d722
+--- /dev/null
++++ b/include/linux/ve_task.h
+@@ -0,0 +1,73 @@
++/*
++ *  include/linux/ve_task.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_TASK_H__
++#define __VE_TASK_H__
++
++#include <linux/seqlock.h>
++#include <asm/timex.h>
++
++struct ve_task_info {
++/* virtualization */
++	struct ve_struct *owner_env;
++	struct ve_struct *exec_env;
++	struct ve_struct *saved_env;
++	struct list_head vetask_list;
++	struct dentry *glob_proc_dentry;
++/* statistics: scheduling latency */
++	cycles_t sleep_time;
++	cycles_t sched_time;
++	cycles_t sleep_stamp;
++	cycles_t wakeup_stamp;
++	seqcount_t wakeup_lock;
++};
++
++#define VE_TASK_INFO(task)	(&(task)->ve_task_info)
++#define VE_TASK_LIST_2_TASK(lh)	\
++	list_entry(lh, struct task_struct, ve_task_info.vetask_list)
++
++#ifdef CONFIG_VE
++extern struct ve_struct ve0;
++#define get_ve0()	(&ve0)
++
++#define ve_save_context(t)	do {				\
++		t->ve_task_info.saved_env = 			\
++				t->ve_task_info.exec_env;	\
++		t->ve_task_info.exec_env = get_ve0();		\
++	} while (0)
++#define ve_restore_context(t)	do {				\
++		t->ve_task_info.exec_env = 			\
++				t->ve_task_info.saved_env;	\
++	} while (0)
++
++#define get_exec_env()	(current->ve_task_info.exec_env)
++#define set_exec_env(ve)	({		\
++		struct ve_task_info *vi;	\
++		struct ve_struct *old, *new;	\
++						\
++		vi = &current->ve_task_info;	\
++		old = vi->exec_env;		\
++		new = ve;			\
++		if (unlikely(new == NULL)) {	\
++			printk("%s: NULL exec env (%s)\n", __func__, #ve);\
++			new = get_ve0();	\
++		}				\
++		vi->exec_env = new;		\
++		old;				\
++	})
++#else
++#define get_ve0()		(NULL)
++#define get_exec_env()		(NULL)
++#define set_exec_env(new_env)	(NULL)
++#define ve_save_context(t)	do { } while (0)
++#define ve_restore_context(t)	do { } while (0)
++#endif
++
++#endif /* __VE_TASK_H__ */
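
set_exec_env() returns the previous execution environment, so callers can switch into a VE temporarily and restore it on the way out. A hedged sketch of the usual pattern:

static void do_in_ve(struct ve_struct *target_ve)
{
	struct ve_struct *old_env;

	old_env = set_exec_env(target_ve);	/* returns the previous env */
	/* ... code that must see target_ve from get_exec_env() ... */
	set_exec_env(old_env);			/* always restore on exit */
}
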
+diff --git a/include/linux/veip.h b/include/linux/veip.h
+new file mode 100644
+index 0000000..745f1ec
+--- /dev/null
++++ b/include/linux/veip.h
+@@ -0,0 +1,15 @@
++#ifndef __VE_IP_H_
++#define __VE_IP_H_
++
++struct ve_addr_struct {
++	int family;
++	__u32 key[4];
++};
++
++struct sockaddr;
++
++extern void veaddr_print(char *, int, struct ve_addr_struct *);
++extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
++		struct ve_addr_struct *veaddr);
++
++#endif
+diff --git a/include/linux/venet.h b/include/linux/venet.h
+new file mode 100644
+index 0000000..14cf89e
+--- /dev/null
++++ b/include/linux/venet.h
+@@ -0,0 +1,86 @@
++/*
++ *  include/linux/venet.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VENET_H
++#define _VENET_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/vzcalluser.h>
++#include <linux/veip.h>
++#include <linux/netdevice.h>
++
++#define VEIP_HASH_SZ 512
++
++struct ve_struct;
++struct venet_stat;
++struct venet_stats {
++	struct net_device_stats	stats;
++	struct net_device_stats	*real_stats;
++};
++
++struct ip_entry_struct
++{
++	struct ve_addr_struct	addr;
++	struct ve_struct	*active_env;
++	struct venet_stat	*stat;
++	struct veip_struct	*veip;
++	struct list_head 	ip_hash;
++	struct list_head 	ve_list;
++};
++
++struct veip_struct
++{
++	struct list_head	src_lh;
++	struct list_head	dst_lh;
++	struct list_head	ip_lh;
++	struct list_head	list;
++	envid_t			veid;
++};
++
++static inline struct net_device_stats *
++venet_stats(struct net_device *dev, int cpu)
++{
++	struct venet_stats *stats;
++	stats = (struct venet_stats*)dev->priv;
++	return per_cpu_ptr(stats->real_stats, cpu);
++}
++
++/* veip_hash_lock should be taken for write by caller */
++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
++/* veip_hash_lock should be taken for write by caller */
++void ip_entry_unhash(struct ip_entry_struct *entry);
++/* veip_hash_lock should be taken for read by caller */
++struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *);
++
++/* veip_hash_lock should be taken for read by caller */
++struct veip_struct *veip_find(envid_t veid);
++/* veip_hash_lock should be taken for write by caller */
++struct veip_struct *veip_findcreate(envid_t veid);
++/* veip_hash_lock should be taken for write by caller */
++void veip_put(struct veip_struct *veip);
++
++extern struct list_head veip_lh;
++
++int veip_start(struct ve_struct *ve);
++void veip_stop(struct ve_struct *ve);
++__exit void veip_cleanup(void);
++int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr);
++int veip_entry_del(envid_t veid, struct ve_addr_struct *addr);
++int venet_change_skb_owner(struct sk_buff *skb);
++
++extern struct list_head ip_entry_hash_table[];
++extern rwlock_t veip_hash_lock;
++
++#ifdef CONFIG_PROC_FS
++int veip_seq_show(struct seq_file *m, void *v);
++#endif
++
++#endif
+diff --git a/include/linux/veprintk.h b/include/linux/veprintk.h
+new file mode 100644
+index 0000000..5669d7b
+--- /dev/null
++++ b/include/linux/veprintk.h
+@@ -0,0 +1,38 @@
++/*
++ *  include/linux/veprintk.h
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_PRINTK_H__
++#define __VE_PRINTK_H__
++
++#ifdef CONFIG_VE
++
++#define ve_log_wait		(*(get_exec_env()->_log_wait))
++#define ve_log_start		(*(get_exec_env()->_log_start))
++#define ve_log_end		(*(get_exec_env()->_log_end))
++#define ve_logged_chars		(*(get_exec_env()->_logged_chars))
++#define ve_log_buf		(get_exec_env()->log_buf)
++#define ve_log_buf_len		(ve_is_super(get_exec_env()) ? \
++				log_buf_len : VE_DEFAULT_LOG_BUF_LEN)
++#define VE_LOG_BUF_MASK		(ve_log_buf_len - 1)
++#define VE_LOG_BUF(idx)		(ve_log_buf[(idx) & VE_LOG_BUF_MASK])
++
++#else
++
++#define ve_log_wait		log_wait
++#define ve_log_start		log_start
++#define ve_log_end		log_end
++#define ve_logged_chars		logged_chars
++#define ve_log_buf		log_buf
++#define ve_log_buf_len		log_buf_len
++#define VE_LOG_BUF_MASK		LOG_BUF_MASK
++#define VE_LOG_BUF(idx)		LOG_BUF(idx)
++
++#endif /* CONFIG_VE */
++#endif /* __VE_PRINTK_H__ */
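
VE_LOG_BUF() relies on ve_log_buf_len being a power of two (both log_buf_len and VE_DEFAULT_LOG_BUF_LEN are), so indices can grow monotonically and wrap via a mask instead of a modulo. A small runnable illustration of that masking:

#include <stdio.h>

#define BUF_LEN		4096u		/* VE_DEFAULT_LOG_BUF_LEN */
#define BUF_MASK	(BUF_LEN - 1)

static char buf[BUF_LEN];

static void emit(unsigned *end, char c)
{
	buf[*end & BUF_MASK] = c;	/* index wraps without a modulo */
	(*end)++;			/* end grows monotonically */
}

int main(void)
{
	unsigned end = 4090;
	const char *msg = "wrap-around";

	while (*msg)
		emit(&end, *msg++);
	printf("end=%u, last slot written=%u\n", end, (end - 1) & BUF_MASK);
	return 0;
}
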
+diff --git a/include/linux/veth.h b/include/linux/veth.h
+index 3354c1e..34cfe2b 100644
+--- a/include/linux/veth.h
++++ b/include/linux/veth.h
+@@ -1,3 +1,12 @@
++/*
++ *  include/linux/veth.h
++ *
++ *  Copyright (C) 2007  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
+ #ifndef __NET_VETH_H_
+ #define __NET_VETH_H_
+ 
+@@ -9,4 +18,28 @@ enum {
+ #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
+ };
+ 
++#ifdef __KERNEL__
++struct veth_struct
++{
++	struct net_device_stats stats;
++	struct net_device	*pair;
++	struct list_head	hwaddr_list;
++	struct net_device_stats	*real_stats;
++	int			allow_mac_change;
++};
++
++#define veth_from_netdev(dev) \
++	((struct veth_struct *)(netdev_priv(dev)))
++static inline struct net_device * veth_to_netdev(struct veth_struct *veth)
++{
++	return (struct net_device *)((char *)veth - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST));
++}
++#endif
++
++static inline struct net_device_stats *
++veth_stats(struct net_device *dev, int cpuid)
++{
++	return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid);
++}
++
+ #endif
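
veth_to_netdev() inverts netdev_priv() by hand: the veth private area lives at the aligned end of struct net_device, so subtracting that same aligned size recovers the device pointer. A runnable userspace sketch of the round trip, with stand-in sizes and alignment constant:

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

#define ALIGN_CONST 31u			/* stand-in for NETDEV_ALIGN_CONST */

struct dev  { char pad[100]; };		/* stand-in for struct net_device */
struct priv { int allow_mac_change; };	/* stand-in for struct veth_struct */

static size_t aligned_hdr(void)
{
	return (sizeof(struct dev) + ALIGN_CONST) & ~(size_t)ALIGN_CONST;
}

static union { char raw[256]; long align; } storage;

int main(void)
{
	struct dev  *dev  = (struct dev *)storage.raw;
	struct priv *priv = (struct priv *)(storage.raw + aligned_hdr());

	/* the inverse step, as in veth_to_netdev() */
	struct dev *back = (struct dev *)((char *)priv - aligned_hdr());

	assert(back == dev);
	printf("round trip ok, header occupies %zu bytes\n", aligned_hdr());
	return 0;
}
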
+diff --git a/include/linux/virtinfo.h b/include/linux/virtinfo.h
+new file mode 100644
+index 0000000..b0dad07
+--- /dev/null
++++ b/include/linux/virtinfo.h
+@@ -0,0 +1,100 @@
++/*
++ *  include/linux/virtinfo.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __LINUX_VIRTINFO_H
++#define __LINUX_VIRTINFO_H
++
++#include <linux/kernel.h>
++#include <linux/page-flags.h>
++#include <linux/notifier.h>
++
++struct vnotifier_block
++{
++	int (*notifier_call)(struct vnotifier_block *self,
++			unsigned long, void *, int);
++	struct vnotifier_block *next;
++	int priority;
++};
++
++extern struct semaphore virtinfo_sem;
++void __virtinfo_notifier_register(int type, struct vnotifier_block *nb);
++void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
++int virtinfo_notifier_call(int type, unsigned long n, void *data);
++
++struct page_info {
++	unsigned long nr_file_dirty;
++	unsigned long nr_writeback;
++	unsigned long nr_anon_pages;
++	unsigned long nr_file_mapped;
++	unsigned long nr_slab_rec;
++	unsigned long nr_slab_unrec;
++	unsigned long nr_pagetable;
++	unsigned long nr_unstable_nfs;
++	unsigned long nr_bounce;
++	unsigned long nr_writeback_temp;
++};
++
++struct meminfo {
++	struct sysinfo si;
++	struct page_info pi;
++	unsigned long active, inactive;
++	unsigned long cache, swapcache;
++	unsigned long committed_space;
++	unsigned long allowed;
++	unsigned long vmalloc_total, vmalloc_used, vmalloc_largest;
++};
++
++#define VIRTINFO_MEMINFO	0
++#define VIRTINFO_ENOUGHMEM	1
++#define VIRTINFO_DOFORK         2
++#define VIRTINFO_DOEXIT         3
++#define VIRTINFO_DOEXECVE       4
++#define VIRTINFO_DOFORKRET      5
++#define VIRTINFO_DOFORKPOST     6
++#define VIRTINFO_EXIT           7
++#define VIRTINFO_EXITMMAP       8
++#define VIRTINFO_EXECMMAP       9
++#define VIRTINFO_OUTOFMEM       10
++#define VIRTINFO_PAGEIN         11
++#define VIRTINFO_SYSINFO        12
++#define VIRTINFO_NEWUBC         13
++#define VIRTINFO_VMSTAT		14
++
++enum virt_info_types {
++	VITYPE_GENERAL,
++	VITYPE_FAUDIT,
++	VITYPE_QUOTA,
++	VITYPE_SCP,
++
++	VIRT_TYPES
++};
++
++#ifdef CONFIG_VZ_GENCALLS
++
++static inline int virtinfo_gencall(unsigned long n, void *data)
++{
++	int r;
++
++	r = virtinfo_notifier_call(VITYPE_GENERAL, n, data);
++	if (r & NOTIFY_FAIL)
++		return -ENOBUFS;
++	if (r & NOTIFY_OK)
++		return -ERESTARTNOINTR;
++	return 0;
++}
++
++#else
++
++#define virtinfo_gencall(n, data)	0
++
++#endif
++
++#endif /* __LINUX_VIRTINFO_H */
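
The vnotifier chain differs from a plain notifier chain in that each callback receives and may override the accumulated return value. A kernel-context sketch, illustrative only (too_many_tasks() is hypothetical): a VITYPE_GENERAL callback that vetoes fork, which virtinfo_gencall() then surfaces as -ENOBUFS:

static int my_virtinfo_call(struct vnotifier_block *self,
		unsigned long event, void *data, int old_ret)
{
	if (event == VIRTINFO_DOFORK && too_many_tasks())
		return NOTIFY_FAIL;	/* virtinfo_gencall() -> -ENOBUFS */
	return old_ret;
}

static struct vnotifier_block my_virtinfo_nb = {
	.notifier_call	= my_virtinfo_call,
	.priority	= 0,
};

/* registration: virtinfo_notifier_register(VITYPE_GENERAL, &my_virtinfo_nb); */
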
+diff --git a/include/linux/virtinfoscp.h b/include/linux/virtinfoscp.h
+new file mode 100644
+index 0000000..9e7584f
+--- /dev/null
++++ b/include/linux/virtinfoscp.h
+@@ -0,0 +1,21 @@
++#ifndef __VIRTINFO_SCP_H__
++#define __VIRTINFO_SCP_H__
++
++/*
++ * Dump and restore operations are non-symmetric.
++ * With respect to finish/fail hooks, 2 dump hooks are called from
++ * different proc operations, but restore hooks are called from a single one.
++ */
++#define VIRTINFO_SCP_COLLECT    0x10
++#define VIRTINFO_SCP_DUMP       0x11
++#define VIRTINFO_SCP_DMPFIN     0x12
++#define VIRTINFO_SCP_RSTCHECK   0x13
++#define VIRTINFO_SCP_RESTORE    0x14
++#define VIRTINFO_SCP_RSTFAIL    0x15
++
++#define VIRTINFO_SCP_RSTTSK     0x20
++#define VIRTINFO_SCP_RSTMM      0x21
++
++#define VIRTNOTIFY_CHANGE       0x100 
++
++#endif /* __VIRTINFO_SCP_H__ */
+diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
+index 364789a..3de21b7 100644
+--- a/include/linux/vmalloc.h
++++ b/include/linux/vmalloc.h
+@@ -22,6 +22,10 @@ struct vm_area_struct;
+ #define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
+ #endif
+ 
++/* align size to 2^n page boundary */
++#define POWER2_PAGE_ALIGN(size) \
++	((typeof(size))(1UL << (PAGE_SHIFT + get_order(size))))
++
+ struct vm_struct {
+ 	/* keep next,addr,size together to speedup lookups */
+ 	struct vm_struct	*next;
+@@ -38,12 +42,16 @@ struct vm_struct {
+  *	Highlevel APIs for driver use
+  */
+ extern void *vmalloc(unsigned long size);
++extern void *ub_vmalloc(unsigned long size);
+ extern void *vmalloc_user(unsigned long size);
+ extern void *vmalloc_node(unsigned long size, int node);
++extern void *ub_vmalloc_node(unsigned long size, int node);
+ extern void *vmalloc_exec(unsigned long size);
+ extern void *vmalloc_32(unsigned long size);
+ extern void *vmalloc_32_user(unsigned long size);
+ extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
++extern void *vmalloc_best(unsigned long size);
++extern void *ub_vmalloc_best(unsigned long size);
+ extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
+ 				pgprot_t prot);
+ extern void vfree(const void *addr);
+@@ -71,6 +79,9 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size,
+ 					unsigned long flags, void *caller);
+ extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ 					unsigned long start, unsigned long end);
++extern struct vm_struct * get_vm_area_best(unsigned long size,
++					   unsigned long flags);
++extern void vprintstat(void);
+ extern struct vm_struct *get_vm_area_node(unsigned long size,
+ 					  unsigned long flags, int node,
+ 					  gfp_t gfp_mask);
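
POWER2_PAGE_ALIGN() rounds a size up to a whole power-of-two number of pages, matching get_order() semantics. A runnable check of what it produces, using a minimal userspace reimplementation of get_order():

#include <stdio.h>

#define PAGE_SHIFT 12

static int get_order(unsigned long size)	/* minimal reimplementation */
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

#define POWER2_PAGE_ALIGN(size) (1UL << (PAGE_SHIFT + get_order(size)))

int main(void)
{
	printf("%lu\n", POWER2_PAGE_ALIGN(20 * 1024UL));	/* 32768 */
	printf("%lu\n", POWER2_PAGE_ALIGN(4096UL));		/* 4096  */
	return 0;
}
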
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index e83b693..fec874f 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -92,6 +92,7 @@ static inline void vm_events_fold_cpu(int cpu)
+ }
+ #endif
+ 
++extern unsigned long vm_events(enum vm_event_item i);
+ #else
+ 
+ /* Disable counters */
+@@ -114,6 +115,7 @@ static inline void vm_events_fold_cpu(int cpu)
+ {
+ }
+ 
++static inline unsigned long vm_events(enum vm_event_item i) { return 0; }
+ #endif /* CONFIG_VM_EVENT_COUNTERS */
+ 
+ #define __count_zone_vm_events(item, zone, delta) \
+diff --git a/include/linux/vzcalluser.h b/include/linux/vzcalluser.h
+new file mode 100644
+index 0000000..9736479
+--- /dev/null
++++ b/include/linux/vzcalluser.h
+@@ -0,0 +1,195 @@
++/*
++ *  include/linux/vzcalluser.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VZCALLUSER_H
++#define _LINUX_VZCALLUSER_H
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++#include <linux/vziptable_defs.h>
++
++#define KERN_VZ_PRIV_RANGE 51
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++#ifndef __KERNEL__
++#define __user
++#endif
++
++/*
++ * VE management ioctls
++ */
++
++struct vzctl_old_env_create {
++	envid_t veid;
++	unsigned flags;
++#define VE_CREATE 	1	/* Create VE, VE_ENTER added automatically */
++#define VE_EXCLUSIVE	2	/* Fail if exists */
++#define VE_ENTER	4	/* Enter existing VE */
++#define VE_TEST		8	/* Test if VE exists */
++#define VE_LOCK		16	/* Do not allow entering created VE */
++#define VE_SKIPLOCK	32	/* Allow entering an embryonic VE */
++	__u32 addr;
++};
++
++struct vzctl_mark_env_to_down {
++	envid_t veid;
++};
++
++struct vzctl_setdevperms {
++	envid_t veid;
++	unsigned type;
++#define VE_USE_MAJOR	010	/* Test MAJOR supplied in rule */
++#define VE_USE_MINOR	030	/* Test MINOR supplied in rule */
++#define VE_USE_MASK	030	/* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
++	unsigned dev;
++	unsigned mask;
++};
++
++struct vzctl_ve_netdev {
++	envid_t veid;
++	int op;
++#define VE_NETDEV_ADD  1
++#define VE_NETDEV_DEL  2
++	char __user *dev_name;
++};
++
++struct vzctl_ve_meminfo {
++	envid_t veid;
++	unsigned long val;
++};
++
++struct vzctl_env_create_cid {
++	envid_t veid;
++	unsigned flags;
++	__u32 class_id;
++};
++
++struct vzctl_env_create {
++	envid_t veid;
++	unsigned flags;
++	__u32 class_id;
++};
++
++struct env_create_param {
++	__u64 iptables_mask;
++};
++
++#define VZCTL_ENV_CREATE_DATA_MINLEN	sizeof(struct env_create_param)
++
++struct env_create_param2 {
++	__u64 iptables_mask;
++	__u64 feature_mask;
++	__u32 total_vcpus;	/* 0 - don't care, same as in host */
++};
++
++struct env_create_param3 {
++	__u64 iptables_mask;
++	__u64 feature_mask;
++	__u32 total_vcpus;
++	__u32 pad;
++	__u64 known_features;
++};
++
++#define VE_FEATURE_SYSFS	(1ULL << 0)
++#define VE_FEATURE_DEF_PERMS	(1ULL << 2)
++
++#define VE_FEATURES_OLD		(VE_FEATURE_SYSFS)
++#define VE_FEATURES_DEF		(VE_FEATURE_SYSFS | \
++				 VE_FEATURE_DEF_PERMS)
++
++typedef struct env_create_param3 env_create_param_t;
++#define VZCTL_ENV_CREATE_DATA_MAXLEN	sizeof(env_create_param_t)
++
++struct vzctl_env_create_data {
++	envid_t veid;
++	unsigned flags;
++	__u32 class_id;
++	env_create_param_t __user *data;
++	int datalen;
++};
++
++struct vz_load_avg {
++	int val_int;
++	int val_frac;
++};
++
++struct vz_cpu_stat {
++	unsigned long user_jif;
++	unsigned long nice_jif;
++	unsigned long system_jif; 
++	unsigned long uptime_jif;
++	__u64 idle_clk;
++	__u64 strv_clk;
++	__u64 uptime_clk;
++	struct vz_load_avg avenrun[3];	/* loadavg data */
++};
++
++struct vzctl_cpustatctl {
++	envid_t veid;
++	struct vz_cpu_stat __user *cpustat;
++};
++
++#define VZCTLTYPE '.'
++#define VZCTL_OLD_ENV_CREATE	_IOW(VZCTLTYPE, 0,			\
++					struct vzctl_old_env_create)
++#define VZCTL_MARK_ENV_TO_DOWN	_IOW(VZCTLTYPE, 1,			\
++					struct vzctl_mark_env_to_down)
++#define VZCTL_SETDEVPERMS	_IOW(VZCTLTYPE, 2,			\
++					struct vzctl_setdevperms)
++#define VZCTL_ENV_CREATE_CID	_IOW(VZCTLTYPE, 4,			\
++					struct vzctl_env_create_cid)
++#define VZCTL_ENV_CREATE	_IOW(VZCTLTYPE, 5,			\
++					struct vzctl_env_create)
++#define VZCTL_GET_CPU_STAT	_IOW(VZCTLTYPE, 6,			\
++					struct vzctl_cpustatctl)
++#define VZCTL_ENV_CREATE_DATA	_IOW(VZCTLTYPE, 10,			\
++					struct vzctl_env_create_data)
++#define VZCTL_VE_NETDEV		_IOW(VZCTLTYPE, 11,			\
++					struct vzctl_ve_netdev)
++#define VZCTL_VE_MEMINFO	_IOW(VZCTLTYPE, 13,                     \
++					struct vzctl_ve_meminfo)
++
++#ifdef __KERNEL__
++#ifdef CONFIG_COMPAT
++#include <linux/compat.h>
++
++struct compat_vzctl_ve_netdev {
++	envid_t veid;
++	int op;
++	compat_uptr_t dev_name;
++};
++
++struct compat_vzctl_ve_meminfo {
++	envid_t veid;
++	compat_ulong_t val;
++};
++
++struct compat_vzctl_env_create_data {
++	envid_t veid;
++	unsigned flags;
++	__u32 class_id;
++	compat_uptr_t data;
++	int datalen;
++};
++
++#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10,		\
++					struct compat_vzctl_env_create_data)
++#define VZCTL_COMPAT_VE_NETDEV	_IOW(VZCTLTYPE, 11,			\
++					struct compat_vzctl_ve_netdev)
++#define VZCTL_COMPAT_VE_MEMINFO	_IOW(VZCTLTYPE, 13,                     \
++					struct compat_vzctl_ve_meminfo)
++#endif
++#endif
++
++#endif
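
These ioctls are the management entry points that vzctl-style tools drive. A hedged userspace sketch of creating a locked VE through VZCTL_ENV_CREATE_DATA, assuming the headers are installed and the control node is /dev/vzctl:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vzcalluser.h>

int main(void)
{
	struct vzctl_env_create_data cd;
	env_create_param_t param;
	int fd = open("/dev/vzctl", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&param, 0, sizeof(param));
	param.iptables_mask = VE_IP_DEFAULT;	/* safe default modules */
	param.feature_mask  = VE_FEATURES_DEF;

	memset(&cd, 0, sizeof(cd));
	cd.veid    = 1001;
	cd.flags   = VE_CREATE | VE_LOCK;	/* create, no entry yet */
	cd.data    = &param;
	cd.datalen = sizeof(param);

	if (ioctl(fd, VZCTL_ENV_CREATE_DATA, &cd) < 0)
		perror("VZCTL_ENV_CREATE_DATA");
	return 0;
}
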
+diff --git a/include/linux/vzctl.h b/include/linux/vzctl.h
+new file mode 100644
+index 0000000..ad967ed
+--- /dev/null
++++ b/include/linux/vzctl.h
+@@ -0,0 +1,30 @@
++/*
++ *  include/linux/vzctl.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VZCTL_H
++#define _LINUX_VZCTL_H
++
++#include <linux/list.h>
++
++struct module;
++struct inode;
++struct file;
++struct vzioctlinfo {
++	unsigned type;
++	int (*ioctl)(struct file *, unsigned int, unsigned long);
++	int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
++	struct module *owner;
++	struct list_head list;
++};
++
++extern void vzioctl_register(struct vzioctlinfo *inf);
++extern void vzioctl_unregister(struct vzioctlinfo *inf);
++
++#endif
+diff --git a/include/linux/vzctl_quota.h b/include/linux/vzctl_quota.h
+new file mode 100644
+index 0000000..6d36cdd
+--- /dev/null
++++ b/include/linux/vzctl_quota.h
+@@ -0,0 +1,74 @@
++/*
++ *  include/linux/vzctl_quota.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __LINUX_VZCTL_QUOTA_H__
++#define __LINUX_VZCTL_QUOTA_H__
++
++#include <linux/compat.h>
++
++#ifndef __KERNEL__
++#define __user
++#endif
++
++/*
++ * Quota management ioctl
++ */
++
++struct vz_quota_stat;
++struct vzctl_quotactl {
++	int cmd;
++	unsigned int quota_id;
++	struct vz_quota_stat __user *qstat;
++	char __user *ve_root;
++};
++
++struct vzctl_quotaugidctl {
++	int cmd;		/* subcommand */
++	unsigned int quota_id;	/* quota id where it applies to */
++	unsigned int	ugid_index;/* for reading statistics: index of the
++				    first uid/gid record to read */
++	unsigned int ugid_size;	/* size of ugid_buf array */
++	void *addr; 		/* user-level buffer */
++};
++
++#define VZDQCTLTYPE '+'
++#define VZCTL_QUOTA_DEPR_CTL	_IOWR(VZDQCTLTYPE, 1,			\
++					struct vzctl_quotactl)
++#define VZCTL_QUOTA_NEW_CTL	_IOWR(VZDQCTLTYPE, 2,			\
++					struct vzctl_quotactl)
++#define VZCTL_QUOTA_UGID_CTL	_IOWR(VZDQCTLTYPE, 3,			\
++					struct vzctl_quotaugidctl)
++
++#ifdef __KERNEL__
++#ifdef CONFIG_COMPAT
++struct compat_vzctl_quotactl {
++	int cmd;
++	unsigned int quota_id;
++	compat_uptr_t qstat;
++	compat_uptr_t ve_root;
++};
++
++struct compat_vzctl_quotaugidctl {
++	int cmd;		/* subcommand */
++	unsigned int quota_id;	/* quota id where it applies to */
++	unsigned int	ugid_index;/* for reading statistics: index of the
++				    first uid/gid record to read */
++	unsigned int ugid_size;	/* size of ugid_buf array */
++	compat_uptr_t addr; 	/* user-level buffer */
++};
++
++#define VZCTL_COMPAT_QUOTA_CTL	_IOWR(VZDQCTLTYPE, 2,			\
++					struct compat_vzctl_quotactl)
++#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3,		\
++					struct compat_vzctl_quotaugidctl)
++#endif
++#endif
++
++#endif /* __LINUX_VZCTL_QUOTA_H__ */
+diff --git a/include/linux/vzctl_venet.h b/include/linux/vzctl_venet.h
+new file mode 100644
+index 0000000..4797a50
+--- /dev/null
++++ b/include/linux/vzctl_venet.h
+@@ -0,0 +1,51 @@
++/*
++ *  include/linux/vzctl_venet.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZCTL_VENET_H
++#define _VZCTL_VENET_H
++
++#include <linux/types.h>
++#include <linux/compat.h>
++#include <linux/ioctl.h>
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++struct vzctl_ve_ip_map {
++	envid_t veid;
++	int op;
++#define VE_IP_ADD	1
++#define VE_IP_DEL	2
++	struct sockaddr *addr;
++	int addrlen;
++};
++
++#define VENETCTLTYPE '('
++
++#define VENETCTL_VE_IP_MAP	_IOW(VENETCTLTYPE, 3,			\
++					struct vzctl_ve_ip_map)
++
++#ifdef __KERNEL__
++#ifdef CONFIG_COMPAT
++struct compat_vzctl_ve_ip_map {
++	envid_t veid;
++	int op;
++	compat_uptr_t addr;
++	int addrlen;
++};
++
++#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3,			\
++					struct compat_vzctl_ve_ip_map)
++#endif
++#endif
++
++#endif
+diff --git a/include/linux/vzctl_veth.h b/include/linux/vzctl_veth.h
+new file mode 100644
+index 0000000..1480c5b
+--- /dev/null
++++ b/include/linux/vzctl_veth.h
+@@ -0,0 +1,42 @@
++/*
++ *  include/linux/vzctl_veth.h
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZCTL_VETH_H
++#define _VZCTL_VETH_H
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++struct vzctl_ve_hwaddr {
++	envid_t veid;
++	int op;
++#define VE_ETH_ADD			1
++#define VE_ETH_DEL			2
++#define VE_ETH_ALLOW_MAC_CHANGE		3
++#define VE_ETH_DENY_MAC_CHANGE		4
++	unsigned char	dev_addr[6];
++	int addrlen;
++	char		dev_name[16];
++	unsigned char	dev_addr_ve[6];
++	int addrlen_ve;
++	char		dev_name_ve[16];
++};
++
++#define VETHCTLTYPE '['
++
++#define VETHCTL_VE_HWADDR	_IOW(VETHCTLTYPE, 3,			\
++					struct vzctl_ve_hwaddr)
++
++#endif
+diff --git a/include/linux/vzdq_tree.h b/include/linux/vzdq_tree.h
+new file mode 100644
+index 0000000..c019e09
+--- /dev/null
++++ b/include/linux/vzdq_tree.h
+@@ -0,0 +1,99 @@
++/*
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo disk quota tree definition
++ */
++
++#ifndef _VZDQ_TREE_H
++#define _VZDQ_TREE_H
++
++#include <linux/list.h>
++#include <asm/string.h>
++
++typedef unsigned int quotaid_t;
++#define QUOTAID_BITS		32
++#define QUOTAID_BBITS		4
++#define QUOTAID_EBITS		8
++
++#if QUOTAID_EBITS % QUOTAID_BBITS
++#error Quota bit assumption failure
++#endif
++
++#define QUOTATREE_BSIZE		(1 << QUOTAID_BBITS)
++#define QUOTATREE_BMASK		(QUOTATREE_BSIZE - 1)
++#define QUOTATREE_DEPTH		((QUOTAID_BITS + QUOTAID_BBITS - 1) \
++							/ QUOTAID_BBITS)
++#define QUOTATREE_EDEPTH	((QUOTAID_BITS + QUOTAID_EBITS - 1) \
++							/ QUOTAID_EBITS)
++#define QUOTATREE_BSHIFT(lvl)	((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS)
++
++/*
++ * Depth of keeping unused node (not inclusive).
++ * 0 means release all nodes including root,
++ * QUOTATREE_DEPTH means never release nodes.
++ * Current value: release all nodes strictly after QUOTATREE_EDEPTH 
++ * (measured in external shift units).
++ */
++#define QUOTATREE_CDEPTH	(QUOTATREE_DEPTH \
++				- 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \
++				+ 1)
++
++/*
++ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes.
++ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS),
++ * and each node contains 2^QUOTAID_BBITS pointers.
++ * Level 0 is a (single) tree root node.
++ *
++ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data.
++ * Nodes of lower levels contain pointers to nodes.
++ *
++ * A double pointer in the array of an i-level node, pointing to an
++ * (i+1)-level node (such as inside quotatree_find_state), is marked with
++ * level (i+1), not i.
++ * The level 0 double pointer is the pointer to the root inside the tree
++ * struct.
++ *
++ * The tree is permanent, i.e. all allocated index blocks are kept alive,
++ * so that block numbers in the quota file tree are preserved and its
++ * changes stay local.
++ */
++struct quotatree_node {
++	struct list_head list;
++	quotaid_t num;
++	void *blocks[QUOTATREE_BSIZE];
++};
++
++struct quotatree_level {
++	struct list_head usedlh, freelh;
++	quotaid_t freenum;
++};
++
++struct quotatree_tree {
++	struct quotatree_level levels[QUOTATREE_DEPTH];
++	struct quotatree_node *root;
++	unsigned int leaf_num;
++};
++
++struct quotatree_find_state {
++	void **block;
++	int level;
++};
++
++/* number of leafs (objects) and leaf level of the tree */
++#define QTREE_LEAFNUM(tree)	((tree)->leaf_num)
++#define QTREE_LEAFLVL(tree)	(&(tree)->levels[QUOTATREE_DEPTH - 1])
++
++struct quotatree_tree *quotatree_alloc(void);
++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
++		struct quotatree_find_state *st);
++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
++		struct quotatree_find_state *st, void *data);
++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id);
++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *));
++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id);
++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index);
++
++#endif /* _VZDQ_TREE_H */
++
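
QUOTATREE_BSHIFT() defines a radix walk: a 32-bit quota id is consumed QUOTAID_BBITS (4) bits at a time, giving a tree QUOTATREE_DEPTH (8) levels deep with QUOTATREE_BSIZE (16) slots per node. A runnable illustration of the slot extraction:

#include <stdio.h>

#define BBITS	4
#define BITS	32
#define DEPTH	((BITS + BBITS - 1) / BBITS)	/* 8 levels */
#define BSIZE	(1u << BBITS)			/* 16 slots per node */
#define BMASK	(BSIZE - 1)
#define BSHIFT(lvl) ((DEPTH - (lvl) - 1) * BBITS)

int main(void)
{
	unsigned int id = 0x12345678;
	int lvl;

	for (lvl = 0; lvl < DEPTH; lvl++)
		printf("level %d -> slot %u\n", lvl, (id >> BSHIFT(lvl)) & BMASK);
	/* prints slots 1,2,3,4,5,6,7,8: the id's nibbles, high to low */
	return 0;
}
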
+diff --git a/include/linux/vzevent.h b/include/linux/vzevent.h
+new file mode 100644
+index 0000000..1a67297
+--- /dev/null
++++ b/include/linux/vzevent.h
+@@ -0,0 +1,13 @@
++#ifndef __LINUX_VZ_EVENT_H__
++#define __LINUX_VZ_EVENT_H__
++
++#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
++extern int vzevent_send(int msg, const char *attrs_fmt, ...);
++#else
++static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
++{
++	return 0;
++}
++#endif
++
++#endif /* __LINUX_VZ_EVENT_H__ */
+diff --git a/include/linux/vziptable_defs.h b/include/linux/vziptable_defs.h
+new file mode 100644
+index 0000000..ec7586f
+--- /dev/null
++++ b/include/linux/vziptable_defs.h
+@@ -0,0 +1,51 @@
++#ifndef _LINUX_VZIPTABLE_DEFS_H
++#define _LINUX_VZIPTABLE_DEFS_H
++
++/* these masks represent modules */
++#define VE_IP_IPTABLES_MOD		(1U<<0)
++#define VE_IP_FILTER_MOD		(1U<<1)
++#define VE_IP_MANGLE_MOD		(1U<<2)
++#define VE_IP_CONNTRACK_MOD		(1U<<14)
++#define VE_IP_CONNTRACK_FTP_MOD		(1U<<15)
++#define VE_IP_CONNTRACK_IRC_MOD		(1U<<16)
++#define VE_IP_NAT_MOD			(1U<<20)
++#define VE_IP_NAT_FTP_MOD		(1U<<21)
++#define VE_IP_NAT_IRC_MOD		(1U<<22)
++#define VE_IP_IPTABLES6_MOD		(1U<<26)
++#define VE_IP_FILTER6_MOD		(1U<<27)
++#define VE_IP_MANGLE6_MOD		(1U<<28)
++#define VE_IP_IPTABLE_NAT_MOD		(1U<<29)
++#define VE_NF_CONNTRACK_MOD		(1U<<30)
++
++/* these masks represent modules with their dependences */
++#define VE_IP_IPTABLES		(VE_IP_IPTABLES_MOD)
++#define VE_IP_FILTER		(VE_IP_FILTER_MOD		\
++					| VE_IP_IPTABLES)
++#define VE_IP_MANGLE		(VE_IP_MANGLE_MOD		\
++					| VE_IP_IPTABLES)
++#define VE_IP_IPTABLES6		(VE_IP_IPTABLES6_MOD)
++#define VE_IP_FILTER6		(VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
++#define VE_IP_MANGLE6		(VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
++#define VE_NF_CONNTRACK		(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK		(VE_IP_CONNTRACK_MOD		\
++					| VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK_FTP	(VE_IP_CONNTRACK_FTP_MOD	\
++					| VE_IP_CONNTRACK)
++#define VE_IP_CONNTRACK_IRC	(VE_IP_CONNTRACK_IRC_MOD	\
++					| VE_IP_CONNTRACK)
++#define VE_IP_NAT		(VE_IP_NAT_MOD			\
++					| VE_IP_CONNTRACK)
++#define VE_IP_NAT_FTP		(VE_IP_NAT_FTP_MOD		\
++					| VE_IP_NAT | VE_IP_CONNTRACK_FTP)
++#define VE_IP_NAT_IRC		(VE_IP_NAT_IRC_MOD		\
++					| VE_IP_NAT | VE_IP_CONNTRACK_IRC)
++#define VE_IP_IPTABLE_NAT	(VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
++
++/* safe iptables mask to be used by default */
++#define VE_IP_DEFAULT					\
++	(VE_IP_IPTABLES |				\
++	VE_IP_FILTER | VE_IP_MANGLE)
++
++#define VE_IPT_CMP(x, y)		(((x) & (y)) == (y))
++
++#endif /* _LINUX_VZIPTABLE_DEFS_H */
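
Because every composite VE_IP_* mask already ORs in its dependencies, VE_IPT_CMP() can test "module plus all prerequisites enabled" with a single AND. A runnable illustration with a few of the masks:

#include <stdio.h>

#define VE_IP_IPTABLES_MOD	(1u << 0)
#define VE_IP_CONNTRACK_MOD	(1u << 14)
#define VE_IP_NAT_MOD		(1u << 20)
#define VE_IP_IPTABLES		(VE_IP_IPTABLES_MOD)
#define VE_IP_CONNTRACK		(VE_IP_CONNTRACK_MOD | VE_IP_IPTABLES)
#define VE_IP_NAT		(VE_IP_NAT_MOD | VE_IP_CONNTRACK)

#define VE_IPT_CMP(x, y)	(((x) & (y)) == (y))

int main(void)
{
	unsigned long long mask = VE_IP_CONNTRACK;	/* conntrack + iptables */

	printf("conntrack allowed: %d\n", VE_IPT_CMP(mask, VE_IP_CONNTRACK)); /* 1 */
	printf("nat allowed:       %d\n", VE_IPT_CMP(mask, VE_IP_NAT));       /* 0 */
	return 0;
}
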
+diff --git a/include/linux/vzquota.h b/include/linux/vzquota.h
+new file mode 100644
+index 0000000..18668e6
+--- /dev/null
++++ b/include/linux/vzquota.h
+@@ -0,0 +1,379 @@
++/*
++ *
++ * Copyright (C) 2001-2005 SWsoft
++ * All rights reserved.
++ * 
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo disk quota implementation
++ */
++
++#ifndef _VZDQUOTA_H
++#define _VZDQUOTA_H
++
++#include <linux/types.h>
++#include <linux/quota.h>
++
++/* vzquotactl syscall commands */
++#define VZ_DQ_CREATE		5 /* create quota master block */
++#define VZ_DQ_DESTROY		6 /* destroy qmblk */
++#define VZ_DQ_ON		7 /* mark dentry with already created qmblk */
++#define VZ_DQ_OFF		8 /* remove mark, don't destroy qmblk */
++#define VZ_DQ_SETLIMIT		9 /* set new limits */
++#define VZ_DQ_GETSTAT		10 /* get usage statistic */
++#define VZ_DQ_OFF_FORCED	11 /* forced off */
++/* set of syscalls to maintain UGID quotas */
++#define VZ_DQ_UGID_GETSTAT	1 /* get usage/limits for ugid(s) */
++#define VZ_DQ_UGID_ADDSTAT	2 /* set usage/limits statistic for ugid(s) */
++#define VZ_DQ_UGID_GETGRACE	3 /* get expire times */
++#define VZ_DQ_UGID_SETGRACE	4 /* set expire times */
++#define VZ_DQ_UGID_GETCONFIG	5 /* get ugid_max limit, cnt, flags of qmblk */
++#define VZ_DQ_UGID_SETCONFIG	6 /* set ugid_max limit, flags of qmblk */
++#define VZ_DQ_UGID_SETLIMIT	7 /* set ugid B/I limits */
++#define VZ_DQ_UGID_SETINFO	8 /* set ugid info */
++
++/* common structure for vz and ugid quota */
++struct dq_stat {
++	/* blocks limits */
++	__u64	bhardlimit;	/* absolute limit in bytes */
++	__u64	bsoftlimit;	/* preferred limit in bytes */
++	time_t	btime;		/* time limit for excessive disk use */
++	__u64	bcurrent;	/* current bytes count */
++	/* inodes limits */
++	__u32	ihardlimit;	/* absolute limit on allocated inodes */
++	__u32	isoftlimit;	/* preferred inode limit */
++	time_t	itime;		/* time limit for excessive inode use */
++	__u32	icurrent;	/* current # allocated inodes */
++};
++
++/* One second resolution for grace times */
++#define CURRENT_TIME_SECONDS	(get_seconds())
++
++/* Values for dq_info->flags */
++#define VZ_QUOTA_INODES 0x01       /* inodes limit warning printed */
++#define VZ_QUOTA_SPACE  0x02       /* space limit warning printed */
++
++struct dq_info {
++	time_t		bexpire;   /* expire timeout for excessive disk use */
++	time_t		iexpire;   /* expire timeout for excessive inode use */
++	unsigned	flags;	   /* see previous defines */
++};
++
++struct vz_quota_stat  {
++	struct dq_stat dq_stat;
++	struct dq_info dq_info;
++};
++
++/* UID/GID interface record - for user-kernel level exchange */
++struct vz_quota_iface {
++	unsigned int	qi_id;	   /* UID/GID this applies to */
++	unsigned int	qi_type;   /* USRQUOTA|GRPQUOTA */
++	struct dq_stat	qi_stat;   /* limits, options, usage stats */
++};
++
++#ifdef CONFIG_COMPAT
++#include <linux/compat.h>
++struct compat_dq_stat {
++	/* blocks limits */
++	__u64	bhardlimit;	/* absolute limit in bytes */
++	__u64	bsoftlimit;	/* preferred limit in bytes */
++	compat_time_t btime;	/* time limit for excessive disk use */
++	__u64	bcurrent;	/* current bytes count */
++	/* inodes limits */
++	__u32	ihardlimit;	/* absolute limit on allocated inodes */
++	__u32	isoftlimit;	/* preferred inode limit */
++	compat_time_t itime;	/* time limit for excessive inode use */
++	__u32	icurrent;	/* current # allocated inodes */
++};
++
++struct compat_dq_info {
++	compat_time_t	bexpire;   /* expire timeout for excessive disk use */
++	compat_time_t	iexpire;   /* expire timeout for excessive inode use */
++	unsigned	flags;	   /* see previous defines */
++};
++
++struct compat_vz_quota_stat  {
++	struct compat_dq_stat dq_stat;
++	struct compat_dq_info dq_info;
++};
++
++struct compat_vz_quota_iface {
++	unsigned int	qi_id;	   /* UID/GID this applies to */
++	unsigned int	qi_type;   /* USRQUOTA|GRPQUOTA */
++	struct compat_dq_stat qi_stat;   /* limits, options, usage stats */
++};
++
++static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs,
++				struct dq_stat *dqs)
++{
++	dqs->bhardlimit = odqs->bhardlimit;
++	dqs->bsoftlimit = odqs->bsoftlimit;
++	dqs->bcurrent = odqs->bcurrent;
++	dqs->btime = odqs->btime;
++
++	dqs->ihardlimit = odqs->ihardlimit;
++	dqs->isoftlimit = odqs->isoftlimit;
++	dqs->icurrent = odqs->icurrent;
++	dqs->itime = odqs->itime;
++}
++
++static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi,
++				struct dq_info *dqi)
++{
++	dqi->bexpire = odqi->bexpire;
++	dqi->iexpire = odqi->iexpire;
++	dqi->flags = odqi->flags;
++}
++
++static inline void dqstat2compat_dqstat(struct dq_stat *dqs,
++				struct compat_dq_stat *odqs)
++{
++	odqs->bhardlimit = dqs->bhardlimit;
++	odqs->bsoftlimit = dqs->bsoftlimit;
++	odqs->bcurrent = dqs->bcurrent;
++	odqs->btime = (compat_time_t)dqs->btime;
++
++	odqs->ihardlimit = dqs->ihardlimit;
++	odqs->isoftlimit = dqs->isoftlimit;
++	odqs->icurrent = dqs->icurrent;
++	odqs->itime = (compat_time_t)dqs->itime;
++}
++
++static inline void dqinfo2compat_dqinfo(struct dq_info *dqi,
++				struct compat_dq_info *odqi)
++{
++	odqi->bexpire = (compat_time_t)dqi->bexpire;
++	odqi->iexpire = (compat_time_t)dqi->iexpire;
++	odqi->flags = dqi->flags;
++}
++#endif
++
++/* values for flags and dq_flags */
++/* this flag is set if userspace has been unable to provide usage
++ * information about all ugids.
++ * While the flag is set, we neither allocate new UG quota blocks (their
++ * current usage is unknown) nor free existing ones (so as not to lose
++ * the information that a block is OK) */
++#define VZDQUG_FIXED_SET	0x01
++/* permit to use ugid quota */
++#define VZDQUG_ON		0x02
++#define VZDQ_USRQUOTA		0x10
++#define VZDQ_GRPQUOTA		0x20
++#define VZDQ_NOACT		0x1000	/* not actual */
++#define VZDQ_NOQUOT		0x2000	/* not under quota tree */
++
++struct vz_quota_ugid_stat {
++	unsigned int	limit;	/* max amount of ugid records */
++	unsigned int	count;	/* amount of ugid records */
++	unsigned int	flags;	
++};
++
++struct vz_quota_ugid_setlimit {
++	unsigned int	type;	/* quota type (USR/GRP) */
++	unsigned int	id;	/* ugid */
++	struct if_dqblk dqb;	/* limits info */
++};
++
++struct vz_quota_ugid_setinfo {
++	unsigned int	type;	/* quota type (USR/GRP) */
++	struct if_dqinfo dqi;	/* grace info */
++};
++
++#ifdef __KERNEL__
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <asm/semaphore.h>
++#include <linux/time.h>
++#include <linux/vzquota_qlnk.h>
++#include <linux/vzdq_tree.h>
++
++/* Values for dq_info flags */
++#define VZ_QUOTA_INODES	0x01	   /* inodes limit warning printed */
++#define VZ_QUOTA_SPACE	0x02	   /* space limit warning printed */
++
++/* values for dq_state */
++#define VZDQ_STARTING		0 /* created, not turned on yet */
++#define VZDQ_WORKING		1 /* quota created, turned on */
++#define VZDQ_STOPING		2 /* created, turned on and off */
++
++/* master quota record - one per veid */
++struct vz_quota_master {
++	struct list_head	dq_hash;	/* next quota in hash list */
++	atomic_t		dq_count;	/* inode reference count */
++	unsigned int		dq_flags;	/* see VZDQUG_FIXED_SET */
++	unsigned int		dq_state;	/* see values above */
++	unsigned int		dq_id;		/* VEID this applies to */
++	struct dq_stat		dq_stat; 	/* limits, grace, usage stats */
++	struct dq_info		dq_info;	/* grace times and flags */
++	spinlock_t		dq_data_lock;	/* for dq_stat */
++
++	struct semaphore	dq_sem;		/* semaphore to protect 
++						   ugid tree */
++
++	struct list_head	dq_ilink_list;	/* list of vz_quota_ilink */
++	struct quotatree_tree	*dq_uid_tree;	/* vz_quota_ugid tree for UIDs */
++	struct quotatree_tree	*dq_gid_tree;	/* vz_quota_ugid tree for GIDs */
++	unsigned int		dq_ugid_count;	/* amount of ugid records */
++	unsigned int		dq_ugid_max;	/* max amount of ugid records */
++	struct dq_info		dq_ugid_info[MAXQUOTAS]; /* ugid grace times */
++
++	struct path		dq_root_path;	/* path of fs tree */
++	struct super_block	*dq_sb;	      /* superblock of our quota root */
++};
++
++/* UID/GID quota record - one per pair (quota_master, uid or gid) */
++struct vz_quota_ugid {
++	unsigned int		qugid_id;     /* UID/GID this applies to */
++	struct dq_stat		qugid_stat;   /* limits, options, usage stats */
++	int			qugid_type;   /* USRQUOTA|GRPQUOTA */
++	atomic_t		qugid_count;  /* reference count */
++};
++
++#define VZ_QUOTA_UGBAD		((struct vz_quota_ugid *)0xfeafea11)
++
++struct vz_quota_datast {
++	struct vz_quota_ilink qlnk;
++};
++
++#define VIRTINFO_QUOTA_GETSTAT	0
++#define VIRTINFO_QUOTA_ON	1
++#define VIRTINFO_QUOTA_OFF	2
++#define VIRTINFO_QUOTA_DISABLE	3
++
++struct virt_info_quota {
++	struct super_block *super;
++	struct dq_stat *qstat;
++};
++
++/*
++ * Interface to VZ quota core
++ */
++#define INODE_QLNK(inode)	(&(inode)->i_qlnk)
++#define QLNK_INODE(qlnk)	container_of((qlnk), struct inode, i_qlnk)
++
++#define VZ_QUOTA_BAD		((struct vz_quota_master *)0xefefefef)
++
++#define VZ_QUOTAO_SETE		1
++#define VZ_QUOTAO_INIT		2
++#define VZ_QUOTAO_DESTR		3
++#define VZ_QUOTAO_SWAP		4
++#define VZ_QUOTAO_INICAL	5
++#define VZ_QUOTAO_DRCAL		6
++#define VZ_QUOTAO_QSET		7
++#define VZ_QUOTAO_TRANS		8
++#define VZ_QUOTAO_ACT		9
++#define VZ_QUOTAO_DTREE		10
++#define VZ_QUOTAO_DET		11
++#define VZ_QUOTAO_ON		12
++#define VZ_QUOTAO_RE_LOCK	13
++
++#define DQUOT_CMD_ALLOC		0
++#define DQUOT_CMD_PREALLOC	1
++#define DQUOT_CMD_CHECK		12
++#define DQUOT_CMD_FORCE		13
++
++extern struct semaphore vz_quota_sem;
++void inode_qmblk_lock(struct super_block *sb);
++void inode_qmblk_unlock(struct super_block *sb);
++void qmblk_data_read_lock(struct vz_quota_master *qmblk);
++void qmblk_data_read_unlock(struct vz_quota_master *qmblk);
++void qmblk_data_write_lock(struct vz_quota_master *qmblk);
++void qmblk_data_write_unlock(struct vz_quota_master *qmblk);
++
++/* for quota operations */
++void vzquota_inode_init_call(struct inode *inode);
++void vzquota_inode_drop_call(struct inode *inode);
++int vzquota_inode_transfer_call(struct inode *, struct iattr *);
++struct vz_quota_master *vzquota_inode_data(struct inode *inode,
++		struct vz_quota_datast *);
++void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *);
++int vzquota_rename_check(struct inode *inode,
++		struct inode *old_dir, struct inode *new_dir);
++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode);
++/* for second-level quota */
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
++/* for management operations */
++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
++		struct vz_quota_stat *qstat);
++void vzquota_free_master(struct vz_quota_master *);
++struct vz_quota_master *vzquota_find_master(unsigned int quota_id);
++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
++		struct vz_quota_master *qmblk, char __user *buf);
++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
++		char __user *buf, int force);
++int vzquota_get_super(struct super_block *sb);
++void vzquota_put_super(struct super_block *sb);
++
++static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk)
++{
++	if (!atomic_read(&qmblk->dq_count))
++		BUG();
++	atomic_inc(&qmblk->dq_count);
++	return qmblk;
++}
++
++static inline void __qmblk_put(struct vz_quota_master *qmblk)
++{
++	atomic_dec(&qmblk->dq_count);
++}
++
++static inline void qmblk_put(struct vz_quota_master *qmblk)
++{
++	if (!atomic_dec_and_test(&qmblk->dq_count))
++		return;
++	vzquota_free_master(qmblk);
++}
++
++extern struct list_head vzquota_hash_table[];
++extern int vzquota_hash_size;
++
++/*
++ * Interface to VZ UGID quota
++ */
++extern struct quotactl_ops vz_quotactl_operations;
++extern struct dquot_operations vz_quota_operations2;
++extern struct quota_format_type vz_quota_empty_v2_format;
++
++#define QUGID_TREE(qmblk, type)	(((type) == USRQUOTA) ?		\
++					qmblk->dq_uid_tree :	\
++					qmblk->dq_gid_tree)
++
++#define VZDQUG_FIND_DONT_ALLOC	1
++#define VZDQUG_FIND_FAKE	2
++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
++		unsigned int quota_id, int type, int flags);
++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
++		unsigned int quota_id, int type, int flags);
++struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid);
++void vzquota_put_ugid(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid);
++void vzquota_kill_ugid(struct vz_quota_master *qmblk);
++int vzquota_ugid_init(void);
++void vzquota_ugid_release(void);
++int vzquota_transfer_usage(struct inode *inode, int mask,
++		struct vz_quota_ilink *qlnk);
++void vzquota_inode_off(struct inode *inode);
++
++long do_vzquotaugidctl(int cmd, unsigned int quota_id,
++		unsigned int ugid_index, unsigned int ugid_size,
++		void *addr, int compat);
++
++/*
++ * Other VZ quota parts
++ */
++extern struct dquot_operations vz_quota_operations;
++
++long do_vzquotactl(int cmd, unsigned int quota_id,
++		struct vz_quota_stat __user *qstat, const char __user *ve_root,
++		int compat);
++int vzquota_proc_init(void);
++void vzquota_proc_release(void);
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
++extern struct semaphore vz_quota_sem;
++
++void vzaquota_init(void);
++void vzaquota_fini(void);
++
++#endif /* __KERNEL__ */
++
++#endif /* _VZDQUOTA_H */
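qmblk_get()/qmblk_put() give the master block plain atomic refcounting: qmblk_get() BUGs on a zero count to catch revived references, and the final qmblk_put() frees the block through vzquota_free_master(). A sketch of the expected discipline, assuming vzquota_find_master() is called under vz_quota_sem and returns an unreferenced block (its implementation appears later in this patch):

	struct vz_quota_master *qmblk;

	down(&vz_quota_sem);
	qmblk = vzquota_find_master(quota_id);
	if (qmblk != NULL)
		qmblk_get(qmblk);		/* pin it while in use */
	up(&vz_quota_sem);

	if (qmblk == NULL)
		return -ENOENT;

	qmblk_data_read_lock(qmblk);
	/* ... read qmblk->dq_stat ... */
	qmblk_data_read_unlock(qmblk);

	qmblk_put(qmblk);	/* frees the block on the last reference */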
+diff --git a/include/linux/vzquota_qlnk.h b/include/linux/vzquota_qlnk.h
+new file mode 100644
+index 0000000..2788c41
+--- /dev/null
++++ b/include/linux/vzquota_qlnk.h
+@@ -0,0 +1,25 @@
++/*
++ *  include/linux/vzquota_qlnk.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZDQUOTA_QLNK_H
++#define _VZDQUOTA_QLNK_H
++
++struct vz_quota_master;
++struct vz_quota_ugid;
++
++/* inode link, used to track inodes using quota via dq_ilink_list */
++struct vz_quota_ilink {
++	struct vz_quota_master *qmblk;
++	struct vz_quota_ugid *qugid[MAXQUOTAS];
++	struct list_head list;
++	unsigned char origin[2];
++};
++
++#endif /* _VZDQUOTA_QLNK_H */
+diff --git a/include/linux/vzratelimit.h b/include/linux/vzratelimit.h
+new file mode 100644
+index 0000000..f26baad
+--- /dev/null
++++ b/include/linux/vzratelimit.h
+@@ -0,0 +1,28 @@
++/*
++ *  include/linux/vzratelimit.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VZ_RATELIMIT_H__
++#define __VZ_RATELIMIT_H__
++
++/*
++ * Generic rate-limiting helpers.
++ */
++
++struct vz_rate_info {
++	int burst;
++	int interval; /* jiffy_t per event */
++	int bucket; /* kind of leaky bucket */
++	unsigned long last; /* last event */
++};
++
++/* Return true if rate limit permits. */
++int vz_ratelimit(struct vz_rate_info *p);
++
++#endif /* __VZ_RATELIMIT_H__ */
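The fields of struct vz_rate_info describe a classic leaky (token) bucket: credit accrues at one event per `interval` ticks, capped at `burst`, and each permitted event spends one unit. The real vz_ratelimit() lives elsewhere in this patch; the following is only a runnable userspace sketch of that scheme under those assumptions:

#include <stdio.h>

struct rate_info {
	int burst;		/* bucket capacity, in events */
	int interval;		/* ticks needed to earn one event */
	int bucket;		/* events currently available */
	unsigned long last;	/* tick of the last refill */
};

/* Returns 1 if the event is permitted, 0 if it is rate limited. */
static int ratelimit_sketch(struct rate_info *p, unsigned long now)
{
	unsigned long earned = (now - p->last) / p->interval;

	if (earned >= (unsigned long)p->burst) {
		p->bucket = p->burst;	/* long quiet period: full refill */
		p->last = now;
	} else if (earned > 0) {
		p->bucket += (int)earned;
		if (p->bucket > p->burst)
			p->bucket = p->burst;
		p->last += earned * p->interval; /* keep fractional credit */
	}
	if (p->bucket > 0) {
		p->bucket--;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct rate_info ri = { .burst = 2, .interval = 10,
				.bucket = 2, .last = 0 };
	unsigned long t;

	/* Two events pass immediately, then one per 10 ticks. */
	for (t = 0; t <= 40; t += 5)
		printf("t=%2lu permitted=%d\n", t, ratelimit_sketch(&ri, t));
	return 0;
}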
+diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
+new file mode 100644
+index 0000000..5c23ea4
+--- /dev/null
++++ b/include/linux/vzstat.h
+@@ -0,0 +1,182 @@
++/*
++ *  include/linux/vzstat.h
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VZSTAT_H__
++#define __VZSTAT_H__
++
++struct swap_cache_info_struct {
++	unsigned long add_total;
++	unsigned long del_total;
++	unsigned long find_success;
++	unsigned long find_total;
++	unsigned long noent_race;
++	unsigned long exist_race;
++	unsigned long remove_race;
++};
++
++struct kstat_lat_snap_struct {
++	cycles_t maxlat, totlat;
++	unsigned long count;
++};
++struct kstat_lat_pcpu_snap_struct {
++	cycles_t maxlat, totlat;
++	unsigned long count;
++	seqcount_t lock;
++} ____cacheline_aligned_in_smp;
++
++struct kstat_lat_struct {
++	struct kstat_lat_snap_struct cur, last;
++	cycles_t avg[3];
++};
++struct kstat_lat_pcpu_struct {
++	struct kstat_lat_pcpu_snap_struct cur[NR_CPUS];
++	cycles_t max_snap;
++	struct kstat_lat_snap_struct last;
++	cycles_t avg[3];
++};
++
++struct kstat_perf_snap_struct {
++	cycles_t wall_tottime, cpu_tottime;
++	cycles_t wall_maxdur, cpu_maxdur;
++	unsigned long count;
++};
++struct kstat_perf_struct {
++	struct kstat_perf_snap_struct cur, last;
++};
++
++struct kstat_zone_avg {
++	unsigned long		free_pages_avg[3],
++				nr_active_avg[3],
++				nr_inactive_avg[3];
++};
++
++#define KSTAT_ALLOCSTAT_NR 5
++
++struct kernel_stat_glob {
++	unsigned long nr_unint_avg[3];
++
++	unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
++	struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
++	struct kstat_lat_pcpu_struct sched_lat;
++	struct kstat_lat_struct swap_in;
++
++	struct kstat_perf_struct ttfp, cache_reap,
++			refill_inact, shrink_icache, shrink_dcache;
++
++	struct kstat_zone_avg zone_avg[3];	/* MAX_NR_ZONES */
++} ____cacheline_aligned;
++
++extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
++extern spinlock_t kstat_glb_lock;
++
++#ifdef CONFIG_VE
++#define KSTAT_PERF_ENTER(name)				\
++	unsigned long flags;				\
++	cycles_t start, sleep_time;			\
++							\
++	start = get_cycles();				\
++	sleep_time = VE_TASK_INFO(current)->sleep_time;	\
++
++#define KSTAT_PERF_LEAVE(name)				\
++	spin_lock_irqsave(&kstat_glb_lock, flags);	\
++	kstat_glob.name.cur.count++;			\
++	start = get_cycles() - start;			\
++	if (kstat_glob.name.cur.wall_maxdur < start)	\
++		kstat_glob.name.cur.wall_maxdur = start;\
++	kstat_glob.name.cur.wall_tottime += start;	\
++	start -= VE_TASK_INFO(current)->sleep_time -	\
++					sleep_time;	\
++	if (kstat_glob.name.cur.cpu_maxdur < start)	\
++		kstat_glob.name.cur.cpu_maxdur = start;	\
++	kstat_glob.name.cur.cpu_tottime += start;	\
++	spin_unlock_irqrestore(&kstat_glb_lock, flags);	\
++
++#else
++#define KSTAT_PERF_ENTER(name)
++#define KSTAT_PERF_LEAVE(name)
++#endif
++
++/*
++ * Add another statistics reading.
++ * Serialization is the caller's due.
++ */
++static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
++		cycles_t dur)
++{
++	p->cur.count++;
++	if (p->cur.maxlat < dur)
++		p->cur.maxlat = dur;
++	p->cur.totlat += dur;
++}
++
++static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
++		cycles_t dur)
++{
++	struct kstat_lat_pcpu_snap_struct *cur;
++
++	cur = &p->cur[cpu];
++	write_seqcount_begin(&cur->lock);
++	cur->count++;
++	if (cur->maxlat < dur)
++		cur->maxlat = dur;
++	cur->totlat += dur;
++	write_seqcount_end(&cur->lock);
++}
++
++/*
++ * Move current statistics to last, clear last.
++ * Serialization is the caller's due.
++ */
++static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
++{
++	cycles_t m;
++	memcpy(&p->last, &p->cur, sizeof(p->last));
++	p->cur.maxlat = 0;
++	m = p->last.maxlat;
++	CALC_LOAD(p->avg[0], EXP_1, m)
++	CALC_LOAD(p->avg[1], EXP_5, m)
++	CALC_LOAD(p->avg[2], EXP_15, m)
++}
++
++static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
++{
++	unsigned i, cpu;
++	struct kstat_lat_pcpu_snap_struct snap, *cur;
++	cycles_t m;
++
++	memset(&p->last, 0, sizeof(p->last));
++	for (cpu = 0; cpu < NR_CPUS; cpu++) {
++		cur = &p->cur[cpu];
++		do {
++			i = read_seqcount_begin(&cur->lock);
++			memcpy(&snap, cur, sizeof(snap));
++		} while (read_seqcount_retry(&cur->lock, i));
++		/* 
++		 * The read above and this update of maxlat are not atomic,
++		 * but that is OK: it happens rarely, and losing a couple
++		 * of peaks is not essential. xemul
++		 */
++		cur->maxlat = 0;
++
++		p->last.count += snap.count;
++		p->last.totlat += snap.totlat;
++		if (p->last.maxlat < snap.maxlat)
++			p->last.maxlat = snap.maxlat;
++	}
++
++	m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
++	CALC_LOAD(p->avg[0], EXP_1, m);
++	CALC_LOAD(p->avg[1], EXP_5, m);
++	CALC_LOAD(p->avg[2], EXP_15, m);
++	/* reset max_snap to calculate it correctly next time */
++	p->max_snap = 0;
++}
++
++#endif /* __VZSTAT_H__ */
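The latency helpers come in two flavours: KSTAT_LAT_PCPU_ADD() writes into a per-cpu, seqcount-protected slot that the updater above reads without a global lock, while KSTAT_LAT_ADD() leaves serialization entirely to the caller. A sketch of recording one sample each way (the choice of kstat_glob fields is illustrative):

	cycles_t start, dur;
	int cpu;

	start = get_cycles();
	/* ... the operation being timed ... */
	dur = get_cycles() - start;

	/* Per-cpu variant: only needs a stable cpu number. */
	cpu = get_cpu();
	KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, cpu, dur);
	put_cpu();

	/* Plain variant: the caller provides the serialization. */
	spin_lock_irq(&kstat_glb_lock);
	KSTAT_LAT_ADD(&kstat_glob.swap_in, dur);
	spin_unlock_irq(&kstat_glb_lock);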
+diff --git a/include/net/addrconf.h b/include/net/addrconf.h
+index bbd3d58..22e57e7 100644
+--- a/include/net/addrconf.h
++++ b/include/net/addrconf.h
+@@ -258,5 +258,9 @@ extern int if6_proc_init(void);
+ extern void if6_proc_exit(void);
+ #endif
+ 
++int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
++		unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
++		__u32 valid_lft);
++
+ #endif
+ #endif
+diff --git a/include/net/af_unix.h b/include/net/af_unix.h
+index 2dfa96b..10183b3 100644
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -9,6 +9,7 @@
+ extern void unix_inflight(struct file *fp);
+ extern void unix_notinflight(struct file *fp);
+ extern void unix_gc(void);
++extern void unix_destruct_fds(struct sk_buff *skb);
+ 
+ #define UNIX_HASH_SIZE	256
+ 
+diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
+index a5c6ccc..c2bb5ca 100644
+--- a/include/net/fib_rules.h
++++ b/include/net/fib_rules.h
+@@ -62,7 +62,7 @@ struct fib_rules_ops
+ 
+ 	/* Called after modifications to the rules set, must flush
+ 	 * the route cache if one exists. */
+-	void			(*flush_cache)(void);
++	void			(*flush_cache)(struct fib_rules_ops *ops);
+ 
+ 	int			nlgroup;
+ 	const struct nla_policy	*policy;
+diff --git a/include/net/flow.h b/include/net/flow.h
+index ad16e00..bcf2002 100644
+--- a/include/net/flow.h
++++ b/include/net/flow.h
+@@ -10,6 +10,7 @@
+ #include <linux/in6.h>
+ #include <asm/atomic.h>
+ 
++struct ve_struct;
+ struct flowi {
+ 	int	oif;
+ 	int	iif;
+@@ -76,6 +77,9 @@ struct flowi {
+ #define fl_icmp_code	uli_u.icmpt.code
+ #define fl_ipsec_spi	uli_u.spi
+ #define fl_mh_type	uli_u.mht.type
++#ifdef CONFIG_VE
++	struct ve_struct *owner_env;
++#endif
+ 	__u32           secid;	/* used by xfrm; see secid.txt */
+ } __attribute__((__aligned__(BITS_PER_LONG/8)));
+ 
+diff --git a/include/net/icmp.h b/include/net/icmp.h
+index dddb839..c0362a4 100644
+--- a/include/net/icmp.h
++++ b/include/net/icmp.h
+@@ -31,15 +31,24 @@ struct icmp_err {
+ extern struct icmp_err icmp_err_convert[];
+ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+ DECLARE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics);
+-#define ICMP_INC_STATS(field)		SNMP_INC_STATS(icmp_statistics, field)
+-#define ICMP_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmp_statistics, field)
+-#define ICMP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmp_statistics, field)
+-#define ICMPMSGOUT_INC_STATS(field)	SNMP_INC_STATS(icmpmsg_statistics, field+256)
+-#define ICMPMSGOUT_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmpmsg_statistics, field+256)
+-#define ICMPMSGOUT_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmpmsg_statistics, field+256)
+-#define ICMPMSGIN_INC_STATS(field)	SNMP_INC_STATS(icmpmsg_statistics, field)
+-#define ICMPMSGIN_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmpmsg_statistics, field)
+-#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field)
++
++#if defined(CONFIG_VE) && defined(CONFIG_INET)
++#define ve_icmp_statistics (get_exec_env()->_icmp_statistics)
++#define ve_icmpmsg_statistics (get_exec_env()->_icmpmsg_statistics)
++#else
++#define ve_icmp_statistics icmp_statistics
++#define ve_icmpmsg_statistics icmpmsg_statistics
++#endif
++
++#define ICMP_INC_STATS(field)		SNMP_INC_STATS(ve_icmp_statistics, field)
++#define ICMP_INC_STATS_BH(field)	SNMP_INC_STATS_BH(ve_icmp_statistics, field)
++#define ICMP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ve_icmp_statistics, field)
++#define ICMPMSGOUT_INC_STATS(field)	SNMP_INC_STATS(ve_icmpmsg_statistics, field+256)
++#define ICMPMSGOUT_INC_STATS_BH(field)	SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field+256)
++#define ICMPMSGOUT_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field+256)
++#define ICMPMSGIN_INC_STATS(field)	SNMP_INC_STATS(ve_icmpmsg_statistics, field)
++#define ICMPMSGIN_INC_STATS_BH(field)	SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field)
++#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field)
+ 
+ struct dst_entry;
+ struct net_proto_family;
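This redirection pattern recurs for the ip, tcp and ipv6 statistics below: the SNMP macros keep their names and call sites, but the MIB pointer they expand to is resolved through get_exec_env(), i.e. the calling task's container, and falls back to the global arrays on kernels without CONFIG_VE. Schematically:

	/* before:  ICMP_INC_STATS(f) -> SNMP_INC_STATS(icmp_statistics, f)
	 * after:   ICMP_INC_STATS(f) -> SNMP_INC_STATS(ve_icmp_statistics, f)
	 *          where ve_icmp_statistics is
	 *            get_exec_env()->_icmp_statistics   with CONFIG_VE+INET
	 *            icmp_statistics                    otherwise
	 */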
+diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
+index 62a5b69..4b5375b 100644
+--- a/include/net/inet6_hashtables.h
++++ b/include/net/inet6_hashtables.h
+@@ -29,9 +29,10 @@ struct inet_hashinfo;
+ 
+ /* I have no idea if this is a good hash for v6 or not. -DaveM */
+ static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport,
+-				const struct in6_addr *faddr, const __be16 fport)
++				const struct in6_addr *faddr, const __be16 fport,
++				const envid_t veid)
+ {
+-	u32 ports = (lport ^ (__force u16)fport);
++	u32 ports = (lport ^ (__force u16)fport) ^ (veid ^ (veid >> 16));
+ 
+ 	return jhash_3words((__force u32)laddr->s6_addr32[3],
+ 			    (__force u32)faddr->s6_addr32[3],
+@@ -46,7 +47,7 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
+ 	const struct in6_addr *faddr = &np->daddr;
+ 	const __u16 lport = inet->num;
+ 	const __be16 fport = inet->dport;
+-	return inet6_ehashfn(laddr, lport, faddr, fport);
++	return inet6_ehashfn(laddr, lport, faddr, fport, VEID(sk->owner_env));
+ }
+ 
+ extern void __inet6_hash(struct sock *sk);
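Folding `veid ^ (veid >> 16)` into the ports word keeps identical 4-tuples owned by different containers out of the same hash chain and makes bucket positions container-dependent. A runnable toy that isolates just the folding; the real code hashes with jhash_3words(), replaced here by an arbitrary stand-in mixer:

#include <stdio.h>
#include <stdint.h>

/* Toy stand-in for jhash_3words(); only the veid folding matters here. */
static uint32_t mix(uint32_t a, uint32_t b, uint32_t c)
{
	return a * 2654435761u ^ b * 2246822519u ^ c;
}

static uint32_t ehashfn(uint32_t laddr, uint16_t lport,
			uint32_t faddr, uint16_t fport, uint32_t veid)
{
	uint32_t ports = (lport ^ fport) ^ (veid ^ (veid >> 16));

	return mix(laddr, faddr, ports);
}

int main(void)
{
	/* Same 4-tuple seen from two containers: different hash values. */
	printf("%u\n", ehashfn(0x7f000001, 80, 0x0a000001, 4242, 101));
	printf("%u\n", ehashfn(0x7f000001, 80, 0x0a000001, 4242, 102));
	return 0;
}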
+diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
+index e081eef..7a554cc 100644
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -15,6 +15,9 @@ struct netns_frags {
+ struct inet_frag_queue {
+ 	struct hlist_node	list;
+ 	struct netns_frags	*net;
++#ifdef CONFIG_VE
++	struct ve_struct	*owner_ve;
++#endif
+ 	struct list_head	lru_list;   /* lru list member */
+ 	spinlock_t		lock;
+ 	atomic_t		refcnt;
+diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
+index 735b926..30a2741 100644
+--- a/include/net/inet_hashtables.h
++++ b/include/net/inet_hashtables.h
+@@ -74,6 +74,7 @@ struct inet_ehash_bucket {
+  * ports are created in O(1) time?  I thought so. ;-)	-DaveM
+  */
+ struct inet_bind_bucket {
++	struct ve_struct	*owner_env;
+ 	struct net		*ib_net;
+ 	unsigned short		port;
+ 	signed short		fastreuse;
+@@ -197,27 +198,29 @@ extern struct inet_bind_bucket *
+ 		    inet_bind_bucket_create(struct kmem_cache *cachep,
+ 					    struct net *net,
+ 					    struct inet_bind_hashbucket *head,
+-					    const unsigned short snum);
++					    const unsigned short snum,
++					    struct ve_struct *env);
+ extern void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+ 				     struct inet_bind_bucket *tb);
+ 
+-static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
++static inline int inet_bhashfn(const __u16 lport, const int bhash_size,
++		unsigned veid)
+ {
+-	return lport & (bhash_size - 1);
++	return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1));
+ }
+ 
+ extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ 			   const unsigned short snum);
+ 
+ /* These can have wildcards, don't try too hard. */
+-static inline int inet_lhashfn(const unsigned short num)
++static inline int inet_lhashfn(const unsigned short num, unsigned veid)
+ {
+-	return num & (INET_LHTABLE_SIZE - 1);
++	return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1));
+ }
+ 
+ static inline int inet_sk_listen_hashfn(const struct sock *sk)
+ {
+-	return inet_lhashfn(inet_sk(sk)->num);
++	return inet_lhashfn(inet_sk(sk)->num, VEID(sk->owner_env));
+ }
+ 
+ /* Caller must disable local BH processing. */
+@@ -372,7 +375,8 @@ static inline struct sock *inet_lookup(struct net *net,
+ extern int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 		struct sock *sk, u32 port_offset,
+ 		int (*check_established)(struct inet_timewait_death_row *,
+-			struct sock *, __u16, struct inet_timewait_sock **),
++			struct sock *, __u16, struct inet_timewait_sock **,
++			struct ve_struct *),
+ 			       void (*hash)(struct sock *sk));
+ extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 			     struct sock *sk);
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index 9fabe5b..1ea3392 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -172,12 +172,13 @@ extern u32 inet_ehash_secret;
+ extern void build_ehash_secret(void);
+ 
+ static inline unsigned int inet_ehashfn(const __be32 laddr, const __u16 lport,
+-					const __be32 faddr, const __be16 fport)
++					const __be32 faddr, const __be16 fport,
++					const envid_t veid)
+ {
+ 	return jhash_3words((__force __u32) laddr,
+ 			    (__force __u32) faddr,
+ 			    ((__u32) lport) << 16 | (__force __u32)fport,
+-			    inet_ehash_secret);
++			    inet_ehash_secret ^ (veid ^ (veid >> 16)));
+ }
+ 
+ static inline int inet_sk_ehashfn(const struct sock *sk)
+@@ -187,8 +188,9 @@ static inline int inet_sk_ehashfn(const struct sock *sk)
+ 	const __u16 lport = inet->num;
+ 	const __be32 faddr = inet->daddr;
+ 	const __be16 fport = inet->dport;
++	envid_t veid = VEID(sk->owner_env);
+ 
+-	return inet_ehashfn(laddr, lport, faddr, fport);
++	return inet_ehashfn(laddr, lport, faddr, fport, veid);
+ }
+ 
+ 
+diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
+index 95c660c..3102e7b 100644
+--- a/include/net/inet_timewait_sock.h
++++ b/include/net/inet_timewait_sock.h
+@@ -81,6 +81,7 @@ struct inet_timewait_death_row {
+ 	struct inet_hashinfo 	*hashinfo;
+ 	int			sysctl_tw_recycle;
+ 	int			sysctl_max_tw_buckets;
++	int			ub_managed;
+ };
+ 
+ extern void inet_twdr_hangman(unsigned long data);
+@@ -134,6 +135,7 @@ struct inet_timewait_sock {
+ 	unsigned long		tw_ttd;
+ 	struct inet_bind_bucket	*tw_tb;
+ 	struct hlist_node	tw_death_node;
++	envid_t			tw_owner_env;
+ };
+ 
+ static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
+diff --git a/include/net/ip.h b/include/net/ip.h
+index 3b40bc2..3a8d5f4 100644
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -157,16 +157,31 @@ struct ipv4_config
+ 
+ extern struct ipv4_config ipv4_config;
+ DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+-#define IP_INC_STATS(field)		SNMP_INC_STATS(ip_statistics, field)
+-#define IP_INC_STATS_BH(field)		SNMP_INC_STATS_BH(ip_statistics, field)
+-#define IP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ip_statistics, field)
+-#define IP_ADD_STATS_BH(field, val)	SNMP_ADD_STATS_BH(ip_statistics, field, val)
++
++#ifdef CONFIG_VE
++#define ve_ip_statistics (get_exec_env()->_ip_statistics)
++#else
++#define ve_ip_statistics ip_statistics
++#endif
++#define IP_INC_STATS(field)		SNMP_INC_STATS(ve_ip_statistics, field)
++#define IP_INC_STATS_BH(field)		SNMP_INC_STATS_BH(ve_ip_statistics, field)
++#define IP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ve_ip_statistics, field)
++#define IP_ADD_STATS_BH(field, val)	SNMP_ADD_STATS_BH(ve_ip_statistics, field, val)
++
+ DECLARE_SNMP_STAT(struct linux_mib, net_statistics);
+-#define NET_INC_STATS(field)		SNMP_INC_STATS(net_statistics, field)
+-#define NET_INC_STATS_BH(field)		SNMP_INC_STATS_BH(net_statistics, field)
+-#define NET_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(net_statistics, field)
+-#define NET_ADD_STATS_BH(field, adnd)	SNMP_ADD_STATS_BH(net_statistics, field, adnd)
+-#define NET_ADD_STATS_USER(field, adnd)	SNMP_ADD_STATS_USER(net_statistics, field, adnd)
++#if defined(CONFIG_VE) && defined(CONFIG_INET)
++#define ve_net_statistics (get_exec_env()->_net_statistics)
++
++extern int init_ipv4_mibs(void);
++extern void cleanup_ipv4_mibs(void);
++#else
++#define ve_net_statistics net_statistics
++#endif
++#define NET_INC_STATS(field)		SNMP_INC_STATS(ve_net_statistics, field)
++#define NET_INC_STATS_BH(field)		SNMP_INC_STATS_BH(ve_net_statistics, field)
++#define NET_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ve_net_statistics, field)
++#define NET_ADD_STATS_BH(field, adnd)	SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd)
++#define NET_ADD_STATS_USER(field, adnd)	SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd)
+ 
+ extern unsigned long snmp_fold_field(void *mib[], int offt);
+ extern int snmp_mib_init(void *ptr[2], size_t mibsize);
+diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
+index 7c5c0f7..6d549ac 100644
+--- a/include/net/ip6_fib.h
++++ b/include/net/ip6_fib.h
+@@ -156,6 +156,7 @@ struct fib6_table {
+ 	u32			tb6_id;
+ 	rwlock_t		tb6_lock;
+ 	struct fib6_node	tb6_root;
++	struct ve_struct	*owner_env;
+ };
+ 
+ #define RT6_TABLE_UNSPEC	RT_TABLE_UNSPEC
+diff --git a/include/net/ipv6.h b/include/net/ipv6.h
+index f422f72..185a018 100644
+--- a/include/net/ipv6.h
++++ b/include/net/ipv6.h
+@@ -117,7 +117,7 @@ extern struct ctl_path net_ipv6_ctl_path[];
+ 	struct inet6_dev *_idev = (idev);				\
+ 	if (likely(_idev != NULL))					\
+ 		SNMP_INC_STATS##modifier((_idev)->stats.statname, (field)); \
+-	SNMP_INC_STATS##modifier(statname##_statistics, (field));	\
++	SNMP_INC_STATS##modifier(ve_##statname##_statistics, (field));	\
+ })
+ 
+ #define _DEVADD(statname, modifier, idev, field, val)			\
+@@ -125,9 +125,22 @@ extern struct ctl_path net_ipv6_ctl_path[];
+ 	struct inet6_dev *_idev = (idev);				\
+ 	if (likely(_idev != NULL))					\
+ 		SNMP_ADD_STATS##modifier((_idev)->stats.statname, (field), (val)); \
+-	SNMP_ADD_STATS##modifier(statname##_statistics, (field), (val));\
++	SNMP_ADD_STATS##modifier(ve_##statname##_statistics, (field), (val));\
+ })
+ 
++#ifdef CONFIG_VE
++#define ve_ipv6_statistics	(get_exec_env()->_ipv6_statistics)
++#define ve_icmpv6_statistics	(get_exec_env()->_icmpv6_statistics)
++#define ve_icmpv6msg_statistics	(get_exec_env()->_icmpv6msg_statistics)
++
++extern int init_ipv6_mibs(void);
++extern void cleanup_ipv6_mibs(void);
++#else
++#define ve_ipv6_statistics	ipv6_statistics
++#define ve_icmpv6_statistics	icmpv6_statistics
++#define ve_icmpv6msg_statistics	icmpv6msg_statistics
++#endif
++
+ /* MIBs */
+ DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+ 
+diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
+index d9dd0f7..eca50f8 100644
+--- a/include/net/net_namespace.h
++++ b/include/net/net_namespace.h
+@@ -45,6 +45,13 @@ struct net {
+ 	struct hlist_head 	*dev_name_head;
+ 	struct hlist_head	*dev_index_head;
+ 
++	int			ifindex;
++
++#ifdef CONFIG_VE
++	struct completion	*sysfs_completion;
++	struct ve_struct	*owner_ve;
++#endif
++
+ 	/* core fib_rules */
+ 	struct list_head	rules_ops;
+ 	spinlock_t		rules_mod_lock;
+diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+index 9bf0598..c94a355 100644
+--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
++++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+@@ -18,8 +18,18 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
+ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
+ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
+ 
++#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ extern int nf_conntrack_ipv4_compat_init(void);
+ extern void nf_conntrack_ipv4_compat_fini(void);
++#else
++static inline int nf_conntrack_ipv4_compat_init(void)
++{
++	return 0;
++}
++static inline void nf_conntrack_ipv4_compat_fini(void)
++{
++}
++#endif
+ 
+ extern void need_ipv4_conntrack(void);
+ 
+diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
+index 2dbd6c0..9563d32 100644
+--- a/include/net/netfilter/nf_conntrack.h
++++ b/include/net/netfilter/nf_conntrack.h
+@@ -28,6 +28,10 @@
+ 
+ #include <net/netfilter/nf_conntrack_tuple.h>
+ 
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/ve.h>
++#endif
++
+ /* per conntrack: protocol private data */
+ union nf_conntrack_proto {
+ 	/* insert conntrack proto private data here */
+@@ -131,6 +135,10 @@ struct nf_conn
+ 	struct nf_ct_ext *ext;
+ 
+ 	struct rcu_head rcu;
++
++#ifdef CONFIG_VE_IPTABLES
++	struct ve_struct *ct_owner_env;
++#endif
+ };
+ 
+ static inline struct nf_conn *
+@@ -194,6 +202,11 @@ extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+ 
+ extern void nf_conntrack_flush(void);
+ 
++struct nf_conntrack_helper * nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple);
++void nf_ct_helper_put(struct nf_conntrack_helper *helper);
++
++struct nf_conntrack_helper * __nf_conntrack_helper_find_byname(const char *name);
++
+ extern bool nf_ct_get_tuplepr(const struct sk_buff *skb,
+ 			      unsigned int nhoff, u_int16_t l3num,
+ 			      struct nf_conntrack_tuple *tuple);
+@@ -239,7 +252,8 @@ nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data);
+ extern void nf_conntrack_free(struct nf_conn *ct);
+ extern struct nf_conn *
+ nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+-		   const struct nf_conntrack_tuple *repl);
++		   const struct nf_conntrack_tuple *repl,
++		   struct user_beancounter *);
+ 
+ /* It's confirmed if it is, or has been in the hash table. */
+ static inline int nf_ct_is_confirmed(struct nf_conn *ct)
+@@ -262,6 +276,8 @@ extern unsigned int nf_conntrack_htable_size;
+ extern int nf_conntrack_checksum;
+ extern atomic_t nf_conntrack_count;
+ extern int nf_conntrack_max;
++extern int nf_conntrack_disable_ve0;
++extern int ip_conntrack_disable_ve0;
+ 
+ DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
+ #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++)
+diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
+index a817712..30831ef 100644
+--- a/include/net/netfilter/nf_conntrack_core.h
++++ b/include/net/netfilter/nf_conntrack_core.h
+@@ -52,6 +52,42 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple);
+ 
+ extern int __nf_conntrack_confirm(struct sk_buff *skb);
+ 
++#if defined(CONFIG_VE_IPTABLES)
++#include <linux/sched.h>
++#define ve_nf_conntrack_hash	(get_exec_env()->_nf_conntrack->_nf_conntrack_hash)
++#define ve_nf_conntrack_vmalloc	(get_exec_env()->_nf_conntrack->_nf_conntrack_vmalloc)
++#define ve_unconfirmed		(get_exec_env()->_nf_conntrack->_unconfirmed)
++#else
++#define ve_nf_conntrack_hash		nf_conntrack_hash
++#define ve_nf_conntrack_vmalloc		nf_conntrack_vmalloc
++#define ve_unconfirmed			unconfirmed
++#endif /* CONFIG_VE_IPTABLES */
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++#define ve_nf_ct_sysctl_header		\
++		(get_exec_env()->_nf_conntrack->_nf_ct_sysctl_header)
++#define ve_nf_ct_sysctl_table		\
++		(get_exec_env()->_nf_conntrack->_nf_ct_sysctl_table)
++#define ve_nf_ct_netfilter_table	\
++		(get_exec_env()->_nf_conntrack->_nf_ct_netfilter_table)
++#define ve_nf_ct_net_table		\
++		(get_exec_env()->_nf_conntrack->_nf_ct_net_table)
++extern void nf_ct_proto_generic_sysctl_cleanup(void);
++extern int nf_ct_proto_generic_sysctl_init(void);
++#else
++#define ve_nf_ct_sysctl_header		nf_ct_sysctl_header
++#define ve_nf_ct_sysctl_table		nf_ct_sysctl_table
++#define ve_nf_ct_netfilter_table	nf_ct_netfilter_table
++#define ve_nf_ct_net_table		nf_ct_net_table
++static inline int nf_ct_proto_generic_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_generic_sysctl_cleanup(void)
++{
++}
++#endif /* CONFIG_VE_IPTABLES */
++
+ /* Confirm a connection: returns NF_DROP if packet must be dropped. */
+ static inline int nf_conntrack_confirm(struct sk_buff *skb)
+ {
+@@ -71,7 +107,9 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+             const struct nf_conntrack_l3proto *l3proto,
+             const struct nf_conntrack_l4proto *proto);
+ 
++#ifndef CONFIG_VE_IPTABLES
+ extern struct hlist_head *nf_conntrack_hash;
++#endif
+ extern spinlock_t nf_conntrack_lock ;
+ extern struct hlist_head unconfirmed;
+ 
+diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
+index f0b9078..4bcf1bd 100644
+--- a/include/net/netfilter/nf_conntrack_ecache.h
++++ b/include/net/netfilter/nf_conntrack_ecache.h
+@@ -34,6 +34,9 @@ nf_conntrack_event_cache(enum ip_conntrack_events event,
+ 	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+ 	struct nf_conntrack_ecache *ecache;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return;
++
+ 	local_bh_disable();
+ 	ecache = &__get_cpu_var(nf_conntrack_ecache);
+ 	if (ct != ecache->ct)
+@@ -45,7 +48,7 @@ nf_conntrack_event_cache(enum ip_conntrack_events event,
+ static inline void nf_conntrack_event(enum ip_conntrack_events event,
+ 				      struct nf_conn *ct)
+ {
+-	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
++	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && ve_is_super(get_exec_env()))
+ 		atomic_notifier_call_chain(&nf_conntrack_chain, event, ct);
+ }
+ 
+@@ -57,7 +60,8 @@ static inline void
+ nf_ct_expect_event(enum ip_conntrack_expect_events event,
+ 		   struct nf_conntrack_expect *exp)
+ {
+-	atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp);
++	if (ve_is_super(get_exec_env()))
++		atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp);
+ }
+ 
+ #else /* CONFIG_NF_CONNTRACK_EVENTS */
+diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
+index dfdf4b4..4175cdf 100644
+--- a/include/net/netfilter/nf_conntrack_expect.h
++++ b/include/net/netfilter/nf_conntrack_expect.h
+@@ -6,9 +6,17 @@
+ #define _NF_CONNTRACK_EXPECT_H
+ #include <net/netfilter/nf_conntrack.h>
+ 
+-extern struct hlist_head *nf_ct_expect_hash;
+ extern unsigned int nf_ct_expect_hsize;
+ extern unsigned int nf_ct_expect_max;
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_nf_ct_expect_hash	(get_exec_env()->_nf_conntrack->_nf_ct_expect_hash)
++#define ve_nf_ct_expect_max	(get_exec_env()->_nf_conntrack->_nf_ct_expect_max)
++#else
++extern struct hlist_head *nf_ct_expect_hash;
++#define ve_nf_ct_expect_hash	nf_ct_expect_hash
++#define ve_nf_ct_expect_max	nf_ct_expect_max
++#endif
+ 
+ struct nf_conntrack_expect
+ {
+@@ -73,6 +81,8 @@ void nf_conntrack_expect_fini(void);
+ struct nf_conntrack_expect *
+ __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple);
+ 
++void nf_ct_expect_insert(struct nf_conntrack_expect *exp);
++
+ struct nf_conntrack_expect *
+ nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple);
+ 
+diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
+index 0378676..6b1f720 100644
+--- a/include/net/netfilter/nf_conntrack_l3proto.h
++++ b/include/net/netfilter/nf_conntrack_l3proto.h
+@@ -42,6 +42,9 @@ struct nf_conntrack_l3proto
+ 	int (*print_tuple)(struct seq_file *s,
+ 			   const struct nf_conntrack_tuple *);
+ 
++	/* Called when a conntrack entry is destroyed */
++	void (*destroy)(struct nf_conn *conntrack);
++
+ 	/*
+ 	 * Called before tracking. 
+ 	 *	*dataoff: offset of protocol header (TCP, UDP,...) in skb
+@@ -67,6 +70,33 @@ struct nf_conntrack_l3proto
+ 	struct module *me;
+ };
+ 
++/* virtualization of l3 protocol's sysctl tables: */
++#if defined(CONFIG_VE_IPTABLES)
++#include <linux/sched.h>
++#define ve_nf_ct3			(get_exec_env()->_nf_conntrack)
++#endif
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++#define ve_nf_ct_l3protos		ve_nf_ct3->_nf_ct_l3protos
++#define ve_nf_conntrack_l3proto_ipv4	(ve_nf_ct3->_nf_conntrack_l3proto_ipv4)
++#define	ve_nf_conntrack_l3proto_ipv6	(ve_nf_ct3->_nf_conntrack_l3proto_ipv6)
++#define ve_nf_conntrack_max		(ve_nf_ct3->_nf_conntrack_max)
++#define ve_nf_conntrack_count		(ve_nf_ct3->_nf_conntrack_count)
++#define ve_nf_conntrack_checksum	(ve_nf_ct3->_nf_conntrack_checksum)
++#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */
++#define ve_nf_ct_l3protos		nf_ct_l3protos
++#define ve_nf_conntrack_l3proto_ipv4	&nf_conntrack_l3proto_ipv4
++#define ve_nf_conntrack_l3proto_ipv6	&nf_conntrack_l3proto_ipv6
++#define ve_nf_conntrack_max		nf_conntrack_max
++#define ve_nf_conntrack_count		nf_conntrack_count
++#define ve_nf_conntrack_checksum	nf_conntrack_checksum
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
++
++extern int init_nf_ct_l3proto_ipv4(void);
++extern void fini_nf_ct_l3proto_ipv4(void);
++extern int init_nf_ct_l3proto_ipv6(void);
++extern void fini_nf_ct_l3proto_ipv6(void);
++
+ extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX];
+ 
+ /* Protocol registration. */
+@@ -83,7 +113,11 @@ __nf_ct_l3proto_find(u_int16_t l3proto)
+ {
+ 	if (unlikely(l3proto >= AF_MAX))
+ 		return &nf_conntrack_l3proto_generic;
+-	return rcu_dereference(nf_ct_l3protos[l3proto]);
++#ifdef CONFIG_VE_IPTABLES
++	if (!get_exec_env()->_nf_conntrack)
++		return &nf_conntrack_l3proto_generic;
++#endif
++	return rcu_dereference(ve_nf_ct_l3protos[l3proto]);
+ }
+ 
+ #endif /*_NF_CONNTRACK_L3PROTO_H*/
+diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
+index 723df9d..43ecaf7 100644
+--- a/include/net/netfilter/nf_conntrack_l4proto.h
++++ b/include/net/netfilter/nf_conntrack_l4proto.h
+@@ -97,6 +97,7 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
+ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
+ 
+ #define MAX_NF_CT_PROTO 256
++extern struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX];
+ 
+ extern struct nf_conntrack_l4proto *
+ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto);
+@@ -117,16 +118,142 @@ extern int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
+ 				      struct nf_conntrack_tuple *t);
+ extern const struct nla_policy nf_ct_port_nla_policy[];
+ 
++#ifdef CONFIG_SYSCTL
+ /* Log invalid packets */
+ extern unsigned int nf_ct_log_invalid;
++#endif
++
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_nf_ct4			(get_exec_env()->_nf_conntrack)
++#endif
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++
++#define ve_nf_ct_protos			(ve_nf_ct4->_nf_ct_protos)
++#define ve_nf_conntrack_l4proto_icmp	(ve_nf_ct4->_nf_conntrack_l4proto_icmp)
++#define ve_nf_conntrack_l4proto_icmpv6	\
++				(ve_nf_ct4->_nf_conntrack_l4proto_icmpv6)
++#define ve_nf_conntrack_l4proto_tcp4	(ve_nf_ct4->_nf_conntrack_l4proto_tcp4)
++#define ve_nf_conntrack_l4proto_tcp6	(ve_nf_ct4->_nf_conntrack_l4proto_tcp6)
++#define ve_nf_conntrack_l4proto_udp4	(ve_nf_ct4->_nf_conntrack_l4proto_udp4)
++#define ve_nf_conntrack_l4proto_udp6	(ve_nf_ct4->_nf_conntrack_l4proto_udp6)
++#define ve_nf_conntrack_l4proto_generic		\
++				(ve_nf_ct4->_nf_conntrack_l4proto_generic)
++#define ve_nf_ct_log_invalid		(ve_nf_ct4->_nf_ct_log_invalid)
++/* TCP: */
++#define ve_nf_ct_tcp_timeouts		(ve_nf_ct4->_nf_ct_tcp_timeouts)
++#define ve_nf_ct_tcp_timeout_max_retrans	\
++				(ve_nf_ct4->_nf_ct_tcp_timeout_max_retrans)
++#define ve_nf_ct_tcp_max_retrans	(ve_nf_ct4->_nf_ct_tcp_max_retrans)
++#define ve_nf_ct_tcp_loose		(ve_nf_ct4->_nf_ct_tcp_loose)
++#define ve_nf_ct_tcp_be_liberal		(ve_nf_ct4->_nf_ct_tcp_be_liberal)
++#define ve_tcp_sysctl_table_users	(ve_nf_ct4->_tcp_sysctl_table_users)
++#define ve_tcp_sysctl_header		(ve_nf_ct4->_tcp_sysctl_header)
++#define ve_tcp_compat_sysctl_header	(ve_nf_ct4->_tcp_compat_sysctl_header)
++/* UDP: */
++#define ve_nf_ct_udp_timeout		(ve_nf_ct4->_nf_ct_udp_timeout)
++#define ve_nf_ct_udp_timeout_stream	(ve_nf_ct4->_nf_ct_udp_timeout_stream)
++#define ve_udp_sysctl_table_users	(ve_nf_ct4->_udp_sysctl_table_users)
++#define ve_udp_sysctl_header		(ve_nf_ct4->_udp_sysctl_header)
++#define ve_udp_compat_sysctl_header	(ve_nf_ct4->_udp_compat_sysctl_header)
++/* ICMP: */
++#define ve_nf_ct_icmp_timeout		(ve_nf_ct4->_nf_ct_icmp_timeout)
++#define ve_icmp_sysctl_header		(ve_nf_ct4->_icmp_sysctl_header)
++#define ve_icmp_compat_sysctl_header	(ve_nf_ct4->_icmp_compat_sysctl_header)
++/* ICMPV6: */
++#define ve_nf_ct_icmpv6_timeout		(ve_nf_ct4->_nf_ct_icmpv6_timeout)
++#define ve_icmpv6_sysctl_header		(ve_nf_ct4->_icmpv6_sysctl_header)
++/* GENERIC: */
++#define ve_nf_ct_generic_timeout	(ve_nf_ct4->_nf_ct_generic_timeout)
++#define ve_generic_sysctl_header	(ve_nf_ct4->_generic_sysctl_header)
++#define ve_generic_compat_sysctl_header	(ve_nf_ct4->_generic_compat_sysctl_header)
++
++extern void nf_ct_proto_icmp_sysctl_cleanup(void);
++extern int nf_ct_proto_icmp_sysctl_init(void);
++extern void nf_ct_proto_icmpv6_sysctl_cleanup(void);
++extern int nf_ct_proto_icmpv6_sysctl_init(void);
++extern void nf_ct_proto_tcp_sysctl_cleanup(void);
++extern int nf_ct_proto_tcp_sysctl_init(void);
++extern void nf_ct_proto_udp_sysctl_cleanup(void);
++extern int nf_ct_proto_udp_sysctl_init(void);
++
++#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */
++
++#define ve_nf_ct_protos			nf_ct_protos
++#define ve_nf_conntrack_l4proto_icmp	&nf_conntrack_l4proto_icmp
++#define ve_nf_conntrack_l4proto_icmpv6	&nf_conntrack_l4proto_icmpv6
++#define ve_nf_conntrack_l4proto_tcp4	&nf_conntrack_l4proto_tcp4
++#define ve_nf_conntrack_l4proto_tcp6	&nf_conntrack_l4proto_tcp6
++#define ve_nf_conntrack_l4proto_udp4	&nf_conntrack_l4proto_udp4
++#define ve_nf_conntrack_l4proto_udp6	&nf_conntrack_l4proto_udp6
++#define ve_nf_conntrack_l4proto_generic	&nf_conntrack_l4proto_generic
++
++#if defined(CONFIG_SYSCTL)
++
++#define ve_nf_ct_log_invalid		nf_ct_log_invalid
++/* TCP: */
++#define ve_nf_ct_tcp_timeouts		*tcp_timeouts
++#define ve_nf_ct_tcp_timeout_max_retrans	\
++					nf_ct_tcp_timeout_max_retrans
++#define ve_nf_ct_tcp_max_retrans	nf_ct_tcp_max_retrans
++#define ve_nf_ct_tcp_loose		nf_ct_tcp_loose
++#define ve_nf_ct_tcp_be_liberal		nf_ct_tcp_be_liberal
++#define ve_tcp_sysctl_table_users	tcp_sysctl_table_users
++#define ve_tcp_sysctl_header		tcp_sysctl_header
++/* UDP:*/
++#define ve_nf_ct_udp_timeout		nf_ct_udp_timeout
++#define ve_nf_ct_udp_timeout_stream	nf_ct_udp_timeout_stream
++#define ve_udp_sysctl_table_users	udp_sysctl_table_users
++#define ve_udp_sysctl_header		udp_sysctl_header
++/* ICMP: */
++#define ve_nf_ct_icmp_timeout		nf_ct_icmp_timeout
++#define ve_icmp_sysctl_header		icmp_sysctl_header
++/* ICMPV6: */
++#define ve_nf_ct_icmpv6_timeout		nf_ct_icmpv6_timeout
++#define ve_icmpv6_sysctl_header		icmpv6_sysctl_header
++/* GENERIC: */
++#define ve_nf_ct_generic_timeout	nf_ct_generic_timeout
++#define ve_generic_sysctl_header	generic_sysctl_header
++#endif /* CONFIG_SYSCTL */
++
++static inline int nf_ct_proto_icmp_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_icmp_sysctl_cleanup(void)
++{
++}
++static inline int nf_ct_proto_tcp_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_tcp_sysctl_cleanup(void)
++{
++}
++static inline int nf_ct_proto_udp_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_udp_sysctl_cleanup(void)
++{
++}
++static inline int nf_ct_proto_icmpv6_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_icmpv6_sysctl_cleanup(void)
++{
++}
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
+ 
+ #ifdef CONFIG_SYSCTL
+ #ifdef DEBUG_INVALID_PACKETS
+ #define LOG_INVALID(proto) \
+-	(nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW)
++	(ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW)
+ #else
+ #define LOG_INVALID(proto) \
+-	((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \
++	((ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) \
+ 	 && net_ratelimit())
+ #endif
+ #else
+diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
+index 9dc1039..bfa9069 100644
+--- a/include/net/netfilter/nf_nat.h
++++ b/include/net/netfilter/nf_nat.h
+@@ -77,6 +77,8 @@ struct nf_conn_nat
+ #endif
+ };
+ 
++void nf_nat_hash_conntrack(struct nf_conn *ct);
++
+ /* Set up the info structure to map into this range. */
+ extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
+ 				      const struct nf_nat_range *range,
+@@ -85,6 +87,7 @@ extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
+ /* Is this tuple already taken? (not by us)*/
+ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+ 			     const struct nf_conn *ignored_conntrack);
++extern void ip_nat_hash_conntrack(struct nf_conn *ct);
+ 
+ static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
+ {
+diff --git a/include/net/netfilter/nf_nat_rule.h b/include/net/netfilter/nf_nat_rule.h
+index e4a18ae..8bb00da 100644
+--- a/include/net/netfilter/nf_nat_rule.h
++++ b/include/net/netfilter/nf_nat_rule.h
+@@ -4,7 +4,7 @@
+ #include <net/netfilter/nf_nat.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+ 
+-extern int nf_nat_rule_init(void) __init;
++extern int nf_nat_rule_init(void);
+ extern void nf_nat_rule_cleanup(void);
+ extern int nf_nat_rule_find(struct sk_buff *skb,
+ 			    unsigned int hooknum,
+diff --git a/include/net/netlink_sock.h b/include/net/netlink_sock.h
+new file mode 100644
+index 0000000..ce4701a
+--- /dev/null
++++ b/include/net/netlink_sock.h
+@@ -0,0 +1,23 @@
++#ifndef __NET_NETLINK_SOCK_H
++#define __NET_NETLINK_SOCK_H
++
++struct netlink_sock {
++	/* struct sock has to be the first member of netlink_sock */
++	struct sock		sk;
++	u32			pid;
++	u32			dst_pid;
++	u32			dst_group;
++	u32			flags;
++	u32			subscriptions;
++	u32			ngroups;
++	unsigned long		*groups;
++	unsigned long		state;
++	wait_queue_head_t	wait;
++	struct netlink_callback	*cb;
++	struct mutex		*cb_mutex;
++	struct mutex		cb_def_mutex;
++	void			(*netlink_rcv)(struct sk_buff *skb);
++	struct module		*module;
++};
++
++#endif /* __NET_NETLINK_SOCK_H */
+diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
+index 34ee348..d8588d5 100644
+--- a/include/net/netns/ipv4.h
++++ b/include/net/netns/ipv4.h
+@@ -18,6 +18,7 @@ struct netns_ipv4 {
+ 	struct ctl_table_header	*forw_hdr;
+ 	struct ctl_table_header	*frags_hdr;
+ 	struct ctl_table_header	*ipv4_hdr;
++	struct ctl_table_header *route_hdr;
+ #endif
+ 	struct ipv4_devconf	*devconf_all;
+ 	struct ipv4_devconf	*devconf_dflt;
+@@ -44,5 +45,8 @@ struct netns_ipv4 {
+ 	int sysctl_icmp_ratelimit;
+ 	int sysctl_icmp_ratemask;
+ 	int sysctl_icmp_errors_use_inbound_ifaddr;
++
++	struct timer_list rt_secret_timer;
++	atomic_t rt_genid;
+ };
+ #endif
+diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
+index ac053be..c368713 100644
+--- a/include/net/netns/ipv6.h
++++ b/include/net/netns/ipv6.h
+@@ -13,6 +13,7 @@ struct netns_sysctl_ipv6 {
+ #ifdef CONFIG_SYSCTL
+ 	struct ctl_table_header *table;
+ 	struct ctl_table_header *frags_hdr;
++	struct ctl_table_header *nf_frags_hdr;
+ #endif
+ 	int bindv6only;
+ 	int flush_delay;
+@@ -31,6 +32,7 @@ struct netns_ipv6 {
+ 	struct ipv6_devconf	*devconf_all;
+ 	struct ipv6_devconf	*devconf_dflt;
+ 	struct netns_frags	frags;
++	struct netns_frags	ct_frags;
+ #ifdef CONFIG_NETFILTER
+ 	struct xt_table		*ip6table_filter;
+ 	struct xt_table		*ip6table_mangle;
+@@ -54,5 +56,7 @@ struct netns_ipv6 {
+ 	struct sock             *ndisc_sk;
+ 	struct sock             *tcp_sk;
+ 	struct sock             *igmp_sk;
++
++	struct proc_dir_entry	*proc_dev_snmp;
+ };
+ #endif
+diff --git a/include/net/route.h b/include/net/route.h
+index fc836ff..2ed2c29 100644
+--- a/include/net/route.h
++++ b/include/net/route.h
+@@ -111,7 +111,7 @@ struct in_device;
+ extern int		ip_rt_init(void);
+ extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
+ 				       __be32 src, struct net_device *dev);
+-extern void		rt_cache_flush(int how);
++extern void		rt_cache_flush(struct net *net, int how);
+ extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
+ extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
+ extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
+@@ -138,6 +138,7 @@ static inline void ip_rt_put(struct rtable * rt)
+ #define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)
+ 
+ extern const __u8 ip_tos2prio[16];
++extern int ip_rt_src_check;
+ 
+ static inline char rt_tos2priority(u8 tos)
+ {
+diff --git a/include/net/sock.h b/include/net/sock.h
+index dc42b44..873caf6 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -57,6 +57,8 @@
+ #include <net/dst.h>
+ #include <net/checksum.h>
+ 
++#include <bc/net.h>
++
+ /*
+  * This structure really needs to be cleaned up.
+  * Most of it is for TCP, and not used by any of
+@@ -279,6 +281,8 @@ struct sock {
+   	int			(*sk_backlog_rcv)(struct sock *sk,
+ 						  struct sk_buff *skb);  
+ 	void                    (*sk_destruct)(struct sock *sk);
++	struct sock_beancounter sk_bc;
++	struct ve_struct	*owner_env;
+ };
+ 
+ /*
+@@ -495,6 +499,8 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
+ 	})
+ 
+ extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
++extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
++				unsigned long amount);
+ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
+ extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
+ extern int sk_stream_error(struct sock *sk, int flags, int err);
+@@ -729,7 +735,8 @@ static inline int sk_has_account(struct sock *sk)
+ 	return !!sk->sk_prot->memory_allocated;
+ }
+ 
+-static inline int sk_wmem_schedule(struct sock *sk, int size)
++static inline int sk_wmem_schedule(struct sock *sk, int size,
++		struct sk_buff *skb)
+ {
+ 	if (!sk_has_account(sk))
+ 		return 1;
+@@ -737,12 +744,15 @@ static inline int sk_wmem_schedule(struct sock *sk, int size)
+ 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
+ }
+ 
+-static inline int sk_rmem_schedule(struct sock *sk, int size)
++static inline int sk_rmem_schedule(struct sock *sk,  struct sk_buff *skb)
+ {
+ 	if (!sk_has_account(sk))
+ 		return 1;
+-	return size <= sk->sk_forward_alloc ||
+-		__sk_mem_schedule(sk, size, SK_MEM_RECV);
++	if (!(skb->truesize <= sk->sk_forward_alloc ||
++	      __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV)))
++		return 0;
++
++	return !ub_sockrcvbuf_charge(sk, skb);
+ }
+ 
+ static inline void sk_mem_reclaim(struct sock *sk)
+@@ -862,6 +872,11 @@ extern struct sk_buff 		*sock_alloc_send_skb(struct sock *sk,
+ 						     unsigned long size,
+ 						     int noblock,
+ 						     int *errcode);
++extern struct sk_buff 		*sock_alloc_send_skb2(struct sock *sk,
++						     unsigned long size,
++						     unsigned long size2,
++						     int noblock,
++						     int *errcode);
+ extern void *sock_kmalloc(struct sock *sk, int size,
+ 			  gfp_t priority);
+ extern void sock_kfree_s(struct sock *sk, void *mem, int size);
+@@ -1119,6 +1134,7 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from,
+ 
+ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+ {
++	WARN_ON(skb->destructor);
+ 	sock_hold(sk);
+ 	skb->sk = sk;
+ 	skb->destructor = sock_wfree;
+@@ -1127,6 +1143,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+ 
+ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+ {
++	WARN_ON(skb->destructor);
+ 	skb->sk = sk;
+ 	skb->destructor = sock_rfree;
+ 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
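
The sk_rmem_schedule() change above moves receive-side accounting from a raw
byte count to the skb itself, so the beancounter charge (ub_sockrcvbuf_charge)
sees socket and buffer together. A minimal sketch of how a caller adapts,
assuming a hypothetical protocol receive path:

	/* sketch only: the skb, not its size, is passed so that both the
	 * forward-alloc quota and the UB sockrcvbuf charge apply */
	static int proto_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
	{
		/* was: sk_rmem_schedule(sk, skb->truesize) */
		if (!sk_rmem_schedule(sk, skb))
			return -ENOBUFS;

		skb_set_owner_r(skb, sk);
		skb_queue_tail(&sk->sk_receive_queue, skb);
		return 0;
	}
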
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index cf54034..4fa0bca 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -43,6 +43,13 @@
+ #include <net/inet_ecn.h>
+ 
+ #include <linux/seq_file.h>
++#include <bc/net.h>
++
++#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
++#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
++
++#define TW_WSCALE_MASK		0x0f
++#define TW_WSCALE_SPEC		0x10
+ 
+ extern struct inet_hashinfo tcp_hashinfo;
+ 
+@@ -219,7 +226,9 @@ extern int sysctl_tcp_mem[3];
+ extern int sysctl_tcp_wmem[3];
+ extern int sysctl_tcp_rmem[3];
+ extern int sysctl_tcp_app_win;
++#ifndef sysctl_tcp_adv_win_scale
+ extern int sysctl_tcp_adv_win_scale;
++#endif
+ extern int sysctl_tcp_tw_reuse;
+ extern int sysctl_tcp_frto;
+ extern int sysctl_tcp_frto_response;
+@@ -234,6 +243,10 @@ extern int sysctl_tcp_base_mss;
+ extern int sysctl_tcp_workaround_signed_windows;
+ extern int sysctl_tcp_slow_start_after_idle;
+ extern int sysctl_tcp_max_ssthresh;
++extern int sysctl_tcp_use_sg;
++extern int sysctl_tcp_max_tw_kmem_fraction;
++extern int sysctl_tcp_max_tw_buckets_ub;
++
+ 
+ extern atomic_t tcp_memory_allocated;
+ extern atomic_t tcp_sockets_allocated;
+@@ -266,12 +279,17 @@ static inline int tcp_too_many_orphans(struct sock *sk, int num)
+ extern struct proto tcp_prot;
+ 
+ DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics);
+-#define TCP_INC_STATS(field)		SNMP_INC_STATS(tcp_statistics, field)
+-#define TCP_INC_STATS_BH(field)		SNMP_INC_STATS_BH(tcp_statistics, field)
+-#define TCP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(tcp_statistics, field)
+-#define TCP_DEC_STATS(field)		SNMP_DEC_STATS(tcp_statistics, field)
+-#define TCP_ADD_STATS_BH(field, val)	SNMP_ADD_STATS_BH(tcp_statistics, field, val)
+-#define TCP_ADD_STATS_USER(field, val)	SNMP_ADD_STATS_USER(tcp_statistics, field, val)
++#if defined(CONFIG_VE) && defined(CONFIG_INET)
++#define ve_tcp_statistics (get_exec_env()->_tcp_statistics)
++#else
++#define ve_tcp_statistics tcp_statistics
++#endif
++#define TCP_INC_STATS(field)		SNMP_INC_STATS(ve_tcp_statistics, field)
++#define TCP_INC_STATS_BH(field)		SNMP_INC_STATS_BH(ve_tcp_statistics, field)
++#define TCP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(ve_tcp_statistics, field)
++#define TCP_DEC_STATS(field)		SNMP_DEC_STATS(ve_tcp_statistics, field)
++#define TCP_ADD_STATS_BH(field, val)	SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val)
++#define TCP_ADD_STATS_USER(field, val)	SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val)
+ 
+ extern void			tcp_v4_err(struct sk_buff *skb, u32);
+ 
+@@ -545,7 +563,11 @@ extern u32	__tcp_select_window(struct sock *sk);
+  * to use only the low 32-bits of jiffies and hide the ugly
+  * casts with the following macro.
+  */
++#ifdef CONFIG_VE
++#define tcp_time_stamp		((__u32)(jiffies + get_exec_env()->jiffies_fixup))
++#else
+ #define tcp_time_stamp		((__u32)(jiffies))
++#endif
+ 
+ /* This is what the send packet queuing engine uses to pass
+  * TCP per-packet control information to the transmission
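
The statistics changes above are purely name-level: with CONFIG_VE each
TCP_*_STATS macro resolves the MIB pointer through get_exec_env(), so existing
call sites compile unchanged while updating the calling container's counters.
For illustration (the expansion is a paraphrase, not new API):

	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
	/* with CONFIG_VE && CONFIG_INET this expands to roughly:
	 *   SNMP_INC_STATS_BH(get_exec_env()->_tcp_statistics,
	 *                     TCP_MIB_INSEGS);
	 * without VE it keeps using the global tcp_statistics */
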
+diff --git a/include/net/udp.h b/include/net/udp.h
+index ccce837..62d3396 100644
+--- a/include/net/udp.h
++++ b/include/net/udp.h
+@@ -148,6 +148,11 @@ extern int 	udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ 				   char __user *optval, int optlen,
+ 				   int (*push_pending_frames)(struct sock *));
+ 
++static inline int udp_hashfn(u16 num, unsigned veid)
++{
++	return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1));
++}
++
+ DECLARE_SNMP_STAT(struct udp_mib, udp_statistics);
+ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6);
+ 
+@@ -158,19 +163,31 @@ DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
+ /*
+  * 	SNMP statistics for UDP and UDP-Lite
+  */
++#ifdef CONFIG_VE
++#define ve_udp_statistics (get_exec_env()->_udp_statistics)
++#define ve_udplite_statistics (get_exec_env()->_udplite_statistics)
++#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6)
++#define ve_udplite_stats_in6 (get_exec_env()->_udplite_stats_in6)
++#else
++#define ve_udp_statistics udp_statistics
++#define ve_udplite_statistics udplite_statistics
++#define ve_udp_stats_in6 udp_stats_in6
++#define ve_udplite_stats_in6 udplite_stats_in6
++#endif
++
+ #define UDP_INC_STATS_USER(field, is_udplite)			       do {   \
+-	if (is_udplite) SNMP_INC_STATS_USER(udplite_statistics, field);       \
+-	else		SNMP_INC_STATS_USER(udp_statistics, field);  }  while(0)
++	if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_statistics, field);    \
++	else		SNMP_INC_STATS_USER(ve_udp_statistics, field);  }  while(0)
+ #define UDP_INC_STATS_BH(field, is_udplite) 			       do  {  \
+-	if (is_udplite) SNMP_INC_STATS_BH(udplite_statistics, field);         \
+-	else		SNMP_INC_STATS_BH(udp_statistics, field);    }  while(0)
++	if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_statistics, field);      \
++	else		SNMP_INC_STATS_BH(ve_udp_statistics, field);    }  while(0)
+ 
+ #define UDP6_INC_STATS_BH(field, is_udplite) 			      do  {  \
+-	if (is_udplite) SNMP_INC_STATS_BH(udplite_stats_in6, field);         \
+-	else		SNMP_INC_STATS_BH(udp_stats_in6, field);    } while(0)
++	if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_stats_in6, field);         \
++	else		SNMP_INC_STATS_BH(ve_udp_stats_in6, field);    } while(0)
+ #define UDP6_INC_STATS_USER(field, is_udplite)			       do {    \
+-	if (is_udplite) SNMP_INC_STATS_USER(udplite_stats_in6, field);         \
+-	else		SNMP_INC_STATS_USER(udp_stats_in6, field);    } while(0)
++	if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_stats_in6, field);         \
++	else		SNMP_INC_STATS_USER(ve_udp_stats_in6, field);    } while(0)
+ 
+ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ #define UDPX_INC_STATS_BH(sk, field) \
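
udp_hashfn() above folds the VE id into the port hash so that equal port
numbers in different containers usually land in different chains. A worked
example, assuming this kernel's UDP_HTABLE_SIZE of 128:

	udp_hashfn(53, 0);	/* (53 + (0   ^ 0)) & 127 == 53 */
	udp_hashfn(53, 101);	/* (53 + (101 ^ 0)) & 127 == 26 */
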
+diff --git a/init/Kconfig b/init/Kconfig
+index 6199d11..d0807fe 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -208,7 +208,7 @@ config TASK_XACCT
+ 
+ config TASK_IO_ACCOUNTING
+ 	bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
+-	depends on TASK_XACCT
++	depends on TASK_XACCT && BEANCOUNTERS
+ 	help
+ 	  Collect information on the number of bytes of storage I/O which this
+ 	  task has caused.
+@@ -292,7 +292,7 @@ config CGROUP_DEBUG
+ 
+ config CGROUP_NS
+         bool "Namespace cgroup subsystem"
+-        depends on CGROUPS
++        depends on CGROUPS && !VE
+         help
+           Provides a simple namespace cgroup subsystem to
+           provide hierarchical naming of sets of namespaces,
+@@ -308,7 +308,7 @@ config CGROUP_DEVICE
+ 
+ config CPUSETS
+ 	bool "Cpuset support"
+-	depends on SMP && CGROUPS
++	depends on SMP && CGROUPS && !VE
+ 	help
+ 	  This option will let you create and manage CPUSETs which
+ 	  allow dynamically partitioning a system into sets of CPUs and
+@@ -352,17 +352,18 @@ config RT_GROUP_SCHED
+ choice
+ 	depends on GROUP_SCHED
+ 	prompt "Basis for grouping tasks"
+-	default USER_SCHED
++	default VZ_FAIRSCHED
+ 
+ config USER_SCHED
+ 	bool "user id"
++	depends on !VE
+ 	help
+ 	  This option will choose userid as the basis for grouping
+ 	  tasks, thus providing equal CPU bandwidth to each user.
+ 
+ config CGROUP_SCHED
+ 	bool "Control groups"
+- 	depends on CGROUPS
++ 	depends on CGROUPS && !VE
+  	help
+ 	  This option allows you to create arbitrary task groups
+ 	  using the "cgroup" pseudo filesystem and control
+@@ -370,6 +371,12 @@ config CGROUP_SCHED
+ 	  Refer to Documentation/cgroups.txt for more information
+ 	  on "cgroup" pseudo filesystem.
+ 
++config VZ_FAIRSCHED
++	bool "OpenVZ groups"
++	help
++	  This option adds customizable task groups with an OpenVZ-compatible
++	  syscall and procfs interface.
++
+ endchoice
+ 
+ config CGROUP_CPUACCT
+diff --git a/init/calibrate.c b/init/calibrate.c
+index ecb3822..474a8ca 100644
+--- a/init/calibrate.c
++++ b/init/calibrate.c
+@@ -8,6 +8,7 @@
+ #include <linux/delay.h>
+ #include <linux/init.h>
+ #include <linux/timex.h>
++#include <linux/module.h>
+ 
+ unsigned long preset_lpj;
+ static int __init lpj_setup(char *str)
+@@ -104,6 +105,60 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
+ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
+ #endif
+ 
++unsigned long cycles_per_jiffy, cycles_per_clock;
++
++static __devinit void calibrate_cycles(void)
++{
++	unsigned long ticks;
++	cycles_t time;
++
++	ticks = jiffies;
++	while (ticks == jiffies)
++		/* nothing */;
++	time = get_cycles();
++	ticks = jiffies;
++	while (ticks == jiffies)
++		/* nothing */;
++
++	time = get_cycles() - time;
++	cycles_per_jiffy = time;
++	if ((time >> 32) != 0) {
++		printk(KERN_WARNING "CPU too fast! timings are incorrect\n");
++		cycles_per_jiffy = -1;
++	}
++}
++
++EXPORT_SYMBOL(cycles_per_jiffy);
++EXPORT_SYMBOL(cycles_per_clock);
++
++static __devinit void calc_cycles_per_jiffy(void)
++{
++#if 0
++	extern unsigned long fast_gettimeoffset_quotient;
++	unsigned long low, high;
++
++	if (fast_gettimeoffset_quotient != 0) {
++		__asm__("divl %2"
++				:"=a" (low), "=d" (high)
++				:"r" (fast_gettimeoffset_quotient),
++				"0" (0), "1" (1000000/HZ));
++
++		cycles_per_jiffy = low;
++	}
++#endif
++	if (cycles_per_jiffy == 0)
++		calibrate_cycles();
++
++	if (cycles_per_jiffy == 0) {
++		printk(KERN_WARNING "Cycles are stuck! "
++				"Some statistics will not be available.\n");
++		/* to prevent division by zero in cycles_to_(clocks|jiffies) */
++		cycles_per_jiffy = 1;
++		cycles_per_clock = 1;
++	} else
++		cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC);
++}
++
+ /*
+  * This is the number of bits of precision for the loops_per_jiffy.  Each
+  * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
+@@ -169,4 +224,5 @@ void __cpuinit calibrate_delay(void)
+ 			loops_per_jiffy);
+ 	}
+ 
++	calc_cycles_per_jiffy();
+ }
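
calibrate_cycles() above brackets one full jiffy with get_cycles() reads,
spinning on jiffies so the measurement starts and ends on tick boundaries.
For orientation, with assumed figures:

	/* a 2 GHz TSC and HZ == 1000 (assumed) give
	 * cycles_per_jiffy ~= 2,000,000,000 / 1000 == 2,000,000;
	 * with the kernel-side CLOCKS_PER_SEC of 100 (USER_HZ) that
	 * makes cycles_per_clock == 2,000,000 * (1000 / 100)
	 * == 20,000,000 */
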
+diff --git a/init/main.c b/init/main.c
+index f7fb200..25c009c 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -60,6 +60,9 @@
+ #include <linux/sched.h>
+ #include <linux/signal.h>
+ #include <linux/idr.h>
++#include <linux/fairsched.h>
++
++#include <bc/beancounter.h>
+ 
+ #include <asm/io.h>
+ #include <asm/bugs.h>
+@@ -107,6 +110,16 @@ extern void tc_init(void);
+ enum system_states system_state;
+ EXPORT_SYMBOL(system_state);
+ 
++#ifdef CONFIG_VE
++extern void init_ve_system(void);
++extern void init_ve0(void);
++extern void prepare_ve0_process(struct task_struct *tsk);
++#else
++#define init_ve_system()		do { } while (0)
++#define init_ve0()			do { } while (0)
++#define prepare_ve0_process(tsk)	do { } while (0)
++#endif
++
+ /*
+  * Boot command-line arguments
+  */
+@@ -538,6 +551,9 @@ asmlinkage void __init start_kernel(void)
+ 
+ 	smp_setup_processor_id();
+ 
++	prepare_ve0_process(&init_task);
++	init_ve0();
++
+ 	/*
+ 	 * Need to run as early as possible, to initialize the
+ 	 * lockdep hash:
+@@ -556,6 +572,7 @@ asmlinkage void __init start_kernel(void)
+  * enable them
+  */
+ 	lock_kernel();
++	ub_init_early();
+ 	tick_init();
+ 	boot_cpu_init();
+ 	page_address_init();
+@@ -659,6 +676,7 @@ asmlinkage void __init start_kernel(void)
+ 	thread_info_cache_init();
+ 	fork_init(num_physpages);
+ 	proc_caches_init();
++	ub_init_late();
+ 	buffer_init();
+ 	unnamed_dev_init();
+ 	key_init();
+@@ -680,6 +698,10 @@ asmlinkage void __init start_kernel(void)
+ 
+ 	acpi_early_init(); /* before LAPIC and SMP init */
+ 
++#ifdef CONFIG_BC_RSS_ACCOUNTING
++	ub_init_pbc();
++#endif
++
+ 	/* Do the rest non-__init'ed, we're now alive */
+ 	rest_init();
+ }
+@@ -758,6 +780,8 @@ static void __init do_initcalls(void)
+  */
+ static void __init do_basic_setup(void)
+ {
++	init_ve_system();
++
+ 	/* drivers will send hotplug events */
+ 	init_workqueues();
+ 	usermodehelper_init();
+@@ -859,6 +883,7 @@ static int __init kernel_init(void * unused)
+ 	do_pre_smp_initcalls();
+ 
+ 	smp_init();
++	fairsched_init_late();
+ 	sched_init_smp();
+ 
+ 	cpuset_init_smp();
+diff --git a/init/version.c b/init/version.c
+index 9d17d70..ce53b6b 100644
+--- a/init/version.c
++++ b/init/version.c
+@@ -33,6 +33,12 @@ struct uts_namespace init_uts_ns = {
+ };
+ EXPORT_SYMBOL_GPL(init_uts_ns);
+ 
++struct new_utsname virt_utsname = {
++	/* we need only this field */
++	.release        = UTS_RELEASE,
++};
++EXPORT_SYMBOL(virt_utsname);
++
+ /* FIXED STRINGS! Don't touch! */
+ const char linux_banner[] =
+ 	"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
+index d349746..7e5dde4 100644
+--- a/ipc/ipc_sysctl.c
++++ b/ipc/ipc_sysctl.c
+@@ -225,19 +225,14 @@ static struct ctl_table ipc_kern_table[] = {
+ 	{}
+ };
+ 
+-static struct ctl_table ipc_root_table[] = {
+-	{
+-		.ctl_name	= CTL_KERN,
+-		.procname	= "kernel",
+-		.mode		= 0555,
+-		.child		= ipc_kern_table,
+-	},
++static struct ctl_path ipc_path[] = {
++	{ .ctl_name = CTL_KERN, .procname = "kernel", },
+ 	{}
+ };
+ 
+ static int __init ipc_sysctl_init(void)
+ {
+-	register_sysctl_table(ipc_root_table);
++	register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1);
+ 	return 0;
+ }
+ 
+diff --git a/ipc/msg.c b/ipc/msg.c
+index b4eee1c..4fb6c0f 100644
+--- a/ipc/msg.c
++++ b/ipc/msg.c
+@@ -183,6 +183,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
+ 	int id, retval;
+ 	key_t key = params->key;
+ 	int msgflg = params->flg;
++	int msqid = params->id;
+ 
+ 	msq = ipc_rcu_alloc(sizeof(*msq));
+ 	if (!msq)
+@@ -201,7 +202,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
+ 	/*
+ 	 * ipc_addid() locks msq
+ 	 */
+-	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
++	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid);
+ 	if (id < 0) {
+ 		security_msg_queue_free(msq);
+ 		ipc_rcu_putref(msq);
+@@ -323,6 +324,7 @@ asmlinkage long sys_msgget(key_t key, int msgflg)
+ 
+ 	msg_params.key = key;
+ 	msg_params.flg = msgflg;
++	msg_params.id = -1;
+ 
+ 	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+ }
+@@ -942,3 +944,55 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
+ 			msq->q_ctime);
+ }
+ #endif
++
++#ifdef CONFIG_VE
++#include <linux/module.h>
++
++int sysvipc_setup_msg(key_t key, int msqid, int msgflg)
++{
++	struct ipc_namespace *ns;
++	struct ipc_ops msg_ops;
++	struct ipc_params msg_params;
++
++	ns = current->nsproxy->ipc_ns;
++
++	msg_ops.getnew = newque;
++	msg_ops.associate = msg_security;
++	msg_ops.more_checks = NULL;
++
++	msg_params.key = key;
++	msg_params.flg = msgflg | IPC_CREAT;
++	msg_params.id = msqid;
++
++	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
++}
++EXPORT_SYMBOL_GPL(sysvipc_setup_msg);
++
++int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg)
++{
++	int err = 0;
++	struct msg_queue * msq;
++	struct ipc_namespace *ns;
++	int next_id;
++	int total, in_use;
++
++	ns = current->nsproxy->ipc_ns;
++
++	down_write(&msg_ids(ns).rw_mutex);
++	in_use = msg_ids(ns).in_use;
++	for (total = 0, next_id = 0; total < in_use; next_id++) {
++		msq = idr_find(&msg_ids(ns).ipcs_idr, next_id);
++		if (msq == NULL)
++			continue;
++		ipc_lock_by_ptr(&msq->q_perm);
++		err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg);
++		msg_unlock(msq);
++		if (err)
++			break;
++		total++;
++	}
++	up_write(&msg_ids(ns).rw_mutex);
++	return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_msg);
++#endif
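
sysvipc_walk_msg() above visits every queue in the current IPC namespace,
calling back with the queue locked and stopping on a non-zero return. A
hedged sketch of a callback (dump_one_msq is hypothetical; the real users
are in the cpt/rst modules):

	static int dump_one_msq(int id, struct msg_queue *msq, void *arg)
	{
		/* runs under ipc_lock_by_ptr(), so it must not sleep */
		printk(KERN_DEBUG "msqid %d: %lu bytes queued\n",
				id, msq->q_cbytes);
		return 0;	/* non-zero aborts the walk */
	}

	...
	err = sysvipc_walk_msg(dump_one_msq, NULL);
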
+diff --git a/ipc/msgutil.c b/ipc/msgutil.c
+index c82c215..d058294 100644
+--- a/ipc/msgutil.c
++++ b/ipc/msgutil.c
+@@ -8,6 +8,7 @@
+  * See the file COPYING for more details.
+  */
+ 
++#include <linux/module.h>
+ #include <linux/spinlock.h>
+ #include <linux/init.h>
+ #include <linux/security.h>
+@@ -17,6 +18,8 @@
+ 
+ #include "util.h"
+ 
++#include <bc/kmem.h>
++
+ struct msg_msgseg {
+ 	struct msg_msgseg* next;
+ 	/* the next part of the message follows immediately */
+@@ -25,52 +28,53 @@ struct msg_msgseg {
+ #define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
+ #define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
+ 
+-struct msg_msg *load_msg(const void __user *src, int len)
++struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
++					  void * data), int len, void * data)
+ {
+ 	struct msg_msg *msg;
+ 	struct msg_msgseg **pseg;
+ 	int err;
+ 	int alen;
++	int offset = 0;
+ 
+ 	alen = len;
+ 	if (alen > DATALEN_MSG)
+ 		alen = DATALEN_MSG;
+ 
+-	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
++	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC);
+ 	if (msg == NULL)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	msg->next = NULL;
+ 	msg->security = NULL;
+ 
+-	if (copy_from_user(msg + 1, src, alen)) {
++	if (load(msg + 1, alen, offset, data)) {
+ 		err = -EFAULT;
+ 		goto out_err;
+ 	}
+ 
+ 	len -= alen;
+-	src = ((char __user *)src) + alen;
++	offset += alen;
+ 	pseg = &msg->next;
+ 	while (len > 0) {
+ 		struct msg_msgseg *seg;
+ 		alen = len;
+ 		if (alen > DATALEN_SEG)
+ 			alen = DATALEN_SEG;
+-		seg = kmalloc(sizeof(*seg) + alen,
+-						 GFP_KERNEL);
++		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC);
+ 		if (seg == NULL) {
+ 			err = -ENOMEM;
+ 			goto out_err;
+ 		}
+ 		*pseg = seg;
+ 		seg->next = NULL;
+-		if (copy_from_user(seg + 1, src, alen)) {
++		if (load(seg + 1, alen, offset, data)) {
+ 			err = -EFAULT;
+ 			goto out_err;
+ 		}
+ 		pseg = &seg->next;
+ 		len -= alen;
+-		src = ((char __user *)src) + alen;
++		offset += alen;
+ 	}
+ 
+ 	err = security_msg_msg_alloc(msg);
+@@ -83,33 +87,58 @@ out_err:
+ 	free_msg(msg);
+ 	return ERR_PTR(err);
+ }
++EXPORT_SYMBOL_GPL(sysv_msg_load);
+ 
+-int store_msg(void __user *dest, struct msg_msg *msg, int len)
++static int do_load_msg(void * dst, int len, int offset, void * data)
++{
++	return copy_from_user(dst, data + offset, len);
++}
++
++struct msg_msg *load_msg(const void __user *src, int len)
++{
++	return sysv_msg_load(do_load_msg, len, (void*)src);
++}
++
++int sysv_msg_store(struct msg_msg *msg,
++		   int (*store)(void * src, int len, int offset, void * data),
++		   int len, void * data)
+ {
+ 	int alen;
++	int offset = 0;
+ 	struct msg_msgseg *seg;
+-
++	
+ 	alen = len;
+ 	if (alen > DATALEN_MSG)
+ 		alen = DATALEN_MSG;
+-	if (copy_to_user(dest, msg + 1, alen))
++	if (store(msg + 1, alen, offset, data))
+ 		return -1;
+ 
+ 	len -= alen;
+-	dest = ((char __user *)dest) + alen;
++	offset += alen;
+ 	seg = msg->next;
+ 	while (len > 0) {
+ 		alen = len;
+ 		if (alen > DATALEN_SEG)
+ 			alen = DATALEN_SEG;
+-		if (copy_to_user(dest, seg + 1, alen))
++		if (store(seg + 1, alen, offset, data))
+ 			return -1;
+ 		len -= alen;
+-		dest = ((char __user *)dest) + alen;
++		offset += alen;
+ 		seg = seg->next;
+ 	}
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(sysv_msg_store);
++
++static int do_store_msg(void * src, int len, int offset, void * data)
++{
++	return copy_to_user(data + offset, src, len);
++}
++
++int store_msg(void __user *dest, struct msg_msg *msg, int len)
++{
++	return sysv_msg_store(msg, do_store_msg, len, dest);
++}
+ 
+ void free_msg(struct msg_msg *msg)
+ {
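
The rework above turns load_msg()/store_msg() into thin wrappers around
callback-driven copiers, so a message body can be filled from (or drained to)
any source, not only user memory. A sketch of a kernel-buffer loader shaped
like do_load_msg() above (load_from_kbuf and kbuf are hypothetical):

	static int load_from_kbuf(void *dst, int len, int offset, void *data)
	{
		memcpy(dst, (char *)data + offset, len);
		return 0;	/* 0 == success, like copy_from_user() */
	}

	/* rebuild a message from a checkpoint image held in kbuf */
	msg = sysv_msg_load(load_from_kbuf, len, kbuf);
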
+diff --git a/ipc/sem.c b/ipc/sem.c
+index e9418df..2786746 100644
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -87,6 +87,8 @@
+ #include <asm/uaccess.h>
+ #include "util.h"
+ 
++#include <bc/kmem.h>
++
+ #define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
+ 
+ #define sem_unlock(sma)		ipc_unlock(&(sma)->sem_perm)
+@@ -240,6 +242,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
+ 	key_t key = params->key;
+ 	int nsems = params->u.nsems;
+ 	int semflg = params->flg;
++	int semid = params->id;
+ 
+ 	if (!nsems)
+ 		return -EINVAL;
+@@ -263,7 +266,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
+ 		return retval;
+ 	}
+ 
+-	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
++	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid);
+ 	if (id < 0) {
+ 		security_sem_free(sma);
+ 		ipc_rcu_putref(sma);
+@@ -327,6 +330,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg)
+ 	sem_params.key = key;
+ 	sem_params.flg = semflg;
+ 	sem_params.u.nsems = nsems;
++	sem_params.id = -1;
+ 
+ 	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+ }
+@@ -947,7 +951,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
+ 
+ 	undo_list = current->sysvsem.undo_list;
+ 	if (!undo_list) {
+-		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
++		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC);
+ 		if (undo_list == NULL)
+ 			return -ENOMEM;
+ 		spin_lock_init(&undo_list->lock);
+@@ -1004,7 +1008,8 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid)
+ 	nsems = sma->sem_nsems;
+ 	sem_getref_and_unlock(sma);
+ 
+-	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
++	new = kzalloc(sizeof(struct sem_undo) +	sizeof(short)*nsems,
++			GFP_KERNEL_UBC);
+ 	if (!new) {
+ 		sem_putref(sma);
+ 		return ERR_PTR(-ENOMEM);
+@@ -1059,7 +1064,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops,
+ 	if (nsops > ns->sc_semopm)
+ 		return -E2BIG;
+ 	if(nsops > SEMOPM_FAST) {
+-		sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
++		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC);
+ 		if(sops==NULL)
+ 			return -ENOMEM;
+ 	}
+@@ -1341,3 +1346,57 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
+ 			  sma->sem_ctime);
+ }
+ #endif
++
++#ifdef CONFIG_VE
++#include <linux/module.h>
++
++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg)
++{
++	struct ipc_namespace *ns;
++	struct ipc_ops sem_ops;
++	struct ipc_params sem_params;
++
++	ns = current->nsproxy->ipc_ns;
++
++	sem_ops.getnew = newary;
++	sem_ops.associate = sem_security;
++	sem_ops.more_checks = sem_more_checks;
++
++	sem_params.key = key;
++	sem_params.flg = semflg | IPC_CREAT;
++	sem_params.u.nsems = size;
++	sem_params.id = semid;
++
++	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
++}
++EXPORT_SYMBOL_GPL(sysvipc_setup_sem);
++
++int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg)
++{
++	int err = 0;
++	struct sem_array *sma;
++	struct ipc_namespace *ns;
++	int next_id;
++	int total, in_use;
++
++	ns = current->nsproxy->ipc_ns;
++
++	down_write(&sem_ids(ns).rw_mutex);
++	in_use = sem_ids(ns).in_use;
++	for (total = 0, next_id = 0; total < in_use; next_id++) {
++		sma = idr_find(&sem_ids(ns).ipcs_idr, next_id);
++		if (sma == NULL)
++			continue;
++		ipc_lock_by_ptr(&sma->sem_perm);
++		err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg);
++		sem_unlock(sma);
++		if (err)
++			break;
++		total++;
++	}
++	up_write(&sem_ids(ns).rw_mutex);
++	return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_sem);
++EXPORT_SYMBOL_GPL(exit_sem);
++#endif
+diff --git a/ipc/shm.c b/ipc/shm.c
+index 790240c..e9ff453 100644
+--- a/ipc/shm.c
++++ b/ipc/shm.c
+@@ -39,27 +39,17 @@
+ #include <linux/nsproxy.h>
+ #include <linux/mount.h>
+ #include <linux/ipc_namespace.h>
++#include <linux/shmem_fs.h>
+ 
+ #include <asm/uaccess.h>
+ 
+-#include "util.h"
+-
+-struct shm_file_data {
+-	int id;
+-	struct ipc_namespace *ns;
+-	struct file *file;
+-	const struct vm_operations_struct *vm_ops;
+-};
++#include <bc/beancounter.h>
++#include <bc/vmpages.h>
+ 
+-#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
++#include "util.h"
+ 
+-static const struct file_operations shm_file_operations;
+ static struct vm_operations_struct shm_vm_ops;
+ 
+-#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
+-
+-#define shm_unlock(shp)			\
+-	ipc_unlock(&(shp)->shm_perm)
+ 
+ static int newseg(struct ipc_namespace *, struct ipc_params *);
+ static void shm_open(struct vm_area_struct *vma);
+@@ -126,20 +116,6 @@ static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns,
+ 	return container_of(ipcp, struct shmid_kernel, shm_perm);
+ }
+ 
+-/*
+- * shm_lock_(check_) routines are called in the paths where the rw_mutex
+- * is not held.
+- */
+-static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
+-{
+-	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+-
+-	if (IS_ERR(ipcp))
+-		return (struct shmid_kernel *)ipcp;
+-
+-	return container_of(ipcp, struct shmid_kernel, shm_perm);
+-}
+-
+ static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
+ 						int id)
+ {
+@@ -172,6 +148,48 @@ static void shm_open(struct vm_area_struct *vma)
+ 	shm_unlock(shp);
+ }
+ 
++static int shmem_lock(struct shmid_kernel *shp, int lock,
++		struct user_struct *user)
++{
++	struct file *file = shp->shm_file;
++	struct inode *inode = file->f_path.dentry->d_inode;
++	struct shmem_inode_info *info = SHMEM_I(inode);
++	unsigned long size;
++
++	size = shp->shm_segsz + PAGE_SIZE - 1;
++
++#ifdef CONFIG_SHMEM
++	spin_lock(&info->lock);
++	if (lock && !(info->flags & VM_LOCKED)) {
++		if (ub_lockedshm_charge(info, size) < 0)
++			goto out_ch;
++
++		if (!user_shm_lock(inode->i_size, user))
++			goto out_user;
++		info->flags |= VM_LOCKED;
++	}
++	if (!lock && (info->flags & VM_LOCKED) && user) {
++		ub_lockedshm_uncharge(info, size);
++		user_shm_unlock(inode->i_size, user);
++		info->flags &= ~VM_LOCKED;
++	}
++	spin_unlock(&info->lock);
++	return 0;
++
++out_user:
++	ub_lockedshm_uncharge(info, size);
++out_ch:
++	spin_unlock(&info->lock);
++	return -ENOMEM;
++#else
++	if (lock && ub_lockedshm_charge(info, size))
++		return -ENOMEM;
++	if (!lock)
++		ub_lockedshm_uncharge(info, size);
++	return 0;
++#endif
++}
++
+ /*
+  * shm_destroy - free the struct shmid_kernel
+  *
+@@ -187,7 +205,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+ 	shm_rmid(ns, shp);
+ 	shm_unlock(shp);
+ 	if (!is_file_hugepages(shp->shm_file))
+-		shmem_lock(shp->shm_file, 0, shp->mlock_user);
++		shmem_lock(shp, 0, shp->mlock_user);
+ 	else
+ 		user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
+ 						shp->mlock_user);
+@@ -319,12 +337,13 @@ int is_file_shm_hugepages(struct file *file)
+ 	return ret;
+ }
+ 
+-static const struct file_operations shm_file_operations = {
++const struct file_operations shm_file_operations = {
+ 	.mmap		= shm_mmap,
+ 	.fsync		= shm_fsync,
+ 	.release	= shm_release,
+ 	.get_unmapped_area	= shm_get_unmapped_area,
+ };
++EXPORT_SYMBOL_GPL(shm_file_operations);
+ 
+ static struct vm_operations_struct shm_vm_ops = {
+ 	.open	= shm_open,	/* callback for a new vm-area open */
+@@ -349,11 +368,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+ 	key_t key = params->key;
+ 	int shmflg = params->flg;
+ 	size_t size = params->u.size;
++	int shmid = params->id;
+ 	int error;
+ 	struct shmid_kernel *shp;
+ 	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ 	struct file * file;
+-	char name[13];
++	char name[64];
+ 	int id;
+ 
+ 	if (size < SHMMIN || size > ns->shm_ctlmax)
+@@ -377,7 +397,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+ 		return error;
+ 	}
+ 
+-	sprintf (name, "SYSV%08x", key);
++	snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key);
+ 	if (shmflg & SHM_HUGETLB) {
+ 		/* hugetlb_file_setup takes care of mlock user accounting */
+ 		file = hugetlb_file_setup(name, size);
+@@ -397,7 +417,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+ 	if (IS_ERR(file))
+ 		goto no_file;
+ 
+-	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
++	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid);
+ 	if (id < 0) {
+ 		error = id;
+ 		goto no_id;
+@@ -470,6 +490,7 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
+ 	shm_params.key = key;
+ 	shm_params.flg = shmflg;
+ 	shm_params.u.size = size;
++	shm_params.id = -1;
+ 
+ 	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+ }
+@@ -778,14 +799,14 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
+ 		if(cmd==SHM_LOCK) {
+ 			struct user_struct * user = current->user;
+ 			if (!is_file_hugepages(shp->shm_file)) {
+-				err = shmem_lock(shp->shm_file, 1, user);
++				err = shmem_lock(shp, 1, user);
+ 				if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){
+ 					shp->shm_perm.mode |= SHM_LOCKED;
+ 					shp->mlock_user = user;
+ 				}
+ 			}
+ 		} else if (!is_file_hugepages(shp->shm_file)) {
+-			shmem_lock(shp->shm_file, 0, shp->mlock_user);
++			shmem_lock(shp, 0, shp->mlock_user);
+ 			shp->shm_perm.mode &= ~SHM_LOCKED;
+ 			shp->mlock_user = NULL;
+ 		}
+@@ -1084,3 +1105,67 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
+ 			  shp->shm_ctim);
+ }
+ #endif
++
++#ifdef CONFIG_VE
++#include <linux/module.h>
++
++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg)
++{
++	struct ipc_namespace *ns;
++	struct ipc_ops shm_ops;
++	struct ipc_params shm_params;
++	struct shmid_kernel *shp;
++	struct file *file;
++	int rv;
++
++	ns = current->nsproxy->ipc_ns;
++
++	shm_ops.getnew = newseg;
++	shm_ops.associate = shm_security;
++	shm_ops.more_checks = shm_more_checks;
++
++	shm_params.key = key;
++	shm_params.flg = shmflg | IPC_CREAT;
++	shm_params.u.size = size;
++	shm_params.id = shmid;
++
++	rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
++	if (rv < 0)
++		return ERR_PTR(rv);
++	shp = shm_lock(ns, rv);
++	BUG_ON(IS_ERR(shp));
++	file = shp->shm_file;
++	get_file(file);
++	shm_unlock(shp);
++	return file;
++}
++EXPORT_SYMBOL_GPL(sysvipc_setup_shm);
++
++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg)
++{
++	int err = 0;
++	struct shmid_kernel* shp;
++	struct ipc_namespace *ns;
++	int next_id;
++	int total, in_use;
++
++	ns = current->nsproxy->ipc_ns;
++
++	down_write(&shm_ids(ns).rw_mutex);
++	in_use = shm_ids(ns).in_use;
++	for (total = 0, next_id = 0; total < in_use; next_id++) {
++		shp = idr_find(&shm_ids(ns).ipcs_idr, next_id);
++		if (shp == NULL)
++			continue;
++		ipc_lock_by_ptr(&shp->shm_perm);
++		err = func(shp, arg);
++		shm_unlock(shp);
++		if (err)
++			break;
++		total++;
++	}
++	up_write(&shm_ids(ns).rw_mutex);
++	return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_shm);
++#endif
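
sysvipc_setup_shm() above is the restore-side twin of sys_shmget(): it forces
IPC_CREAT with a caller-chosen shmid and returns the backing file with a
reference held. A hedged usage sketch (all values hypothetical):

	struct file *file;

	file = sysvipc_setup_shm(key, shmid, size, shmflg);
	if (IS_ERR(file))
		return PTR_ERR(file);
	/* ... map the segment into the restored mm ... */
	fput(file);
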
+diff --git a/ipc/util.c b/ipc/util.c
+index 3339177..5f1b3a2 100644
+--- a/ipc/util.c
++++ b/ipc/util.c
+@@ -38,6 +38,8 @@
+ 
+ #include <asm/unistd.h>
+ 
++#include <bc/kmem.h>
++
+ #include "util.h"
+ 
+ struct ipc_proc_iface {
+@@ -247,6 +249,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
+  *	@ids: IPC identifier set
+  *	@new: new IPC permission set
+  *	@size: limit for the number of used ids
++ *	@reqid: if >= 0, allocate exactly this id; if -1, any free id will do.
+  *
+  *	Add an entry 'new' to the IPC ids idr. The permissions object is
+  *	initialised and the first free entry is set up and the id assigned
+@@ -256,10 +259,18 @@ int ipc_get_maxid(struct ipc_ids *ids)
+  *	Called with ipc_ids.rw_mutex held as a writer.
+  */
+  
+-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid)
+ {
+ 	int id, err;
+ 
++	if (reqid >= 0) {
++		id = reqid % SEQ_MULTIPLIER;
++		err = idr_get_new_above(&ids->ipcs_idr, new, id, &id);
++		if (err || id != (reqid % SEQ_MULTIPLIER))
++			return -EEXIST;
++		goto found;
++	}
++
+ 	if (size > IPCMNI)
+ 		size = IPCMNI;
+ 
+@@ -270,14 +281,19 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+ 	if (err)
+ 		return err;
+ 
++found:
+ 	ids->in_use++;
+ 
+ 	new->cuid = new->uid = current->euid;
+ 	new->gid = new->cgid = current->egid;
+ 
+-	new->seq = ids->seq++;
+-	if(ids->seq > ids->seq_max)
+-		ids->seq = 0;
++	if (reqid >= 0) {
++		new->seq = reqid/SEQ_MULTIPLIER;
++	} else {
++		new->seq = ids->seq++;
++		if(ids->seq > ids->seq_max)
++			ids->seq = 0;
++	}
+ 
+ 	new->id = ipc_buildid(id, new->seq);
+ 	spin_lock_init(&new->lock);
+@@ -445,9 +461,9 @@ void* ipc_alloc(int size)
+ {
+ 	void* out;
+ 	if(size > PAGE_SIZE)
+-		out = vmalloc(size);
++		out = ub_vmalloc(size);
+ 	else
+-		out = kmalloc(size, GFP_KERNEL);
++		out = kmalloc(size, GFP_KERNEL_UBC);
+ 	return out;
+ }
+ 
+@@ -530,14 +546,14 @@ void* ipc_rcu_alloc(int size)
+ 	 * workqueue if necessary (for vmalloc). 
+ 	 */
+ 	if (rcu_use_vmalloc(size)) {
+-		out = vmalloc(HDRLEN_VMALLOC + size);
++		out = ub_vmalloc(HDRLEN_VMALLOC + size);
+ 		if (out) {
+ 			out += HDRLEN_VMALLOC;
+ 			container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
+ 			container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
+ 		}
+ 	} else {
+-		out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
++		out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC);
+ 		if (out) {
+ 			out += HDRLEN_KMALLOC;
+ 			container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
+@@ -724,6 +740,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
+ 
+ 	return out;
+ }
++EXPORT_SYMBOL_GPL(ipc_lock);
+ 
+ /**
+  * ipc_lock_down - Lock an ipc structure with rw_sem held
+@@ -863,7 +880,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
+ 			goto out_unlock;
+ 	}
+ 	if (current->euid == ipcp->cuid ||
+-	    current->euid == ipcp->uid || capable(CAP_SYS_ADMIN))
++	    current->euid == ipcp->uid || capable(CAP_VE_SYS_ADMIN))
+ 		return ipcp;
+ 
+ 	err = -EPERM;
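
The reqid path in ipc_addid() above exists so that a restored container gets
its IPC ids back verbatim: an id decomposes as seq * SEQ_MULTIPLIER + index,
and both halves are forced instead of allocated. Worked example, with
SEQ_MULTIPLIER == IPCMNI == 32768:

	/* restoring id 98305: slot 98305 % 32768 == 1 is requested
	 * from the idr, seq is forced to 98305 / 32768 == 3, and
	 * ipc_buildid(1, 3) == 3 * 32768 + 1 == 98305 again */
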
+diff --git a/ipc/util.h b/ipc/util.h
+index cdb966a..e45893d 100644
+--- a/ipc/util.h
++++ b/ipc/util.h
+@@ -39,6 +39,7 @@ struct ipc_params {
+ 		size_t size;	/* for shared memories */
+ 		int nsems;	/* for semaphores */
+ 	} u;			/* holds the getnew() specific param */
++	int id;
+ };
+ 
+ /*
+@@ -68,14 +69,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
+ #define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
+ #endif
+ 
+-#define IPC_SEM_IDS	0
+-#define IPC_MSG_IDS	1
+-#define IPC_SHM_IDS	2
+-
+ #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
+ 
+ /* must be called with ids->rw_mutex acquired for writing */
+-int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
++int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int);
+ 
+ /* must be called with ids->rw_mutex acquired for reading */
+ int ipc_get_maxid(struct ipc_ids *);
+@@ -107,7 +104,6 @@ void ipc_rcu_putref(void *ptr);
+  * ipc_lock: called without that lock held
+  */
+ struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *, int);
+-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+ 
+ void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
+ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
+@@ -149,12 +145,6 @@ static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
+ 	spin_lock(&perm->lock);
+ }
+ 
+-static inline void ipc_unlock(struct kern_ipc_perm *perm)
+-{
+-	spin_unlock(&perm->lock);
+-	rcu_read_unlock();
+-}
+-
+ struct kern_ipc_perm *ipc_lock_check_down(struct ipc_ids *ids, int id);
+ struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
+ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
+diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz
+new file mode 100644
+index 0000000..dfd54fd
+--- /dev/null
++++ b/kernel/Kconfig.openvz
+@@ -0,0 +1,91 @@
++# Copyright (C) 2005  SWsoft
++# All rights reserved.
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++menu "OpenVZ"
++
++config VE
++	bool "Virtual Environment support"
++	default y
++	select NAMESPACES
++	select PID_NS
++	select IPC_NS
++	select UTS_NS
++	select NET_NS
++	select USER_NS
++	select CGROUPS
++	select CGROUP_DEVICE
++	select GROUP_SCHED
++	select FAIR_GROUP_SCHED
++	help
++	  This option adds support for virtual Linux environments running on
++	  the original box, with a fully supported virtual network driver,
++	  a virtualized tty subsystem and configurable access to hardware
++	  and other resources.
++
++config VE_CALLS
++	tristate "VE calls interface"
++	depends on VE
++	select VZ_DEV
++	default m
++	help
++	  This option controls how to build the vzmon code containing the
++	  VE calls. By default it is built as the module vzmon.o.
++
++config VZ_GENCALLS
++	bool
++	default y
++
++config VE_NETDEV
++	tristate "VE network device"
++	depends on VE_CALLS && NET
++	select VZ_DEV
++	default m
++	help
++	  This option controls whether to build the venet device. This is a
++	  common interface for networking in a VE.
++
++config VE_ETHDEV
++	tristate "Virtual ethernet device"
++	depends on VE_CALLS && NET
++	select VZ_DEV
++	default m
++	help
++	  This option controls whether to build the virtual ethernet device.
++
++config VZ_DEV
++	tristate "VE device"
++	default m
++	help
++	  This option adds support for the vzdev device, which is used by
++	  user-space applications to control Virtual Environments.
++
++config VE_IPTABLES
++	bool "VE netfiltering"
++	depends on VE && VE_NETDEV && INET && NETFILTER
++	default y
++	help
++	  This option controls whether to build VE netfiltering code.
++
++config VZ_WDOG
++	tristate "VE watchdog module"
++	depends on VE_CALLS
++	default m
++	help
++	  This option controls building of the vzwdog module, which
++	  periodically dumps useful system info to the console.
++ 
++config VZ_CHECKPOINT
++ 	tristate "Checkpointing & restoring Virtual Environments"
++ 	depends on VE_CALLS && INET
++	select PM
++	select PM_SLEEP
++	select TUN
++	select VE_ETHDEV
++	select VE_NETDEV
++ 	default n
++ 	help
++ 	  This option adds two modules, "cpt" and "rst", which allow
++ 	  saving a running Virtual Environment and restoring it
++ 	  on another host (live migration) or on the same host (checkpointing).
++
++endmenu
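
Taking the defaults above at face value, a build that just accepts the
prompts would end up with roughly this .config fragment (illustrative only,
not the actual amd64 flavour config):

	CONFIG_VE=y
	CONFIG_VE_CALLS=m
	CONFIG_VE_NETDEV=m
	CONFIG_VE_ETHDEV=m
	CONFIG_VZ_DEV=m
	CONFIG_VE_IPTABLES=y
	CONFIG_VZ_WDOG=m
	# CONFIG_VZ_CHECKPOINT is not set
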
+diff --git a/kernel/Makefile b/kernel/Makefile
+index 1c9938a..d16fa33 100644
+--- a/kernel/Makefile
++++ b/kernel/Makefile
+@@ -14,6 +14,10 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+ obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
+ obj-$(CONFIG_STACKTRACE) += stacktrace.o
+ obj-y += time/
++obj-$(CONFIG_BEANCOUNTERS) += bc/
++obj-y += ve/
++obj-$(CONFIG_VZ_CHECKPOINT) += cpt/
++
+ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+ obj-$(CONFIG_LOCKDEP) += lockdep.o
+ ifeq ($(CONFIG_PROC_FS),y)
+@@ -38,7 +42,11 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+ obj-$(CONFIG_KEXEC) += kexec.o
+ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
+ obj-$(CONFIG_COMPAT) += compat.o
++ifeq ($(CONFIG_VE),n)
+ obj-$(CONFIG_CGROUPS) += cgroup.o
++else
++obj-$(CONFIG_CGROUPS) += cgroup_lite.o
++endif
+ obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
+ obj-$(CONFIG_CPUSETS) += cpuset.o
+ obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+@@ -69,6 +77,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+ obj-$(CONFIG_MARKERS) += marker.o
+ obj-$(CONFIG_LATENCYTOP) += latencytop.o
++obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o
+ 
+ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ # According to Alan Modra <alan at linuxcare.com.au>, the -fno-omit-frame-pointer is
+diff --git a/kernel/audit.c b/kernel/audit.c
+index e092f1c..ddc9b19 100644
+--- a/kernel/audit.c
++++ b/kernel/audit.c
+@@ -666,6 +666,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+ 	char			*ctx = NULL;
+ 	u32			len;
+ 
++	if (!ve_is_super(skb->owner_env))
++		return -ECONNREFUSED;
++
+ 	err = audit_netlink_ok(skb, msg_type);
+ 	if (err)
+ 		return err;
+diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
+index 98c50cc..450142b 100644
+--- a/kernel/auditfilter.c
++++ b/kernel/auditfilter.c
+@@ -164,8 +164,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+ 	inotify_init_watch(&parent->wdata);
+ 	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+ 	get_inotify_watch(&parent->wdata);
+-	wd = inotify_add_watch(audit_ih, &parent->wdata,
+-			       ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
++	wd = inotify_add_watch_dget(audit_ih, &parent->wdata,
++			&ndp->path, AUDIT_IN_WATCH);
+ 	if (wd < 0) {
+ 		audit_free_parent(&parent->wdata);
+ 		return ERR_PTR(wd);
+diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig
+new file mode 100644
+index 0000000..2c3de4a
+--- /dev/null
++++ b/kernel/bc/Kconfig
+@@ -0,0 +1,111 @@
++#
++# User resources part (UBC)
++#
++# Copyright (C) 2005  SWsoft
++# All rights reserved.
++#
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++menu "User resources"
++
++config BEANCOUNTERS
++	bool "Enable user resource accounting"
++	default y
++	help 
++          This patch provides accounting and allows configuring limits on
++          users' consumption of exhaustible system resources. The most
++          important resource controlled by this patch is unswappable memory
++          (either mlock'ed or used by internal kernel structures and
++          buffers). The main goal of this patch is to protect processes
++          from running short of important resources because of accidental
++          misbehavior of processes or malicious activity aiming to ``kill''
++          the system. It is worth mentioning that the resource limits
++          configured by setrlimit(2) do not give an acceptable level of
++          protection because they cover only a small fraction of resources
++          and work on a per-process basis. Per-process accounting does not
++          prevent malicious users from spawning many resource-consuming
++          processes.
++
++config BC_RSS_ACCOUNTING
++	bool "Account physical memory usage"
++	default y
++	depends on BEANCOUNTERS
++	help
++          This allows estimating per-beancounter physical memory usage.
++          The implemented algorithm accounts shared pages as well, dividing
++          each page's charge among the beancounters that use it.
++
++config BC_IO_ACCOUNTING
++	bool "Account disk IO"
++	default y
++	depends on BC_RSS_ACCOUNTING
++	help
++	  When on, this option allows seeing the disk IO activity caused by
++	  tasks from each UB.
++
++config BC_IO_SCHED
++	bool "UBC I/O priority"
++	default y
++	depends on BC_IO_ACCOUNTING && IOSCHED_CFQ
++	help
++	  This option controls whether to build CFQ I/O scheduler
++	  with support of UBC I/O priority.
++
++config BC_SWAP_ACCOUNTING
++	bool "Account swap usage"
++	default y
++	depends on BEANCOUNTERS
++	help
++          This allows accounting of swap usage.
++
++config BC_PROC
++	bool "Report resource usage in /proc"
++	default y
++	depends on BEANCOUNTERS
++	help
++          Allows a system administrator to inspect resource accounts and limits.
++
++config BC_DEBUG
++	bool "User resources debug features"
++	default n
++	depends on BEANCOUNTERS
++	help
++	  Enables debug features for user resource accounting.
++
++config BC_DEBUG_IO
++	bool "Debug IO accounting"
++	default y
++	depends on BC_DEBUG && BC_IO_ACCOUNTING
++	help
++	  Debugging for IO accounting.
++
++config BC_DEBUG_KMEM
++	bool "Debug kmemsize with cache counters"
++	default n
++	depends on BC_DEBUG
++	help
++	  Adds a /proc/user_beancounters_debug entry to get statistics
++	  about the cache usage of each beancounter.
++
++config BC_KEEP_UNUSED
++	bool "Keep unused beancounter alive"
++	default y
++	depends on BC_DEBUG
++	help
++	  If on, unused beancounters are kept in the hash so that their
++	  maxheld values can still be inspected.
++
++config BC_DEBUG_ITEMS
++	bool "Account resources in items rather than in bytes"
++	default y
++	depends on BC_DEBUG
++	help
++	  When on, some of the resources (e.g. kmemsize) are accounted
++	  in items instead of bytes.
++
++config BC_UNLIMITED
++	bool "Use unlimited ubc settings"
++	default y
++	depends on BC_DEBUG
++	help
++	  When ON all limits and barriers are set to max values.
++endmenu
+diff --git a/kernel/bc/Makefile b/kernel/bc/Makefile
+new file mode 100644
+index 0000000..e0e6529
+--- /dev/null
++++ b/kernel/bc/Makefile
+@@ -0,0 +1,16 @@
++#
++# User resources part (UBC)
++#
++# Copyright (C) 2005  SWsoft
++# All rights reserved.
++#
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \
++	 vm_pages.o statd.o oom_kill.o
++
++obj-$(CONFIG_NET) += net.o
++obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o
++obj-$(CONFIG_BC_PROC)  += proc.o
++obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o
++obj-$(CONFIG_BC_IO_SCHED) += io_prio.o
+diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
+new file mode 100644
+index 0000000..48fa1cc
+--- /dev/null
++++ b/kernel/bc/beancounter.c
+@@ -0,0 +1,676 @@
++/*
++ *  linux/kernel/bc/beancounter.c
++ *
++ *  Copyright (C) 1998  Alan Cox
++ *                1998-2000  Andrey V. Savochkin <saw at saw.sw.com.sg>
++ *  Copyright (C) 2000-2005 SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * TODO:
++ *   - more intelligent limit check in mremap(): currently the new size is
++ *     charged and _then_ old size is uncharged
++ *     (almost done: !move_vma case is completely done,
++ *      move_vma in its current implementation requires too many conditions to
++ *      do things right, because it may be not only expansion, but shrinking
++ *      also, plus do_munmap will require an additional parameter...)
++ *   - problem: bad pmd page handling
++ *   - consider /proc redesign
++ *   - TCP/UDP ports
++ *   + consider whether __charge_beancounter_locked should be inline
++ *
++ * Changes:
++ *   1999/08/17  Marcelo Tosatti <marcelo at conectiva.com.br>
++ *	- Set "barrier" and "limit" parts of limits atomically.
++ *   1999/10/06  Marcelo Tosatti <marcelo at conectiva.com.br>
++ *	- setublimit system call.
++ */
++
++#include <linux/slab.h>
++#include <linux/module.h>
++#include <linux/mm.h>
++
++#include <bc/beancounter.h>
++#include <bc/hash.h>
++#include <bc/vmpages.h>
++#include <bc/proc.h>
++#include <bc/io_prio.h>
++
++static struct kmem_cache *ub_cachep;
++static struct user_beancounter default_beancounter;
++struct user_beancounter ub0;
++EXPORT_SYMBOL_GPL(ub0);
++
++const char *ub_rnames[] = {
++	"kmemsize",	/* 0 */
++	"lockedpages",
++	"privvmpages",
++	"shmpages",
++	"dummy",
++	"numproc",	/* 5 */
++	"physpages",
++	"vmguarpages",
++	"oomguarpages",
++	"numtcpsock",
++	"numflock",	/* 10 */
++	"numpty",
++	"numsiginfo",
++	"tcpsndbuf",
++	"tcprcvbuf",
++	"othersockbuf",	/* 15 */
++	"dgramrcvbuf",
++	"numothersock",
++	"dcachesize",
++	"numfile",
++	"dummy",	/* 20 */
++	"dummy",
++	"dummy",
++	"numiptent",
++	"unused_privvmpages",	/* UB_RESOURCES */
++	"tmpfs_respages",
++	"swap_pages",
++	"held_pages",
++};
++
++static void init_beancounter_struct(struct user_beancounter *ub);
++static void init_beancounter_store(struct user_beancounter *ub);
++static void init_beancounter_nolimits(struct user_beancounter *ub);
++
++int print_ub_uid(struct user_beancounter *ub, char *buf, int size)
++{
++	if (ub->parent != NULL)
++		return snprintf(buf, size, "%u.%u",
++				ub->parent->ub_uid, ub->ub_uid);
++	else
++		return snprintf(buf, size, "%u", ub->ub_uid);
++}
++EXPORT_SYMBOL(print_ub_uid);
++
++#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
++#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
++struct hlist_head ub_hash[UB_HASH_SIZE];
++DEFINE_SPINLOCK(ub_hash_lock);
++LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */
++EXPORT_SYMBOL(ub_hash);
++EXPORT_SYMBOL(ub_hash_lock);
++EXPORT_SYMBOL(ub_list_head);
++
++/*
++ *	Per user resource beancounting. Resources are tied to their luid.
++ *	The resource structure itself is tagged both to the process and
++ *	the charging resources (a socket doesn't want to have to search for
++ *	things at irq time for example). Reference counters keep things in
++ *	hand.
++ *
++ *	The case where a user creates resource, kills all his processes and
++ *	then starts new ones is correctly handled this way. The refcounters
++ *	will mean the old entry is still around with resource tied to it.
++ */
++
++static inline void free_ub(struct user_beancounter *ub)
++{
++	free_percpu(ub->ub_percpu);
++	kmem_cache_free(ub_cachep, ub);
++}
++
++static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash,
++		uid_t uid, struct user_beancounter *parent)
++{
++	struct user_beancounter *ub;
++	struct hlist_node *ptr;
++
++	hlist_for_each_entry (ub, ptr, hash, ub_hash)
++		if (ub->ub_uid == uid && ub->parent == parent)
++			return get_beancounter(ub);
++
++	return NULL;
++}
++
++struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
++{
++	struct user_beancounter *new_ub, *ub;
++	unsigned long flags;
++	struct hlist_head *hash;
++
++	hash = &ub_hash[ub_hash_fun(uid)];
++	new_ub = NULL;
++retry:
++	spin_lock_irqsave(&ub_hash_lock, flags);
++	ub = bc_lookup_hash(hash, uid, NULL);
++	if (ub != NULL) {
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++		if (new_ub != NULL)
++			free_ub(new_ub);
++		return ub;
++	}
++
++	if (!create) {
++		/* no ub found */
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++		return NULL;
++	}
++
++	if (new_ub != NULL) {
++		list_add_rcu(&new_ub->ub_list, &ub_list_head);
++		hlist_add_head(&new_ub->ub_hash, hash);
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++		return new_ub;
++	}
++	spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++	/* alloc new ub */
++	new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, 
++			GFP_KERNEL);
++	if (new_ub == NULL)
++		return NULL;
++
++	ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub);
++	memcpy(new_ub, &default_beancounter, sizeof(*new_ub));
++	init_beancounter_struct(new_ub);
++	new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
++	if (new_ub->ub_percpu == NULL)
++		goto fail_free;
++	new_ub->ub_uid = uid;
++	goto retry;
++
++fail_free:
++	kmem_cache_free(ub_cachep, new_ub);
++	return NULL;
++}
++EXPORT_SYMBOL(get_beancounter_byuid);
++
++struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p,
++		int id, int create)
++{
++	struct user_beancounter *new_ub, *ub;
++	unsigned long flags;
++	struct hlist_head *hash;
++
++	hash = &ub_hash[ub_subhash_fun(p, id)];
++	new_ub = NULL;
++retry:
++	spin_lock_irqsave(&ub_hash_lock, flags);
++	ub = bc_lookup_hash(hash, id, p);
++	if (ub != NULL) {
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++		if (new_ub != NULL) {
++			put_beancounter(new_ub->parent);
++			free_ub(new_ub);
++		}
++		return ub;
++	}
++
++	if (!create) {
++		/* no ub found */
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++		return NULL;
++	}
++
++	if (new_ub != NULL) {
++		list_add_rcu(&new_ub->ub_list, &ub_list_head);
++		hlist_add_head(&new_ub->ub_hash, hash);
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++		return new_ub;
++	}
++	spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++	/* alloc new ub */
++	new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, 
++			GFP_KERNEL);
++	if (new_ub == NULL)
++		return NULL;
++
++	ub_debug(UBD_ALLOC, "Creating sub %p\n", new_ub);
++	memset(new_ub, 0, sizeof(*new_ub));
++	init_beancounter_nolimits(new_ub);
++	init_beancounter_store(new_ub);
++	init_beancounter_struct(new_ub);
++	new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
++	if (new_ub->ub_percpu == NULL)
++		goto fail_free;
++	new_ub->ub_uid = id;
++	new_ub->parent = get_beancounter(p);
++	goto retry;
++
++fail_free:
++	kmem_cache_free(ub_cachep, new_ub);
++	return NULL;
++}
++EXPORT_SYMBOL(get_subbeancounter_byid);
++
++static void put_warn(struct user_beancounter *ub)
++{
++	char id[64];
++
++	print_ub_uid(ub, id, sizeof(id));
++	printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n",
++			atomic_read(&ub->ub_refcount), id, ub);
++}
++
++#ifdef CONFIG_BC_KEEP_UNUSED
++#define release_beancounter(ub)	do { } while (0)
++#else
++static int verify_res(struct user_beancounter *ub, int resource,
++		unsigned long held)
++{
++	char id[64];
++
++	if (likely(held == 0))
++		return 1;
++
++	print_ub_uid(ub, id, sizeof(id));
++	printk(KERN_WARNING "Ub %s holds %lu in %s on put\n",
++			id, held, ub_rnames[resource]);
++	return 0;
++}
++
++static inline void bc_verify_held(struct user_beancounter *ub)
++{
++	int i, clean;
++
++	clean = 1;
++	for (i = 0; i < UB_RESOURCES; i++)
++		clean &= verify_res(ub, i, ub->ub_parms[i].held);
++
++	clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages);
++	clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages);
++	clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages);
++	clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages);
++
++	ub_debug_trace(!clean, 5, 60*HZ);
++}
++
++static void bc_free_rcu(struct rcu_head *rcu)
++{
++	struct user_beancounter *ub;
++
++	ub = container_of(rcu, struct user_beancounter, rcu);
++	free_ub(ub);
++}
++
++static void delayed_release_beancounter(struct work_struct *w)
++{
++	struct user_beancounter *ub, *parent;
++	unsigned long flags;
++
++	ub = container_of(w, struct user_beancounter, cleanup.work);
++again:
++	local_irq_save(flags);
++	if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) {
++		/* raced with get_beancounter_byuid */
++		local_irq_restore(flags);
++		return;
++	}
++
++	hlist_del(&ub->ub_hash);
++	list_del_rcu(&ub->ub_list);
++	spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++	bc_verify_held(ub);
++	ub_free_counters(ub);
++ 	bc_fini_ioprio(&ub->iopriv);
++	parent = ub->parent;
++
++	call_rcu(&ub->rcu, bc_free_rcu);
++	if (parent) {
++		ub = parent;
++		goto again;
++	}
++}
++
++static inline void release_beancounter(struct user_beancounter *ub)
++{
++	struct execute_work *ew;
++
++	ew = &ub->cleanup;
++	INIT_WORK(&ew->work, delayed_release_beancounter);
++	schedule_work(&ew->work);
++}
++#endif
++
++void __put_beancounter(struct user_beancounter *ub)
++{
++	unsigned long flags;
++
++	/* equivalent to atomic_dec_and_lock_irqsave() */
++	local_irq_save(flags);
++	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
++		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
++			put_warn(ub);
++		local_irq_restore(flags);
++		return;
++	}
++
++	if (unlikely(ub == get_ub0())) {
++		printk(KERN_ERR "Trying to put ub0\n");
++		spin_unlock_irqrestore(&ub_hash_lock, flags);
++		return;
++	}
++
++	/* prevent get_beancounter_byuid + put_beancounter() reentrance */
++	atomic_inc(&ub->ub_refcount);
++	spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++	release_beancounter(ub);
++}
++EXPORT_SYMBOL(__put_beancounter);
++
++void put_beancounter_safe(struct user_beancounter *ub)
++{
++	synchronize_rcu();
++	__put_beancounter(ub);
++}
++EXPORT_SYMBOL(put_beancounter_safe);
++
++/*
++ *	Generic resource charging stuff
++ */
++
++int __charge_beancounter_locked(struct user_beancounter *ub,
++		int resource, unsigned long val, enum ub_severity strict)
++{
++	ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n",
++			val, resource, ub, ub->ub_parms[resource].held);
++	/*
++	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
++	 * at the moment is possible so an overflow is impossible.  
++	 */
++	ub->ub_parms[resource].held += val;
++
++	switch (strict) {
++		case UB_HARD:
++			if (ub->ub_parms[resource].held >
++					ub->ub_parms[resource].barrier)
++				break;
++		case UB_SOFT:
++			if (ub->ub_parms[resource].held >
++					ub->ub_parms[resource].limit)
++				break;
++		case UB_FORCE:
++			ub_adjust_maxheld(ub, resource);
++			return 0;
++		default:
++			BUG();
++	}
++
++	if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl))
++		printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n",
++		       ub_rnames[resource], ub->ub_uid);
++	ub->ub_parms[resource].failcnt++;
++	ub->ub_parms[resource].held -= val;
++	return -ENOMEM;
++}
++
++int charge_beancounter(struct user_beancounter *ub,
++		int resource, unsigned long val, enum ub_severity strict)
++{
++	int retval;
++	struct user_beancounter *p, *q;
++	unsigned long flags;
++
++	retval = -EINVAL;
++	if (val > UB_MAXVALUE)
++		goto out;
++
++	local_irq_save(flags);
++	for (p = ub; p != NULL; p = p->parent) {
++		spin_lock(&p->ub_lock);
++		retval = __charge_beancounter_locked(p, resource, val, strict);
++		spin_unlock(&p->ub_lock);
++		if (retval)
++			goto unroll;
++	}
++out_restore:
++	local_irq_restore(flags);
++out:
++	return retval;
++
++unroll:
++	for (q = ub; q != p; q = q->parent) {
++		spin_lock(&q->ub_lock);
++		__uncharge_beancounter_locked(q, resource, val);
++		spin_unlock(&q->ub_lock);
++	}
++	goto out_restore;
++}
++
++EXPORT_SYMBOL(charge_beancounter);
++
++void __charge_beancounter_notop(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	struct user_beancounter *p;
++	unsigned long flags;
++
++	local_irq_save(flags);
++	for (p = ub; p->parent != NULL; p = p->parent) {
++		spin_lock(&p->ub_lock);
++		__charge_beancounter_locked(p, resource, val, UB_FORCE);
++		spin_unlock(&p->ub_lock);
++	}
++	local_irq_restore(flags);
++}
++
++EXPORT_SYMBOL(__charge_beancounter_notop);
++
++void uncharge_warn(struct user_beancounter *ub, int resource,
++		unsigned long val, unsigned long held)
++{
++	char id[64];
++
++	print_ub_uid(ub, id, sizeof(id));
++	printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n",
++			val, held, ub_rnames[resource], id);
++	ub_debug_trace(1, 10, 10*HZ);
++}
++
++void __uncharge_beancounter_locked(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n",
++			val, resource, ub, ub->ub_parms[resource].held);
++	if (ub->ub_parms[resource].held < val) {
++		uncharge_warn(ub, resource,
++				val, ub->ub_parms[resource].held);
++		val = ub->ub_parms[resource].held;
++	}
++	ub->ub_parms[resource].held -= val;
++}
++
++void uncharge_beancounter(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	unsigned long flags;
++	struct user_beancounter *p;
++
++	for (p = ub; p != NULL; p = p->parent) {
++		spin_lock_irqsave(&p->ub_lock, flags);
++		__uncharge_beancounter_locked(p, resource, val);
++		spin_unlock_irqrestore(&p->ub_lock, flags);
++	}
++}
++
++EXPORT_SYMBOL(uncharge_beancounter);
++
++void __uncharge_beancounter_notop(struct user_beancounter *ub,
++		int resource, unsigned long val)
++{
++	struct user_beancounter *p;
++	unsigned long flags;
++
++	local_irq_save(flags);
++	for (p = ub; p->parent != NULL; p = p->parent) {
++		spin_lock(&p->ub_lock);
++		__uncharge_beancounter_locked(p, resource, val);
++		spin_unlock(&p->ub_lock);
++	}
++	local_irq_restore(flags);
++}
++
++EXPORT_SYMBOL(__uncharge_beancounter_notop);
++
++
++/*
++ *	Rate limiting stuff.
++ */
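++/*
++ * ub_ratelimit() implements a small token bucket: up to "burst" events
++ * may pass within one "interval", and the bucket drains by one token
++ * per elapsed interval.  Returns 1 if the event is allowed, 0 if it
++ * should be suppressed.
++ */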
++int ub_ratelimit(struct ub_rate_info *p)
++{
++	unsigned long cjif, djif;
++	unsigned long flags;
++	static DEFINE_SPINLOCK(ratelimit_lock);
++	long new_bucket;
++
++	spin_lock_irqsave(&ratelimit_lock, flags);
++	cjif = jiffies;
++	djif = cjif - p->last;
++	if (djif < p->interval) {
++		if (p->bucket >= p->burst) {
++			spin_unlock_irqrestore(&ratelimit_lock, flags);
++			return 0;
++		}
++		p->bucket++;
++	} else {
++		new_bucket = p->bucket - (djif / (unsigned)p->interval);
++		if (new_bucket < 0)
++			new_bucket = 0;
++		p->bucket = new_bucket + 1;
++	}
++	p->last = cjif;
++	spin_unlock_irqrestore(&ratelimit_lock, flags);
++	return 1;
++}
++EXPORT_SYMBOL(ub_ratelimit);
++
++
++/*
++ *	Initialization
++ *
++ *	struct user_beancounter contains
++ *	 - limits and other configuration settings,
++ *	   with a copy stored for accounting purposes,
++ *	 - structural fields: lists, spinlocks and so on.
++ *
++ *	Before these parts are initialized, the structure should be memset
++ *	to 0 or copied from a known clean structure.  That takes care of a lot
++ *	of fields not initialized explicitly.
++ */
++
++static void init_beancounter_struct(struct user_beancounter *ub)
++{
++	ub->ub_magic = UB_MAGIC;
++	atomic_set(&ub->ub_refcount, 1);
++	spin_lock_init(&ub->ub_lock);
++	INIT_LIST_HEAD(&ub->ub_tcp_sk_list);
++	INIT_LIST_HEAD(&ub->ub_other_sk_list);
++#ifdef CONFIG_BC_DEBUG_KMEM
++	INIT_LIST_HEAD(&ub->ub_cclist);
++#endif
++	bc_init_ioprio(&ub->iopriv);
++}
++
++static void init_beancounter_store(struct user_beancounter *ub)
++{
++	int k;
++
++	for (k = 0; k < UB_RESOURCES; k++) {
++		memcpy(&ub->ub_store[k], &ub->ub_parms[k],
++				sizeof(struct ubparm));
++	}
++}
++
++static void init_beancounter_nolimits(struct user_beancounter *ub)
++{
++	int k;
++
++	for (k = 0; k < UB_RESOURCES; k++) {
++		ub->ub_parms[k].limit = UB_MAXVALUE;
++		/* FIXME: is this right for physpages and guarantees? */
++		ub->ub_parms[k].barrier = UB_MAXVALUE;
++	}
++
++	/* FIXME: set unlimited rate? */
++	ub->ub_limit_rl.burst = 4;
++	ub->ub_limit_rl.interval = 300*HZ;
++}
++
++static void init_beancounter_syslimits(struct user_beancounter *ub)
++{
++	unsigned long mp;
++	extern int max_threads;
++	int k;
++
++	mp = num_physpages;
++	ub->ub_parms[UB_KMEMSIZE].limit = 
++		mp > (192*1024*1024 >> PAGE_SHIFT) ?
++				32*1024*1024 : (mp << PAGE_SHIFT) / 6;
++	ub->ub_parms[UB_LOCKEDPAGES].limit = 8;
++	ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE;
++	ub->ub_parms[UB_SHMPAGES].limit = 64;
++	ub->ub_parms[UB_NUMPROC].limit = max_threads / 2;
++	ub->ub_parms[UB_NUMTCPSOCK].limit = 1024;
++	ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */
++	ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */
++	ub->ub_parms[UB_NUMOTHERSOCK].limit = 256;
++	ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */
++	ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */
++	ub->ub_parms[UB_NUMFLOCK].limit = 1024;
++	ub->ub_parms[UB_NUMPTY].limit = 16;
++	ub->ub_parms[UB_NUMSIGINFO].limit = 1024;
++	ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024;
++	ub->ub_parms[UB_NUMFILE].limit = 1024;
++
++	for (k = 0; k < UB_RESOURCES; k++)
++		ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
++
++	ub->ub_limit_rl.burst = 4;
++	ub->ub_limit_rl.interval = 300*HZ;
++}
++
++#ifdef CONFIG_SMP
++static struct percpu_data ub0_percpu;
++#endif
++static struct ub_percpu_struct ub0_percpu_data[NR_CPUS];
++
++void __init ub_init_early(void)
++{
++	struct user_beancounter *ub;
++
++	init_cache_counters();
++	ub = get_ub0();
++	memset(ub, 0, sizeof(*ub));
++	ub->ub_uid = 0;
++	init_beancounter_nolimits(ub);
++	init_beancounter_store(ub);
++	init_beancounter_struct(ub);
++	ub->ub_percpu = static_percpu_ptr(&ub0_percpu, ub0_percpu_data);
++
++	memset(&current->task_bc, 0, sizeof(struct task_beancounter));
++	(void)set_exec_ub(ub);
++	current->task_bc.task_ub = get_beancounter(ub);
++	__charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE);
++	current->task_bc.fork_sub = get_beancounter(ub);
++	ub_init_task_bc(&current->task_bc);
++	init_mm.mm_ub = get_beancounter(ub);
++
++	hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]);
++	list_add(&ub->ub_list, &ub_list_head);
++}
++
++void __init ub_init_late(void)
++{
++	ub_cachep = kmem_cache_create("user_beancounters",
++			sizeof(struct user_beancounter),
++			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
++
++	memset(&default_beancounter, 0, sizeof(default_beancounter));
++#ifdef CONFIG_BC_UNLIMITED
++	init_beancounter_nolimits(&default_beancounter);
++#else
++	init_beancounter_syslimits(&default_beancounter);
++#endif
++	init_beancounter_store(&default_beancounter);
++	init_beancounter_struct(&default_beancounter);
++}
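
A minimal usage sketch of the hierarchical charge API above -- not part
of the patch.  The helper name my_alloc_object and the choice of
UB_NUMFILE are illustrative assumptions only:

	static void *my_alloc_object(struct user_beancounter *ub)
	{
		void *obj;

		/* charge_beancounter() walks ub and all of its parents;
		 * on failure everything charged so far is unrolled */
		if (charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD))
			return NULL;	/* over the barrier somewhere up the chain */

		obj = kmalloc(128, GFP_KERNEL);
		if (obj == NULL) {
			/* the uncharge must mirror the charge exactly */
			uncharge_beancounter(ub, UB_NUMFILE, 1);
			return NULL;
		}
		return obj;
	}
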
+diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c
+new file mode 100644
+index 0000000..2242d64
+--- /dev/null
++++ b/kernel/bc/dcache.c
+@@ -0,0 +1,399 @@
++/*
++ *  kernel/bc/dcache.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/dcache.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/sysctl.h>
++#include <linux/swap.h>
++#include <linux/stop_machine.h>
++#include <linux/cpumask.h>
++#include <linux/nmi.h>
++#include <linux/rwsem.h>
++#include <linux/rcupdate.h>
++#include <linux/highmem.h>
++#include <asm/bitops.h>
++
++#include <bc/beancounter.h>
++#include <bc/kmem.h>
++#include <bc/dcache.h>
++#include <bc/dcache_op.h>
++
++/*
++ * Locking
++ *                          traverse  dcache_lock  d_lock
++ *        ub_dentry_charge   +         -            +
++ *      ub_dentry_uncharge   +         +            -
++ * ub_dentry_charge_nofail   +         +            -
++ *
++ * d_inuse changes are atomic, with special handling of "not in use" <->
++ * "in use" (-1 <-> 0) transitions.  We have two sources of non-atomicity
++ * here: (1) in many operations we need to change d_inuse of both dentry and
++ * its parent, and (2) on state transitions we need to adjust the account.
++ *
++ * Regarding (1): we do not have (and do not want) a single lock covering all
++ * operations, so in general it's impossible to get a consistent view of
++ * a tree with respect to d_inuse counters (except by swsuspend).  It also
++ * means that if a dentry with d_inuse of 0 gets one new in-use child and
++ * loses one, its d_inuse counter will follow either the 0 -> 1 -> 0 path
++ * or the 0 -> -1 -> 0 path, and we can't say which.
++ * Note that path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since
++ * uncharge can be done only after return from charge (with d_genocide being
++ * the only apparent exception).
++ * Regarding (2): there is a similar uncertainty with the dcache account.
++ * If the account is at the limit while one more dentry starts being
++ * used and another one is put, the account will either hit the limit
++ * (and an error will be returned) or the decrement will happen before
++ * the increment.
++ *
++ * These races do not really matter.
++ * The only things we want are:
++ *  - if a system is suspended with no in-use dentries, all d_inuse counters
++ *    should be correct (-1);
++ *  - d_inuse counters should always be >= -1.
++ * This holds if ->parent references are accessed and maintained properly.
++ * In subtle moments (like d_move) dentries exchanging their parents should
++ * both be in-use.  At d_genocide time, lookups and charges are assumed to be
++ * impossible.
++ */
++
++/*
++ * Hierarchical accounting
++ * UB argument must NOT be NULL
++ */
++
++static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, 
++		enum ub_severity sv)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv))
++		goto out_mem;
++	if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv))
++		goto out_dcache;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return 0;
++
++out_dcache:
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++out_mem:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return -ENOMEM;
++}
++
++static void do_uncharge_dcache(struct user_beancounter *ub, 
++		unsigned long size)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++	__uncharge_beancounter_locked(ub, UB_DCACHESIZE, size);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static int charge_dcache(struct user_beancounter *ub, unsigned long size, 
++		enum ub_severity sv)
++{
++	struct user_beancounter *p, *q;
++
++	for (p = ub; p != NULL; p = p->parent) {
++		if (do_charge_dcache(p, size, sv))
++			goto unroll;
++	}
++	return 0;
++
++unroll:
++	for (q = ub; q != p; q = q->parent)
++		do_uncharge_dcache(q, size);
++	return -ENOMEM;
++}
++
++void uncharge_dcache(struct user_beancounter *ub, unsigned long size)
++{
++	for (; ub != NULL; ub = ub->parent)
++		do_uncharge_dcache(ub, size);
++}
++
++/*
++ * Simple helpers to do maintain account and d_ub field.
++ */
++
++static inline int d_charge(struct dentry_beancounter *d_bc)
++{
++	struct user_beancounter *ub;
++
++	ub = get_beancounter(get_exec_ub());
++	if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) {
++		put_beancounter(ub);
++		return -1;
++	}
++	d_bc->d_ub = ub;
++	return 0;
++}
++
++static inline void d_forced_charge(struct dentry_beancounter *d_bc)
++{
++	struct user_beancounter *ub;
++
++	ub = get_beancounter(get_exec_ub());
++	charge_dcache(ub, d_bc->d_ubsize, UB_FORCE);
++	d_bc->d_ub = ub;
++}
++
++/*
++ * Minor helpers
++ */
++
++extern struct kmem_cache *dentry_cache; 
++extern struct kmem_cache *inode_cachep;
++static struct rw_semaphore ub_dentry_alloc_sem;
++
++static inline unsigned long d_charge_size(struct dentry *dentry)
++{
++	/* dentry's d_name is already set to appropriate value (see d_alloc) */
++	return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) +
++		(dname_external(dentry) ?
++		 kmem_dname_objuse((void *)dentry->d_name.name) : 0);
++}
++
++/*
++ * Entry points from dcache.c
++ */
++
++/* 
++ * Set initial d_inuse on d_alloc.
++ * Called with no locks, preemption disabled.
++ */
++int __ub_dentry_alloc(struct dentry *dentry)
++{
++	struct dentry_beancounter *d_bc;
++
++	d_bc = &dentry->dentry_bc;
++	d_bc->d_ub = get_beancounter(get_exec_ub());
++	atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */
++	d_bc->d_ubsize = d_charge_size(dentry);
++
++	if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD))
++		goto failure;
++	return 0;
++
++failure:
++	put_beancounter(d_bc->d_ub);
++	d_bc->d_ub = NULL;
++	return -ENOMEM;
++}
++
++void __ub_dentry_alloc_start(void)
++{
++	down_read(&ub_dentry_alloc_sem);
++	current->task_bc.dentry_alloc = 1;
++}
++
++void __ub_dentry_alloc_end(void)
++{
++	current->task_bc.dentry_alloc = 0;
++	up_read(&ub_dentry_alloc_sem);
++}
++
++/*
++ * It is assumed that parent is already in use, so traverse upwards is
++ * limited to one ancestor only.
++ * Called under d_lock and rcu_read_lock.
++ */
++int __ub_dentry_charge(struct dentry *dentry)
++{
++	struct dentry_beancounter *d_bc;
++	struct dentry *parent;
++	int ret;
++
++	if (ub_dget_testone(dentry)) {
++		d_bc = &dentry->dentry_bc;
++		/* state transition -1 => 0 */
++		if (d_charge(d_bc))
++			goto failure;
++
++		if (dentry != dentry->d_parent) {
++			parent = dentry->d_parent;
++			if (ub_dget_testone(parent))
++				BUG();
++		}
++	}
++	return 0;
++
++failure:
++	/*
++	 * Here we would like to fail the lookup.
++	 * It is not easy: if d_lookup fails, callers expect that a dentry
++	 * with the given name doesn't exist, and create a new one.
++	 * So, first we forcibly charge for this dentry.
++	 * Then we try to remove it from the cache safely.  If that turns
++	 * out to be possible, we can return an error.
++	 */
++	d_forced_charge(d_bc);
++
++	if (dentry != dentry->d_parent) {
++		parent = dentry->d_parent;
++		if (ub_dget_testone(parent))
++			BUG();
++	}
++
++	ret = 0;
++	if (spin_trylock(&dcache_lock)) {
++		if (!list_empty(&dentry->d_subdirs)) {
++			spin_unlock(&dentry->d_lock);
++			spin_unlock(&dcache_lock);
++			rcu_read_unlock();
++			shrink_dcache_parent(dentry);
++			rcu_read_lock();
++			spin_lock(&dcache_lock);
++			spin_lock(&dentry->d_lock);
++		}
++		if (atomic_read(&dentry->d_count) == 1) {
++			__d_drop(dentry);
++			ret = -1;
++		}
++		spin_unlock(&dcache_lock);
++	}
++
++	return ret;
++}
++
++/*
++ * Go up in the tree decreasing d_inuse.
++ * Called under dcache_lock.
++ */
++void __ub_dentry_uncharge(struct dentry *dentry)
++{
++	struct dentry *parent;
++	struct user_beancounter *ub;
++	unsigned long size;
++
++	/* go up until the state stops changing or the root is reached */
++	size = dentry->dentry_bc.d_ubsize;
++	ub = dentry->dentry_bc.d_ub;
++	while (ub_dput_testzero(dentry)) {
++		/* state transition 0 => -1 */
++		uncharge_dcache(ub, size);
++		put_beancounter(ub);
++
++		parent = dentry->d_parent;
++		if (dentry == parent)
++			break;
++
++		dentry = parent;
++		size = dentry->dentry_bc.d_ubsize;
++		ub = dentry->dentry_bc.d_ub;
++	}
++}
++
++/* 
++ * Forced charge for __dget_locked, where the API doesn't allow returning an error.
++ * Called under dcache_lock.
++ */
++void __ub_dentry_charge_nofail(struct dentry *dentry)
++{
++	struct dentry *parent;
++
++	while (ub_dget_testone(dentry)) {
++		/* state transition -1 => 0 */
++		d_forced_charge(&dentry->dentry_bc);
++
++		parent = dentry->d_parent;
++		if (dentry == parent)
++			break;
++		dentry = parent;
++	}
++}
++
++/*
++ * Adaptive accounting
++ */
++
++int ub_dentry_on = 1;
++int ub_dentry_alloc_barrier;
++EXPORT_SYMBOL(ub_dentry_on);
++
++static unsigned long checklowat = 0;
++static unsigned long checkhiwat = ULONG_MAX;
++
++static int sysctl_ub_dentry_chk = 10;
++#define sysctl_ub_lowat	sysctl_ub_watermark[0]
++#define sysctl_ub_hiwat sysctl_ub_watermark[1]
++static DECLARE_RWSEM(ub_dentry_alloc_sem);
++/* 1024th of lowmem size */
++static unsigned int sysctl_ub_watermark[2] = {0, 100};
++
++static void ub_dentry_set_limits(unsigned long pages, unsigned long cap)
++{
++	down_write(&ub_dentry_alloc_sem);
++	preempt_disable();
++	checklowat = (pages >> 10) * sysctl_ub_lowat;
++	checkhiwat = (pages >> 10) * sysctl_ub_hiwat;
++	if (checkhiwat > cap) {
++		checkhiwat = cap;
++		checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat;
++	}
++	preempt_enable();
++	up_write(&ub_dentry_alloc_sem);
++}
++
++static int ub_dentry_proc_handler(ctl_table *ctl, int write, struct file *filp,
++			  void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++	int r;
++
++	r = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
++	if (!r && write)
++		ub_dentry_set_limits(totalram_pages - totalhigh_pages,
++				ULONG_MAX);
++	return r;
++}
++
++static ctl_table ub_dentry_sysctl_table[] = {
++	{
++		.procname	= "dentry_check",
++		.data		= &sysctl_ub_dentry_chk,
++		.maxlen		= sizeof(sysctl_ub_dentry_chk),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{
++		.procname	= "dentry_watermark",
++		.data		= &sysctl_ub_lowat,
++		.maxlen		= sizeof(sysctl_ub_lowat) * 2,
++		.mode		= 0644,
++		.proc_handler	= ub_dentry_proc_handler,
++	},
++	{ .ctl_name = 0 }
++};
++static ctl_table ub_dentry_sysctl_root[] = {
++	{
++		.procname	= "ubc",
++		.mode		= 0555,
++		.child		= ub_dentry_sysctl_table,
++	},
++	{ .ctl_name = 0 }
++};
++
++static int __init ub_dentry_init(void)
++{
++	/*
++	 * Initial watermarks are capped to bound the walk time.
++	 * 384MB translates into 0.8 sec on PIII 866MHz.
++	 */
++	ub_dentry_set_limits(totalram_pages - totalhigh_pages,
++			384 * 1024 * 1024 / PAGE_SIZE);
++	if (register_sysctl_table(ub_dentry_sysctl_root) == NULL)
++		return -ENOMEM;
++	return 0;
++}
++__initcall(ub_dentry_init);
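
A worked example of the watermark arithmetic in ub_dentry_set_limits()
above -- not part of the patch, and assuming 4 KiB pages, 1 GiB of
lowmem, and the default dentry_watermark of {0, 100}:

	unsigned long pages = 262144;		/* 1 GiB of lowmem / 4 KiB */
	unsigned long lo = (pages >> 10) * 0;	/* sysctl_ub_lowat = 0:   0 pages */
	unsigned long hi = (pages >> 10) * 100;	/* sysctl_ub_hiwat = 100: 25600 pages, ~100 MiB */

Since 25600 pages stays below the initial 384 MiB cap, no capping takes
place in this case.
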
+diff --git a/kernel/bc/io_acct.c b/kernel/bc/io_acct.c
+new file mode 100644
+index 0000000..e8d6c38
+--- /dev/null
++++ b/kernel/bc/io_acct.c
+@@ -0,0 +1,500 @@
++/*
++ *  kernel/bc/io_acct.c
++ *
++ *  Copyright (C) 2006  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ *  Pavel Emelianov <xemul at openvz.org>
++ *
++ */
++
++#include <linux/mm.h>
++#include <linux/mempool.h>
++#include <linux/proc_fs.h>
++#include <linux/virtinfo.h>
++#include <linux/pagemap.h>
++
++#include <bc/beancounter.h>
++#include <bc/io_acct.h>
++#include <bc/rss_pages.h>
++#include <bc/vmpages.h>
++#include <bc/proc.h>
++
++static struct mempool_s *pb_pool;
++
++#define PB_MIN_IO	(1024)
++
++static inline struct page_beancounter *io_pb_alloc(void)
++{
++	return mempool_alloc(pb_pool, GFP_ATOMIC);
++}
++
++static inline void io_pb_free(struct page_beancounter *pb)
++{
++	mempool_free(pb, pb_pool);
++}
++
++struct page_beancounter **page_pblist(struct page *page)
++{
++	struct page_beancounter **pb, *iopb;
++
++	pb = &page_pbc(page);
++	iopb = iopb_to_pb(*pb);
++
++	return iopb == NULL ? pb : &iopb->page_pb_list;
++}
++
++/*
++ * We save the context in which the page was set dirty, to use it later
++ * when the real write starts.  If the page is mapped, the IO pb is
++ * stored like this:
++ *
++ * Before saving:
++ *
++ *  +- page -------+
++ *  | ...          |
++ *  | page_pb      +---+
++ *  +--------------+   |   +-----+    +-----+          +-----+
++ *                     +-> | pb1 | -> | pb2 | - ... -> | pbN | -+
++ *                         +-----+    +-----+          +-----+  |
++ *                            ^                                 |
++ *                            +---------------------------------+
++ *
++ * After saving:
++ *
++ *  +- page -------+      +- io pb ------+
++ *  | ...          |      | ...          |
++ *  | page_pb      +----> | page_pb_list +-+
++ *  +--------------+      +--------------+ |
++ *                                         |
++ *                     +-------------------+
++ *                     |
++ *                     |   +-----+    +-----+          +-----+
++ *                     +-> | pb1 | -> | pb2 | - ... -> | pbN | -+
++ *                         +-----+    +-----+          +-----+  |
++ *                            ^                                 |
++ *                            +---------------------------------+
++ *
++ * And the page_pblist(...) function returns pointer to the place that
++ * points to this pbX ring.
++ */
++
++#ifdef CONFIG_BC_DEBUG_IO
++static LIST_HEAD(pb_io_list);
++static unsigned long anon_pages, not_released;
++
++static inline void io_debug_save(struct page_beancounter *pb,
++		struct page_beancounter *mpb)
++{
++	pb->io_debug = (mpb == NULL);
++	list_add(&pb->io_list, &pb_io_list);
++}
++
++static inline void io_debug_release(struct page_beancounter *pb)
++{
++	list_del(&pb->io_list);
++}
++
++void ub_io_release_debug(struct page *page)
++{
++	struct page_beancounter *pb;
++	static int once = 0;
++
++	pb = page_pbc(page);
++	if (likely(iopb_to_pb(pb) == NULL))
++		return;
++
++	if (!once) {
++		printk(KERN_ERR "BUG: Page has an IO bc but is not expected to\n");
++		dump_stack();
++		once = 1;
++	}
++
++	spin_lock(&pb_lock);
++	not_released++;
++	pb = iopb_to_pb(pb);
++	page_pbc(page) = NULL;
++	io_debug_release(pb);
++	pb->ub->io_pb_held--;
++	spin_unlock(&pb_lock);
++
++	put_beancounter(pb->ub);
++	io_pb_free(pb);
++}
++
++static inline int io_debug_precheck_save(struct page *page)
++{
++	if (unlikely(PageAnon(page))) {
++		anon_pages++;
++		return 1;
++	}
++
++	return 0;
++}
++
++static inline int io_debug_precheck_release(struct page *page)
++{
++	return 0;
++}
++#else
++#define io_debug_save(pb, mpb)	do { } while (0)
++#define io_debug_release(pb)	do { } while (0)
++#define io_debug_precheck_save(page)		(0)
++#define io_debug_precheck_release(p)		(0)
++#endif
++
++static inline void set_page_io(struct page *page, struct page_beancounter *pb,
++		struct page_beancounter *mapped_pb)
++{
++	unsigned long val;
++
++	val = (unsigned long)pb | PAGE_IO_MARK;
++	pb->page = page;
++
++	page_pbc(page) = (struct page_beancounter *)val;
++	io_debug_save(pb, mapped_pb);
++	pb->ub->io_pb_held++;
++}
++
++static inline void put_page_io(struct page *page, struct page_beancounter *pb)
++{
++	pb->ub->io_pb_held--;
++	io_debug_release(pb);
++	page_pbc(page) = pb->page_pb_list;
++}
++
++void ub_io_save_context(struct page *page, size_t bytes_dirtied)
++{
++	struct user_beancounter *ub;
++	struct page_beancounter *pb, *mapped_pb, *io_pb;
++
++	if (unlikely(in_interrupt())) {
++		WARN_ON_ONCE(1);
++		return;
++	}
++
++	/*
++	 * FIXME - this can happen from atomic context and
++	 * it's probably not good to lose some requests
++	 */
++
++	pb = io_pb_alloc();
++	io_pb = NULL;
++
++	spin_lock(&pb_lock);
++	if (io_debug_precheck_save(page))
++		goto out_unlock;
++
++	mapped_pb = page_pbc(page);
++	io_pb = iopb_to_pb(mapped_pb);
++	if (io_pb != NULL) {
++		/*
++		 * this page already has an IO context - release it and
++		 * force a new one.  We could also race with page
++		 * cleaning - see below.
++		 */
++		mapped_pb = io_pb->page_pb_list;
++		put_page_io(page, io_pb);
++	}
++
++	/*
++	 * If the page is mapped we must save the context
++	 * it maps to. If the page isn't mapped we use current
++	 * context as this is a regular write.
++	 */
++
++	if (mapped_pb != NULL)
++		ub = top_beancounter(mapped_pb->ub);
++	else
++		ub = get_io_ub();
++
++	if (!PageDirty(page)) {
++		/*
++		 * race with clear_page_dirty(_for_io) - account
++		 * writes for ub_io_release_context()
++		 */
++		if (io_pb != NULL)
++			io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE;
++		if (pb != NULL)
++			io_pb_free(pb);
++		goto out_unlock;
++	}
++
++	if (pb == NULL) {
++		ub->bytes_dirty_missed += bytes_dirtied;
++		goto out_unlock;
++	}
++
++	/*
++	 * the page may become clean here, but the context will be seen
++	 * in ub_io_release_context()
++	 */
++
++	pb->ub = get_beancounter(ub);
++	pb->page_pb_list = mapped_pb;
++	ub->bytes_dirtied += bytes_dirtied;
++
++	set_page_io(page, pb, mapped_pb);
++
++out_unlock:
++	spin_unlock(&pb_lock);
++
++	if (io_pb != NULL) {
++		put_beancounter(io_pb->ub);
++		io_pb_free(io_pb);
++	}
++}
++
++void ub_io_release_context(struct page *page, size_t wrote)
++{
++	struct page_beancounter *pb;
++
++	if (io_debug_precheck_release(page))
++		return;
++
++	if (unlikely(in_interrupt())) {
++		WARN_ON_ONCE(1);
++		return;
++	}
++
++	spin_lock(&pb_lock);
++	pb = iopb_to_pb(page_pbc(page));
++	if (unlikely(pb == NULL))
++		/*
++		 * this may happen if we failed to allocate
++		 * context in ub_io_save_context or raced with it
++		 */
++		goto out_unlock;
++
++	if (wrote)
++		pb->ub->bytes_wrote += wrote;
++
++	put_page_io(page, pb);
++out_unlock:
++	spin_unlock(&pb_lock);
++
++	if (pb != NULL) {
++		put_beancounter(pb->ub);
++		io_pb_free(pb);
++	}
++}
++
++void __init ub_init_io(struct kmem_cache *pb_cachep)
++{
++	pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep);
++	if (pb_pool == NULL)
++		panic("Can't create pb_pool");
++}
++
++#ifdef CONFIG_PROC_FS
++#define in_flight(var)	(var > var##_done ? var - var##_done : 0)
++
++static int bc_ioacct_show(struct seq_file *f, void *v)
++{
++	int i;
++	unsigned long long read, write, cancel;
++	unsigned long sync, sync_done;
++	unsigned long fsync, fsync_done;
++	unsigned long fdsync, fdsync_done;
++	unsigned long frsync, frsync_done;
++	unsigned long reads, writes;
++	unsigned long long rchar, wchar;
++	struct user_beancounter *ub;
++
++	ub = seq_beancounter(f);
++
++	read = write = cancel = 0;
++	sync = sync_done = fsync = fsync_done =
++		fdsync = fdsync_done = frsync = frsync_done = 0;
++	reads = writes = 0;
++	rchar = wchar = 0;
++	for_each_online_cpu(i) {
++		struct ub_percpu_struct *ub_percpu;
++		ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
++
++		read += ub_percpu->bytes_read;
++		write += ub_percpu->bytes_wrote;
++		cancel += ub_percpu->bytes_cancelled;
++
++		sync += ub_percpu->sync;
++		fsync += ub_percpu->fsync;
++		fdsync += ub_percpu->fdsync;
++		frsync += ub_percpu->frsync;
++		sync_done += ub_percpu->sync_done;
++		fsync_done += ub_percpu->fsync_done;
++		fdsync_done += ub_percpu->fdsync_done;
++		frsync_done += ub_percpu->frsync_done;
++
++		reads += ub_percpu->read;
++		writes += ub_percpu->write;
++		rchar += ub_percpu->rchar;
++		wchar += ub_percpu->wchar;
++	}
++
++	seq_printf(f, bc_proc_llu_fmt, "read", read);
++	seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write);
++	seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied);
++	seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
++	seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed);
++
++	seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
++	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
++	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
++	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
++
++	seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
++	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
++	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync));
++	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
++
++	seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads);
++	seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar);
++	seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes);
++	seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar);
++
++	seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held);
++	return 0;
++}
++
++static struct bc_proc_entry bc_ioacct_entry = {
++	.name = "ioacct",
++	.u.show = bc_ioacct_show,
++};
++
++#ifdef CONFIG_BC_DEBUG_IO
++#define PTR_SIZE (int)(sizeof(void *) * 2)
++#define INT_SIZE (int)(sizeof(int) * 2)
++
++static int bc_io_show(struct seq_file *f, void *v)
++{
++	struct list_head *lh;
++	struct page_beancounter *pb;
++	struct page *pg;
++
++	lh = (struct list_head *)v;
++	if (lh == &pb_io_list) {
++		seq_printf(f, "Races: anon %lu missed %lu\n",
++				anon_pages, not_released);
++
++		seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s "
++				"%-*s %-*s %-1s %-*s %-*s\n",
++				PTR_SIZE, "pb", "",
++				PTR_SIZE, "page", "flg",
++				INT_SIZE, "cnt", INT_SIZE, "mcnt",
++				PTR_SIZE, "pb_list",
++				PTR_SIZE, "page_pb", "",
++				PTR_SIZE, "mapping",
++				INT_SIZE, "ub");
++		return 0;
++	}
++
++	pb = list_entry(lh, struct page_beancounter, io_list);
++	pg = pb->page;
++	seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n",
++			pb, pb->io_debug ? 'e' : 'm', pg,
++			PageDirty(pg) ? 'D' : 'd',
++			PageAnon(pg) ? 'A' : 'a',
++			PageWriteback(pg) ? 'W' : 'w',
++			PageLocked(pg) ? 'L' : 'l',
++			INT_SIZE, page_count(pg),
++			INT_SIZE, page_mapcount(pg),
++			pb->page_pb_list, page_pbc(pg),
++			iopb_to_pb(page_pbc(pg)) == pb ? ' ' : '!',
++			pg->mapping, pb->ub->ub_uid);
++	return 0;
++}
++
++static void *bc_io_start(struct seq_file *f, loff_t *ppos)
++{
++	spin_lock(&pb_lock);
++	return seq_list_start_head(&pb_io_list, *ppos);
++}
++
++static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos)
++{
++	return seq_list_next(v, &pb_io_list, ppos);
++}
++
++static void bc_io_stop(struct seq_file *f, void *v)
++{
++	spin_unlock(&pb_lock);
++}
++
++static struct seq_operations bc_io_seq_ops = {
++	.start = bc_io_start,
++	.next  = bc_io_next,
++	.stop  = bc_io_stop,
++	.show  = bc_io_show,
++};
++
++static int bc_io_open(struct inode *inode, struct file *filp)
++{
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return -EACCES;
++
++	return seq_open(filp, &bc_io_seq_ops);
++}
++
++static struct file_operations bc_io_debug_ops = {
++	.open		= bc_io_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++
++static struct bc_proc_entry bc_ioacct_debug_entry = {
++	.name		= "ioacct_debug",
++	.u.fops		= &bc_io_debug_ops,
++};
++#endif
++
++static int bc_ioacct_notify(struct vnotifier_block *self,
++		unsigned long event, void *arg, int old_ret)
++{
++	struct user_beancounter *ub;
++	unsigned long *vm_events;
++	unsigned long long bin, bout;
++	int i;
++
++	if (event != VIRTINFO_VMSTAT)
++		return old_ret;
++
++	ub = top_beancounter(get_exec_ub());
++	if (ub == get_ub0())
++		return old_ret;
++
++	/* Think over: do we need to account here bytes_dirty_missed? */
++	/* Think it over: do we need to account bytes_dirty_missed here? */
++	bin = 0;
++	for_each_online_cpu(i) {
++		bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote;
++		bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read;
++	}
++
++	/* convert to Kbytes */
++	bout >>= 10;
++	bin >>= 10;
++
++	vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS;
++	vm_events[PGPGOUT] = (unsigned long)bout;
++	vm_events[PGPGIN] = (unsigned long)bin;
++	return NOTIFY_OK;
++}
++
++static struct vnotifier_block bc_ioacct_nb = {
++	.notifier_call = bc_ioacct_notify,
++};
++
++static int __init bc_ioacct_init(void)
++{
++#ifdef CONFIG_BC_DEBUG_IO
++	bc_register_proc_root_entry(&bc_ioacct_debug_entry);
++#endif
++	bc_register_proc_entry(&bc_ioacct_entry);
++
++	virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb);
++	return 0;
++}
++
++late_initcall(bc_ioacct_init);
++#endif
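
A sketch of the lockless per-CPU aggregation pattern this file relies on
-- not part of the patch; the helper name ub_total_bytes_read is an
illustrative assumption:

	static unsigned long long ub_total_bytes_read(struct user_beancounter *ub)
	{
		unsigned long long total = 0;
		int i;

		/* sum the per-CPU counters without locking; the total may
		 * be slightly stale, which is fine for statistics */
		for_each_online_cpu(i)
			total += per_cpu_ptr(ub->ub_percpu, i)->bytes_read;
		return total;
	}
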
+diff --git a/kernel/bc/io_prio.c b/kernel/bc/io_prio.c
+new file mode 100644
+index 0000000..20aa133
+--- /dev/null
++++ b/kernel/bc/io_prio.c
+@@ -0,0 +1,288 @@
++/*
++ *  kernel/bc/io_prio.c
++ *
++ *  Copyright (C) 2007 SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ *  Vasily Tarasov <vtaras at openvz.org>
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/cfq-iosched.h>
++#include <bc/io_prio.h>
++#include <bc/beancounter.h>
++#include <bc/hash.h>
++#include <bc/io_acct.h>
++#include <linux/blkdev.h>
++
++struct cfq_bc_data *__find_cfq_bc(struct ub_iopriv *iopriv,
++							struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++
++	list_for_each_entry(cfq_bc, &iopriv->cfq_bc_head, cfq_bc_list)
++		if (cfq_bc->cfqd == cfqd)
++			return cfq_bc;
++
++	return NULL;
++}
++
++struct cfq_bc_data *bc_find_cfq_bc(struct ub_iopriv *iopriv,
++					struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++	unsigned long flags;
++
++	read_lock_irqsave(&iopriv->cfq_bc_list_lock, flags);
++	cfq_bc = __find_cfq_bc(iopriv, cfqd);
++	read_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags);
++	return cfq_bc;
++}
++
++struct cfq_bc_data *bc_findcreate_cfq_bc(struct ub_iopriv *iopriv,
++					struct cfq_data *cfqd, gfp_t gfp_mask)
++{
++	struct cfq_bc_data *cfq_bc_new;
++	struct cfq_bc_data *cfq_bc;
++	unsigned long flags;
++
++	cfq_bc = bc_find_cfq_bc(iopriv, cfqd);
++	if (cfq_bc)
++		return cfq_bc;
++
++	cfq_bc_new = kzalloc(sizeof(*cfq_bc_new), gfp_mask);
++	if (!cfq_bc_new)
++		return NULL;
++
++	cfq_init_cfq_bc(cfq_bc_new);
++	cfq_bc_new->cfqd = cfqd;
++	cfq_bc_new->ub_iopriv = iopriv;
++
++	write_lock_irqsave(&iopriv->cfq_bc_list_lock, flags);
++	cfq_bc = __find_cfq_bc(iopriv, cfqd);
++	if (cfq_bc)
++		kfree(cfq_bc_new);
++	else {
++		list_add_tail(&cfq_bc_new->cfq_bc_list,
++					&iopriv->cfq_bc_head);
++		cfq_bc = cfq_bc_new;
++	}
++	write_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags);
++
++	return cfq_bc;
++}
++
++void bc_init_ioprio(struct ub_iopriv *iopriv)
++{
++	INIT_LIST_HEAD(&iopriv->cfq_bc_head);
++	rwlock_init(&iopriv->cfq_bc_list_lock);
++	iopriv->ioprio = UB_IOPRIO_BASE;
++}
++
++static inline void bc_cfq_bc_check_empty(struct cfq_bc_data *cfq_bc)
++{
++	BUG_ON(!RB_EMPTY_ROOT(&cfq_bc->service_tree.rb));
++}
++
++static void bc_release_cfq_bc(struct cfq_bc_data *cfq_bc)
++{
++	struct cfq_data *cfqd;
++	elevator_t *eq;
++	int i;
++
++	cfqd = cfq_bc->cfqd;
++	eq = cfqd->queue->elevator;
++
++	for (i = 0; i < CFQ_PRIO_LISTS; i++) {
++		if (cfq_bc->async_cfqq[0][i]) {
++			eq->ops->put_queue(cfq_bc->async_cfqq[0][i]);
++			cfq_bc->async_cfqq[0][i] = NULL;
++		}
++		if (cfq_bc->async_cfqq[1][i]) {
++			eq->ops->put_queue(cfq_bc->async_cfqq[1][i]);
++			cfq_bc->async_cfqq[1][i] = NULL;
++		}
++	}
++	if (cfq_bc->async_idle_cfqq) {
++		eq->ops->put_queue(cfq_bc->async_idle_cfqq);
++		cfq_bc->async_idle_cfqq = NULL;
++	}
++	/* 
++	 * Note: this cfq_bc is no longer on the active list,
++	 * but cfqd may still point to it as the active one.
++	 */
++	cfqd->active_cfq_bc = NULL;
++
++	bc_cfq_bc_check_empty(cfq_bc);
++	list_del(&cfq_bc->cfq_bc_list);
++	kfree(cfq_bc);
++}
++
++void bc_fini_ioprio(struct ub_iopriv *iopriv)
++{
++	struct cfq_bc_data *cfq_bc;
++	struct cfq_bc_data *cfq_bc_tmp;
++	unsigned long flags;
++	spinlock_t *queue_lock;
++
++	/* 
++	 * Don't take cfq_bc_list_lock since the ub is already dead,
++	 * but async cfqqs are still in the hash list, so
++	 * queue_lock must be held.
++	 */
++	list_for_each_entry_safe(cfq_bc, cfq_bc_tmp,
++			&iopriv->cfq_bc_head, cfq_bc_list) {
++		queue_lock = cfq_bc->cfqd->queue->queue_lock;
++		spin_lock_irqsave(queue_lock, flags);
++		bc_release_cfq_bc(cfq_bc);
++		spin_unlock_irqrestore(queue_lock, flags);
++	}
++}
++
++void bc_cfq_exit_queue(struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++	struct user_beancounter *ub;
++
++	local_irq_disable();
++	for_each_beancounter(ub) {
++		write_lock(&ub->iopriv.cfq_bc_list_lock);
++		cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd);
++		if (!cfq_bc) {
++			write_unlock(&ub->iopriv.cfq_bc_list_lock);
++			continue;
++		}
++		bc_release_cfq_bc(cfq_bc);
++		write_unlock(&ub->iopriv.cfq_bc_list_lock);
++	}
++	local_irq_enable();
++}
++
++int bc_expired(struct cfq_data *cfqd)
++{
++	return time_after(jiffies, cfqd->slice_end) ? 1 : 0;
++}
++
++static inline int bc_empty(struct cfq_bc_data *cfq_bc)
++{
++	/*
++	 * consider a BC empty only if there are no requests
++	 * in the elevator _and_ in the driver
++	 */
++	if (!cfq_bc->rqnum && !cfq_bc->on_dispatch)
++		return 1;
++
++	return 0;
++}
++
++static inline unsigned long bc_time_slice_by_ioprio(unsigned int ioprio,
++						unsigned int base_slice)
++{
++	return base_slice +
++		(base_slice * (ioprio - UB_IOPRIO_MIN))
++		/ (UB_IOPRIO_MAX - UB_IOPRIO_MIN - 1);
++}
++
++static inline void bc_set_active(struct cfq_data *cfqd)
++{
++	if (list_empty(&cfqd->act_cfq_bc_head)) {
++		cfqd->active_cfq_bc = NULL;
++		return;
++	}
++
++	cfqd->active_cfq_bc = list_first_entry(&cfqd->act_cfq_bc_head,
++					struct cfq_bc_data, act_cfq_bc_list);
++	list_move_tail(&cfqd->active_cfq_bc->act_cfq_bc_list,
++						&cfqd->act_cfq_bc_head);
++	cfqd->slice_end = jiffies +
++		bc_time_slice_by_ioprio(cfqd->active_cfq_bc->ub_iopriv->ioprio,
++							cfqd->cfq_ub_slice);
++}
++
++void bc_schedule_active(struct cfq_data *cfqd)
++{
++	if (bc_expired(cfqd) || !cfqd->active_cfq_bc ||
++				bc_empty(cfqd->active_cfq_bc))
++		bc_set_active(cfqd);
++}
++
++void bc_inc_rqnum(struct cfq_queue *cfqq)
++{
++	struct cfq_bc_data *cfq_bc;
++
++	cfq_bc = cfqq->cfq_bc;
++
++	if (!cfq_bc->rqnum)
++		list_add_tail(&cfq_bc->act_cfq_bc_list,
++				&cfqq->cfqd->act_cfq_bc_head);
++
++	cfq_bc->rqnum++;
++}
++
++void bc_dec_rqnum(struct cfq_queue *cfqq)
++{
++	struct cfq_bc_data *cfq_bc;
++
++	cfq_bc = cfqq->cfq_bc;
++
++	cfq_bc->rqnum--;
++
++	if (!cfq_bc->rqnum)
++		list_del(&cfq_bc->act_cfq_bc_list);
++}
++
++unsigned long bc_set_ioprio(int ubid, int ioprio)
++{
++	struct user_beancounter *ub;
++
++	if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX)
++		return -ERANGE;
++
++	ub = get_beancounter_byuid(ubid, 0);
++	if (!ub)
++		return -ESRCH;
++
++	ub->iopriv.ioprio = ioprio;
++	put_beancounter(ub);
++
++	return 0;
++}
++
++struct user_beancounter *bc_io_switch_context(struct page *page)
++{
++	struct page_beancounter *pb;
++	struct user_beancounter *old_ub = NULL;
++
++	pb = page_iopb(page);
++	pb = iopb_to_pb(pb);
++	if (pb) {
++		get_beancounter(pb->ub);
++		old_ub = set_exec_ub(pb->ub);
++	}
++	
++	return old_ub;
++}
++
++void bc_io_restore_context(struct user_beancounter *ub)
++{
++	struct user_beancounter *old_ub;
++
++	if (ub) {
++		old_ub = set_exec_ub(ub);
++		put_beancounter(old_ub);
++	}
++}
++
++EXPORT_SYMBOL(bc_io_switch_context);
++EXPORT_SYMBOL(bc_io_restore_context);
++EXPORT_SYMBOL(__find_cfq_bc);
++EXPORT_SYMBOL(bc_fini_ioprio);
++EXPORT_SYMBOL(bc_init_ioprio);
++EXPORT_SYMBOL(bc_findcreate_cfq_bc);
++EXPORT_SYMBOL(bc_cfq_exit_queue);
++EXPORT_SYMBOL(bc_expired);
++EXPORT_SYMBOL(bc_schedule_active);
++EXPORT_SYMBOL(bc_inc_rqnum);
++EXPORT_SYMBOL(bc_dec_rqnum);
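
A worked example of the slice scaling in bc_time_slice_by_ioprio() above
-- not part of the patch, and assuming UB_IOPRIO_MIN == 0 and
UB_IOPRIO_MAX == 8 (both live in bc/io_prio.h, which this hunk does not
show).  With a base_slice of 100 jiffies and valid priorities 0..7:

	/* base_slice = 100 jiffies, valid priorities 0..7 */
	unsigned long slice_lo = 100 + (100 * (0 - 0)) / 7;	/* = 100: 1x base */
	unsigned long slice_hi = 100 + (100 * (7 - 0)) / 7;	/* = 200: 2x base */

Each priority step adds base_slice/7, so the highest-priority
beancounter gets twice the disk time slice of the lowest.
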
+diff --git a/kernel/bc/kmem.c b/kernel/bc/kmem.c
+new file mode 100644
+index 0000000..74c4179
+--- /dev/null
++++ b/kernel/bc/kmem.c
+@@ -0,0 +1,406 @@
++/*
++ *  kernel/bc/kmem.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/slab.h>
++#include <linux/highmem.h>
++#include <linux/vmalloc.h>
++#include <linux/mm.h>
++#include <linux/gfp.h>
++#include <linux/swap.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <linux/init.h>
++
++#include <bc/beancounter.h>
++#include <bc/kmem.h>
++#include <bc/rss_pages.h>
++#include <bc/hash.h>
++#include <bc/proc.h>
++
++/*
++ * Initialization
++ */
++
++/*
++ * Slab accounting
++ */
++
++#ifdef CONFIG_BC_DEBUG_KMEM
++
++#define CC_HASH_SIZE	1024
++static struct ub_cache_counter *cc_hash[CC_HASH_SIZE];
++spinlock_t cc_lock;
++
++static void __free_cache_counters(struct user_beancounter *ub,
++		struct kmem_cache *cachep)
++{
++	struct ub_cache_counter *cc, **pprev, *del;
++	int i;
++	unsigned long flags;
++
++	del = NULL;
++	spin_lock_irqsave(&cc_lock, flags);
++	for (i = 0; i < CC_HASH_SIZE; i++) {
++		pprev = &cc_hash[i];
++		cc = cc_hash[i];
++		while (cc != NULL) {
++			if (cc->ub != ub && cc->cachep != cachep) {
++				pprev = &cc->next;
++				cc = cc->next;
++				continue;
++			}
++
++			list_del(&cc->ulist);
++			*pprev = cc->next;
++			cc->next = del;
++			del = cc;
++			cc = *pprev;
++		}
++	}
++	spin_unlock_irqrestore(&cc_lock, flags);
++
++	while (del != NULL) {
++		cc = del->next;
++		kfree(del);
++		del = cc;
++	}
++}
++
++void ub_free_counters(struct user_beancounter *ub)
++{
++	__free_cache_counters(ub, NULL);
++}
++
++void ub_kmemcache_free(struct kmem_cache *cachep)
++{
++	__free_cache_counters(NULL, cachep);
++}
++
++void __init init_cache_counters(void)
++{
++	memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0]));
++	spin_lock_init(&cc_lock);
++}
++
++#define cc_hash_fun(ub, cachep)	(				\
++	(((unsigned long)(ub) >> L1_CACHE_SHIFT) ^		\
++	 ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^		\
++	 ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^		\
++	 ((unsigned long)(cachep) >> (BITS_PER_LONG / 2))	\
++	) & (CC_HASH_SIZE - 1))
++
++static int change_slab_charged(struct user_beancounter *ub,
++		struct kmem_cache *cachep, long val)
++{
++	struct ub_cache_counter *cc, *new_cnt, **pprev;
++	unsigned long flags;
++
++	new_cnt = NULL;
++again:
++	spin_lock_irqsave(&cc_lock, flags);
++	cc = cc_hash[cc_hash_fun(ub, cachep)];
++	while (cc) {
++		if (cc->ub == ub && cc->cachep == cachep)
++			goto found;
++		cc = cc->next;
++	}
++
++	if (new_cnt != NULL)
++		goto insert;
++
++	spin_unlock_irqrestore(&cc_lock, flags);
++
++	new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC);
++	if (new_cnt == NULL)
++		return -ENOMEM;
++
++	new_cnt->counter = 0;
++	new_cnt->ub = ub;
++	new_cnt->cachep = cachep;
++	goto again;
++
++insert:
++	pprev = &cc_hash[cc_hash_fun(ub, cachep)];
++	new_cnt->next = *pprev;
++	*pprev = new_cnt;
++	list_add(&new_cnt->ulist, &ub->ub_cclist);
++	cc = new_cnt;
++	new_cnt = NULL;
++
++found:
++	cc->counter += val;
++	spin_unlock_irqrestore(&cc_lock, flags);
++	if (new_cnt)
++		kfree(new_cnt);
++	return 0;
++}
++
++static inline int inc_slab_charged(struct user_beancounter *ub,
++	struct kmem_cache *cachep)
++{
++	return change_slab_charged(ub, cachep, 1);
++}
++
++static inline void dec_slab_charged(struct user_beancounter *ub,
++	struct kmem_cache *cachep)
++{
++	if (change_slab_charged(ub, cachep, -1) < 0)
++		BUG();
++}
++
++#include <linux/vmalloc.h>
++
++#define inc_pages_charged(ub, order)	ub_percpu_add(ub, \
++					pages_charged, 1 << order)
++#define dec_pages_charged(ub, order)	ub_percpu_sub(ub, \
++					pages_charged, 1 << order)
++
++#ifdef CONFIG_PROC_FS
++static int bc_kmem_debug_show(struct seq_file *f, void *v)
++{
++	struct user_beancounter *ub;
++	struct ub_cache_counter *cc;
++	long pages, vmpages, pbc;
++	int i;
++
++	ub = seq_beancounter(f);
++
++	pages = vmpages = pbc = 0;
++	for_each_online_cpu(i) {
++		pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged;
++		vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged;
++		pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs;
++	}
++	if (pages < 0)
++		pages = 0;
++	if (vmpages < 0)
++		vmpages = 0;
++
++	seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE);
++	seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE);
++	seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc,
++			sizeof(struct page_beancounter));
++
++	spin_lock_irq(&cc_lock);
++	list_for_each_entry (cc, &ub->ub_cclist, ulist) {
++		struct kmem_cache *cachep;
++
++		cachep = cc->cachep;
++		seq_printf(f, bc_proc_lu_lu_fmt,
++				kmem_cache_name(cachep),
++				cc->counter,
++				kmem_cache_objuse(cachep));
++	}
++	spin_unlock_irq(&cc_lock);
++	return 0;
++}
++
++static struct bc_proc_entry bc_kmem_debug_entry = {
++	.name = "kmem_debug",
++	.u.show = bc_kmem_debug_show,
++};
++
++static int __init bc_kmem_debug_init(void)
++{
++	bc_register_proc_entry(&bc_kmem_debug_entry);
++	return 0;
++}
++
++late_initcall(bc_kmem_debug_init);
++#endif
++
++#else
++#define inc_slab_charged(ub, cache)		(0)
++#define dec_slab_charged(ub, cache)		do { } while (0)
++#define inc_pages_charged(ub, cache) 		do { } while (0)
++#define dec_pages_charged(ub, cache)		do { } while (0)
++#endif
++
++#define UB_KMEM_QUANT	(PAGE_SIZE * 4)
++
++/* called with IRQ disabled */
++int ub_kmemsize_charge(struct user_beancounter *ub,
++		unsigned long size,
++		enum ub_severity strict)
++{
++	struct task_beancounter *tbc;
++
++	tbc = &current->task_bc;
++	if (ub != tbc->task_ub || size > UB_KMEM_QUANT)
++		goto just_charge;
++	if (tbc->kmem_precharged >= size) {
++		tbc->kmem_precharged -= size;
++		return 0;
++	}
++
++	if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) {
++		tbc->kmem_precharged += UB_KMEM_QUANT - size;
++		return 0;
++	}
++
++just_charge:
++	return charge_beancounter(ub, UB_KMEMSIZE, size, strict);
++}
++
++/* called with IRQ disabled */
++void ub_kmemsize_uncharge(struct user_beancounter *ub,
++		unsigned long size)
++{
++	struct task_beancounter *tbc;
++
++	if (size > UB_MAXVALUE) {
++		printk(KERN_WARNING "ub_kmemsize_uncharge: size %lu\n", size);
++		dump_stack();
++	}
++
++	tbc = &current->task_bc;
++	if (ub != tbc->task_ub)
++		goto just_uncharge;
++
++	tbc->kmem_precharged += size;
++	if (tbc->kmem_precharged < UB_KMEM_QUANT * 2)
++		return;
++	size = tbc->kmem_precharged - UB_KMEM_QUANT;
++	tbc->kmem_precharged -= size;
++
++just_uncharge:
++	uncharge_beancounter(ub, UB_KMEMSIZE, size);
++}
++
++/* called with IRQ disabled */
++int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags)
++{
++	unsigned int size;
++	struct user_beancounter *ub;
++
++	ub = get_beancounter(get_exec_ub());
++	if (ub == NULL)
++		return 0;
++
++	size = CHARGE_SIZE(kmem_cache_objuse(cachep));
++	if (ub_kmemsize_charge(ub, size,
++				(flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
++		goto out_err;
++
++	if (inc_slab_charged(ub, cachep) < 0) {
++		ub_kmemsize_uncharge(ub, size);
++		goto out_err;
++	}
++	*ub_slab_ptr(cachep, objp) = ub;
++	return 0;
++
++out_err:
++	put_beancounter(ub);
++	return -ENOMEM;
++}
++
++/* called with IRQ disabled */
++void ub_slab_uncharge(struct kmem_cache *cachep, void *objp)
++{
++	unsigned int size;
++	struct user_beancounter **ub_ref;
++
++	ub_ref = ub_slab_ptr(cachep, objp);
++	if (*ub_ref == NULL)
++		return;
++
++	dec_slab_charged(*ub_ref, cachep);
++	size = CHARGE_SIZE(kmem_cache_objuse(cachep));
++	ub_kmemsize_uncharge(*ub_ref, size);
++	put_beancounter(*ub_ref);
++	*ub_ref = NULL;
++}
++
++/*
++ * Pages accounting
++ */
++
++int ub_page_charge(struct page *page, int order, gfp_t mask)
++{
++	struct user_beancounter *ub;
++	unsigned long flags;
++
++	ub = NULL;
++	if (!(mask & __GFP_UBC))
++		goto out;
++
++	ub = get_beancounter(get_exec_ub());
++	if (ub == NULL)
++		goto out;
++
++	local_irq_save(flags);
++	if (ub_kmemsize_charge(ub, CHARGE_ORDER(order),
++				(mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
++		goto err;
++
++	inc_pages_charged(ub, order);
++	local_irq_restore(flags);
++out:
++	BUG_ON(page_ub(page) != NULL);
++	page_ub(page) = ub;
++	return 0;
++
++err:
++	local_irq_restore(flags);
++	BUG_ON(page_ub(page) != NULL);
++	put_beancounter(ub);
++	return -ENOMEM;
++}
++
++void ub_page_uncharge(struct page *page, int order)
++{
++	struct user_beancounter *ub;
++	unsigned long flags;
++
++	ub = page_ub(page);
++	if (ub == NULL)
++		return;
++
++	BUG_ON(ub->ub_magic != UB_MAGIC);
++	dec_pages_charged(ub, order);
++	local_irq_save(flags);
++	ub_kmemsize_uncharge(ub, CHARGE_ORDER(order));
++	local_irq_restore(flags);
++	put_beancounter(ub);
++	page_ub(page) = NULL;
++}
++
++/* 
++ * takes init_mm.page_table_lock 
++ * some outer lock to protect pages from vmalloced area must be held
++ */
++struct user_beancounter *vmalloc_ub(void *obj)
++{
++	struct page *pg;
++
++	pg = vmalloc_to_page(obj);
++	if (pg == NULL)
++		return NULL;
++
++	return page_ub(pg);
++}
++
++EXPORT_SYMBOL(vmalloc_ub);
++
++struct user_beancounter *mem_ub(void *obj)
++{
++	struct user_beancounter *ub;
++
++	if ((unsigned long)obj >= VMALLOC_START &&
++	    (unsigned long)obj  < VMALLOC_END)
++		ub = vmalloc_ub(obj);
++	else
++		ub = slab_ub(obj);
++
++	return ub;
++}
++
++EXPORT_SYMBOL(mem_ub);
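
A minimal usage sketch of the kmemsize precharge helpers above -- not
part of the patch; the wrapper name my_charge_small_object is an
illustrative assumption.  As the comments in this file state, both
helpers must be called with IRQs disabled:

	static int my_charge_small_object(struct user_beancounter *ub,
			unsigned long size)
	{
		unsigned long flags;
		int ret;

		local_irq_save(flags);
		/* small charges against the task's own beancounter hit the
		 * kmem_precharged fast path and avoid taking ub_lock */
		ret = ub_kmemsize_charge(ub, size, UB_HARD);
		local_irq_restore(flags);
		return ret;	/* 0 on success, -ENOMEM over the limit */
	}
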
+diff --git a/kernel/bc/misc.c b/kernel/bc/misc.c
+new file mode 100644
+index 0000000..20c28a7
+--- /dev/null
++++ b/kernel/bc/misc.c
+@@ -0,0 +1,455 @@
++/*
++ *  kernel/bc/misc.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/tty.h>
++#include <linux/tty_driver.h>
++#include <linux/signal.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/sched.h>
++#include <linux/module.h>
++
++#include <bc/beancounter.h>
++#include <bc/kmem.h>
++#include <bc/proc.h>
++
++#define UB_FILE_MINQUANT	3
++#define UB_FILE_MAXQUANT	10
++#define UB_FILE_INIQUANT	4
++
++static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
++		struct user_beancounter *ub, unsigned long *kmemsize);
++
++extern struct kmem_cache *filp_cachep;
++
++static inline unsigned long ub_file_kmemsize(unsigned long nr)
++{
++	return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr;
++}
++
++/*
++ * Task stuff
++ */
++
++static void init_task_sub(struct task_struct *parent,
++		struct task_struct *tsk,
++  		struct task_beancounter *old_bc)
++{
++	struct task_beancounter *new_bc;
++	struct user_beancounter *sub;
++
++	new_bc = &tsk->task_bc;
++	sub = old_bc->fork_sub;
++	new_bc->fork_sub = get_beancounter(sub);
++	new_bc->task_fnode = NULL;
++	new_bc->task_freserv = old_bc->task_freserv;
++	old_bc->task_freserv = NULL;
++	memset(&new_bc->task_data, 0, sizeof(new_bc->task_data));
++	new_bc->pgfault_handle = 0;
++	new_bc->pgfault_allot = 0;
++}
++
++void ub_init_task_bc(struct task_beancounter *tbc)
++{
++	tbc->file_precharged = 0;
++	tbc->file_quant = UB_FILE_INIQUANT;
++	tbc->file_count = 0;
++
++	tbc->kmem_precharged = 0;
++	tbc->dentry_alloc = 0;
++}
++
++int ub_task_charge(struct task_struct *parent, struct task_struct *task)
++{
++	struct task_beancounter *old_bc;
++	struct task_beancounter *new_bc;
++	struct user_beancounter *ub, *pub;
++	unsigned long file_nr, kmemsize;
++	unsigned long flags;
++
++	old_bc = &parent->task_bc;
++	ub = old_bc->fork_sub;
++	new_bc = &task->task_bc;
++	new_bc->task_ub = get_beancounter(ub);
++	new_bc->exec_ub = get_beancounter(ub);
++
++	pub = top_beancounter(ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC,
++					1, UB_HARD) < 0))
++		goto out_numproc;
++
++	ub_init_task_bc(new_bc);
++	file_nr = ub_file_precharge(new_bc, pub, &kmemsize);
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++
++	charge_beancounter_notop(ub, UB_NUMPROC, 1);
++	if (likely(file_nr)) {
++		charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
++		charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize);
++	}
++
++	init_task_sub(parent, task, old_bc);
++	return 0;
++
++out_numproc:
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++	__put_beancounter_batch(ub, 2);
++	return -ENOMEM;
++}
++
++extern atomic_t dbgpre;
++
++void ub_task_uncharge(struct task_struct *task)
++{
++	struct task_beancounter *task_bc;
++	struct user_beancounter *pub;
++	unsigned long file_nr, file_kmemsize;
++	unsigned long flags;
++
++	task_bc = &task->task_bc;
++	pub = top_beancounter(task_bc->task_ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	__uncharge_beancounter_locked(pub, UB_NUMPROC, 1);
++	file_nr = task_bc->file_precharged;
++	if (likely(file_nr))
++		__uncharge_beancounter_locked(pub,
++				UB_NUMFILE, file_nr);
++
++	/* see comment in ub_file_charge */
++	task_bc->file_precharged = 0;
++	file_kmemsize = ub_file_kmemsize(file_nr);
++	if (likely(file_kmemsize))
++		__uncharge_beancounter_locked(pub,
++				UB_KMEMSIZE, file_kmemsize);
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++
++	uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1);
++	if (likely(file_nr)) {
++		uncharge_beancounter_notop(task_bc->task_ub,
++				UB_NUMFILE, file_nr);
++		__put_beancounter_batch(task_bc->task_ub, file_nr);
++	}
++	if (likely(file_kmemsize))
++		uncharge_beancounter_notop(task_bc->task_ub,
++				UB_KMEMSIZE, file_kmemsize);
++}
++
++void ub_task_put(struct task_struct *task)
++{
++	struct task_beancounter *task_bc;
++	struct user_beancounter *pub;
++	unsigned long kmemsize, flags;
++
++	task_bc = &task->task_bc;
++
++	pub = top_beancounter(task_bc->task_ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	kmemsize = task_bc->kmem_precharged;
++	task_bc->kmem_precharged = 0;
++	if (likely(kmemsize))
++		__uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize);
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++	if (likely(kmemsize))
++		uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize);
++
++	put_beancounter(task_bc->exec_ub);
++	put_beancounter(task_bc->task_ub);
++	put_beancounter(task_bc->fork_sub);
++	/* can't be freed elsewhere, failures possible in the middle of fork */
++	if (task_bc->task_freserv != NULL)
++		kfree(task_bc->task_freserv);
++
++	task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
++	task_bc->task_ub = (struct user_beancounter *)0xdead100c;
++	BUG_ON(task_bc->kmem_precharged != 0);
++}
++
++/*
++ * Files and file locks.
++ */
++/*
++ * For NUMFILE, we avoid taking a lock and calling the charge function
++ * for every file.  Instead we charge in batches, keeping a local reserve
++ * on the task.  For experimental purposes, the batch size is adaptive and
++ * depends on the numfile barrier, the number of processes, and the history
++ * of successes and failures of batch charges.
++ *
++ * Per-task fields have the following meaning
++ *   file_precharged    number of files charged to beancounter in advance,
++ *   file_quant         logarithm of batch size
++ *   file_count         counter of charge successes, to reduce batch size
++ *                      fluctuations.
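++ *
++ * With the constants above, batches start at 2^UB_FILE_INIQUANT = 16
++ * files, can grow up to 2^UB_FILE_MAXQUANT = 1024 on repeated
++ * successes, and shrink toward 2^UB_FILE_MINQUANT = 8 after a failed
++ * batch charge.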
++ */
++static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
++		struct user_beancounter *ub, unsigned long *kmemsize)
++{
++	unsigned long n, kmem;
++
++	n = 1UL << task_bc->file_quant;
++	if (ub->ub_parms[UB_NUMPROC].held >
++			(ub->ub_parms[UB_NUMFILE].barrier >>
++						task_bc->file_quant))
++		goto nopre;
++	if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD)))
++		goto nopre;
++	kmem = ub_file_kmemsize(n);
++	if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE,
++					kmem, UB_HARD)))
++		goto nopre_kmem;
++
++	task_bc->file_precharged += n;
++	get_beancounter_batch(task_bc->task_ub, n);
++	task_bc->file_count++;
++	if (task_bc->file_quant < UB_FILE_MAXQUANT &&
++	    task_bc->file_count >= task_bc->file_quant) {
++		task_bc->file_quant++;
++		task_bc->file_count = 0;
++	}
++	*kmemsize = kmem;
++	return n;
++
++nopre_kmem:
++	__uncharge_beancounter_locked(ub, UB_NUMFILE, n);
++nopre:
++	if (task_bc->file_quant > UB_FILE_MINQUANT)
++		task_bc->file_quant--;
++	task_bc->file_count = 0;
++	return 0;
++}
++
++int ub_file_charge(struct file *f)
++{
++	struct user_beancounter *ub, *pub;
++	struct task_beancounter *task_bc;
++	unsigned long file_nr, kmem;
++	unsigned long flags;
++	int err;
++
++	task_bc = &current->task_bc;
++	ub = get_exec_ub();
++	if (unlikely(ub != task_bc->task_ub))
++		goto just_charge;
++
++	if (likely(task_bc->file_precharged > 0)) {
++		/*
++		 * files are put via RCU in 2.6.16 so during
++		 * files are put via RCU in 2.6.16, so during
++		 * this decrement an IRQ can happen and the called
++		 * ub_file_uncharge() can mess up file_precharged
++		 * ub_task_uncharge() is called via RCU also so no
++		 * protection is needed there
++		 *
++		 * Xemul
++		 */
++
++		local_irq_save(flags);
++		task_bc->file_precharged--;
++		local_irq_restore(flags);
++
++		f->f_ub = ub;
++		return 0;
++	}
++
++	pub = top_beancounter(ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	file_nr = ub_file_precharge(task_bc, pub, &kmem);
++	if (unlikely(!file_nr))
++		goto last_try;
++	spin_unlock(&pub->ub_lock);
++	task_bc->file_precharged--;
++	local_irq_restore(flags);
++
++	charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
++	charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
++	f->f_ub = ub;
++	return 0;
++
++just_charge:
++	pub = top_beancounter(ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++last_try:
++	kmem = ub_file_kmemsize(1);
++	err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD);
++	if (likely(!err)) {
++		err = __charge_beancounter_locked(pub, UB_KMEMSIZE,
++				kmem, UB_HARD);
++		if (unlikely(err))
++			__uncharge_beancounter_locked(pub, UB_NUMFILE, 1);
++	}
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++	if (likely(!err)) {
++		charge_beancounter_notop(ub, UB_NUMFILE, 1);
++		charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
++		f->f_ub = get_beancounter(ub);
++	}
++	return err;
++}
++
++void ub_file_uncharge(struct file *f)
++{
++	struct user_beancounter *ub, *pub;
++	struct task_beancounter *task_bc;
++	unsigned long nr;
++
++	ub = f->f_ub;
++	task_bc = &current->task_bc;
++	if (likely(ub == task_bc->task_ub)) {
++		task_bc->file_precharged++;
++		pub = top_beancounter(ub);
++		if (ub_barrier_farnr(pub, UB_NUMFILE) &&
++				ub_barrier_farsz(pub, UB_KMEMSIZE))
++			return;
++		if (task_bc->file_precharged < (1UL << task_bc->file_quant))
++			return;
++		nr = task_bc->file_precharged
++			- (1UL << (task_bc->file_quant - 1));
++		task_bc->file_precharged -= nr;
++		__put_beancounter_batch(ub, nr);
++		uncharge_beancounter(ub, UB_NUMFILE, nr);
++		uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr));
++	} else {
++		uncharge_beancounter(ub, UB_NUMFILE, 1);
++		uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1));
++		put_beancounter(ub);
++	}
++}
++
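++/*
++ * Worked example for the trimming above (illustrative numbers): with
++ * file_quant == 4 the batch is 1 << 4 == 16.  Once file_precharged
++ * reaches 16, nr = 16 - (1 << 3) = 8 charges are returned to the
++ * beancounter, so the task keeps half a batch in reserve for future
++ * opens instead of bouncing the whole batch back and forth.
++ */
++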
++int ub_flock_charge(struct file_lock *fl, int hard)
++{
++	struct user_beancounter *ub;
++	int err;
++
++	/* No need for get_beancounter here: the slab already holds a reference */
++	ub = slab_ub(fl);
++	if (ub == NULL)
++		return 0;
++
++	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
++	if (!err)
++		fl->fl_charged = 1;
++	return err;
++}
++
++void ub_flock_uncharge(struct file_lock *fl)
++{
++	struct user_beancounter *ub;
++
++	/* The ub reference will be put by the slab */
++	ub = slab_ub(fl);
++	if (ub == NULL || !fl->fl_charged)
++		return;
++
++	uncharge_beancounter(ub, UB_NUMFLOCK, 1);
++	fl->fl_charged = 0;
++}
++
++/*
++ * Signal handling
++ */
++
++static int do_ub_siginfo_charge(struct user_beancounter *ub,
++		unsigned long size)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD))
++		goto out_kmem;
++
++	if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD))
++		goto out_num;
++
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return 0;
++
++out_num:
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
++out_kmem:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return -ENOMEM;
++}
++
++static void do_ub_siginfo_uncharge(struct user_beancounter *ub,
++		unsigned long size)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
++	__uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub)
++{
++	unsigned long size;
++	struct user_beancounter *p, *q;
++
++	size = CHARGE_SIZE(kmem_obj_objuse(sq));
++	for (p = ub; p != NULL; p = p->parent) {
++		if (do_ub_siginfo_charge(p, size))
++			goto unroll;
++	}
++
++	sq->sig_ub = get_beancounter(ub);
++	return 0;
++
++unroll:
++	for (q = ub; q != p; q = q->parent)
++		do_ub_siginfo_uncharge(q, size);
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(ub_siginfo_charge);
++
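++/*
++ * Note on the unroll path above: if the charge fails at some ancestor
++ * p, the loop "for (q = ub; q != p; q = q->parent)" uncharges exactly
++ * the beancounters that were charged, i.e. ub up to but not including
++ * p, so the held counters stay balanced at every level of the tree.
++ */
++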
++void ub_siginfo_uncharge(struct sigqueue *sq)
++{
++	unsigned long size;
++	struct user_beancounter *ub, *p;
++
++	p = ub = sq->sig_ub;
++	sq->sig_ub = NULL;
++	size = CHARGE_SIZE(kmem_obj_objuse(sq));
++	for (; ub != NULL; ub = ub->parent)
++		do_ub_siginfo_uncharge(ub, size);
++	put_beancounter(p);
++}
++
++/*
++ * PTYs
++ */
++
++int ub_pty_charge(struct tty_struct *tty)
++{
++	struct user_beancounter *ub;
++	int retval;
++
++	ub = slab_ub(tty);
++	retval = 0;
++	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
++			!test_bit(TTY_CHARGED, &tty->flags)) {
++		retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
++		if (!retval)
++			set_bit(TTY_CHARGED, &tty->flags);
++	}
++	return retval;
++}
++
++void ub_pty_uncharge(struct tty_struct *tty)
++{
++	struct user_beancounter *ub;
++
++	ub = slab_ub(tty);
++	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
++			test_bit(TTY_CHARGED, &tty->flags)) {
++		uncharge_beancounter(ub, UB_NUMPTY, 1);
++		clear_bit(TTY_CHARGED, &tty->flags);
++	}
++}
+diff --git a/kernel/bc/net.c b/kernel/bc/net.c
+new file mode 100644
+index 0000000..e0244b4
+--- /dev/null
++++ b/kernel/bc/net.c
+@@ -0,0 +1,1160 @@
++/*
++ *  linux/kernel/bc/net.c
++ *
++ *  Copyright (C) 1998-2004  Andrey V. Savochkin <saw at saw.sw.com.sg>
++ *  Copyright (C) 2005 SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * TODO:
++ *   - sizeof(struct inode) charge
++ *   = tcp_mem_schedule() feedback based on ub limits
++ *   + measures so that one socket won't exhaust all send buffers,
++ *     see bug in bugzilla
++ *   = sk->socket check for NULL in snd_wakeups
++ *     (tcp_write_space checks for NULL itself)
++ *   + in tcp_close(), orphaned socket abortion should be based on ubc
++ *     resources (same in tcp_out_of_resources)
++ *     Beancounter should also have a separate orphaned socket counter...
++ *   + for rcv, in-order segment should be accepted
++ *     if only barrier is exceeded
++ *   = tcp_rmem_schedule() feedback based on ub limits
++ *   - repair forward_alloc mechanism for receive buffers
++ *     Its idea is that some buffer space is pre-charged so that the receive fast
++ *     path doesn't need to take spinlocks and do other heavy stuff
++ *   + tcp_prune_queue actions based on ub limits
++ *   + window adjustments depending on available buffers for receive
++ *   - window adjustments depending on available buffers for send
++ *   + race around usewreserv
++ *   + avoid allocating new page for each tiny-gram, see letter from ANK
++ *   + rename ub_sock_lock
++ *   + sk->sleep wait queue probably can be used for all wakeups, and
++ *     sk->ub_wait is unnecessary
++ *   + for UNIX sockets, the current algorithm will lead to
++ *     UB_UNIX_MINBUF-sized messages only in the non-blocking case
++ *   - charge for af_packet sockets
++ *   + all datagram sockets should be charged to NUMUNIXSOCK
++ *   - we do not charge for skb copies and clones staying in device queues
++ *   + live-lock if number of sockets is big and buffer limits are small
++ *     [diff-ubc-dbllim3]
++ *   - check that multiple readers/writers on the same socket won't cause fatal
++ *     consequences
++ *   - check allocation/charge orders
++ *   + There is a potential problem with callback_lock.  In *snd_wakeup we
++ *     take the beancounter first; in sock_def_error_report, callback_lock
++ *     first and then the beancounter.  This is not a problem if
++ *     callback_lock is taken read-only, but anyway...
++ *   - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator
++ * General kernel problems:
++ *   - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC
++ *     notification won't get signals
++ *   - datagram_poll looks racy
++ *
++ */
++
++#include <linux/net.h>
++#include <linux/slab.h>
++#include <linux/gfp.h>
++#include <linux/err.h>
++#include <linux/socket.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++
++#include <net/sock.h>
++#include <net/tcp.h>
++
++#include <bc/beancounter.h>
++#include <bc/net.h>
++#include <bc/debug.h>
++
++/* for some reason it is not used currently */
++#define UB_SOCK_MAINTAIN_WMEMPRESSURE	0
++
++
++/* Skb truesize definition. Bad place. Den */
++
++static inline int skb_chargesize_head(struct sk_buff *skb)
++{
++	return skb_charge_size(skb_end_pointer(skb) - skb->head +
++				sizeof(struct skb_shared_info));
++}
++
++int skb_charge_fullsize(struct sk_buff *skb)
++{
++	int chargesize;
++	struct sk_buff *skbfrag;
++
++	chargesize = skb_chargesize_head(skb) +
++		PAGE_SIZE * skb_shinfo(skb)->nr_frags;
++	if (likely(skb_shinfo(skb)->frag_list == NULL))
++		return chargesize;
++	for (skbfrag = skb_shinfo(skb)->frag_list;
++	     skbfrag != NULL;
++	     skbfrag = skbfrag->next) {
++		chargesize += skb_charge_fullsize(skbfrag);
++	}
++	return chargesize;
++}
++EXPORT_SYMBOL(skb_charge_fullsize);
++
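++/*
++ * Worked example (illustrative): a linear skb charges only its head,
++ * skb_charge_size(skb_end_pointer(skb) - skb->head +
++ * sizeof(struct skb_shared_info)); an skb with two page fragments adds
++ * 2 * PAGE_SIZE on top of that, and every skb hanging off frag_list is
++ * charged recursively by the same rule.
++ */
++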
++static int ub_sock_makewreserv_locked(struct sock *sk,
++		int bufid, unsigned long size);
++
++int __ub_too_many_orphans(struct sock *sk, int count)
++{
++	struct user_beancounter *ub;
++
++	if (sock_has_ubc(sk)) {
++		ub = top_beancounter(sock_bc(sk)->ub);
++		if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2)
++			return 1;
++	}
++	return 0;
++}
++
++/*
++ * Queueing
++ */
++
++static void ub_sock_snd_wakeup(struct user_beancounter *ub)
++{
++	struct list_head *p;
++	struct sock *sk;
++	struct sock_beancounter *skbc;
++	struct socket *sock;
++	unsigned long added;
++
++	while (!list_empty(&ub->ub_other_sk_list)) {
++		p = ub->ub_other_sk_list.next;
++		skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
++		sk = skbc_sock(skbc);
++
++		added = 0;
++		sock = sk->sk_socket;
++		if (sock == NULL) {
++			/* sk being destroyed */
++			list_del_init(&skbc->ub_sock_list);
++			continue;
++		}
++
++		ub_debug(UBD_NET_SLEEP,
++				"Checking queue, waiting %lu, reserv %lu\n",
++				skbc->ub_waitspc, skbc->poll_reserv);
++		added = -skbc->poll_reserv;
++		if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF,
++					skbc->ub_waitspc))
++			break;
++		added += skbc->poll_reserv;
++
++		list_del_init(&skbc->ub_sock_list);
++
++		/*
++		 * See comments in ub_tcp_snd_wakeup.
++		 * Locking note: both unix_write_space and
++		 * sock_def_write_space take callback_lock themselves.
++		 * We take it here just to be on the safe side and to
++		 * act the same way as ub_tcp_snd_wakeup does.
++		 */
++		sock_hold(sk);
++		read_lock(&sk->sk_callback_lock);
++		spin_unlock(&ub->ub_lock);
++
++		sk->sk_write_space(sk);
++		read_unlock(&sk->sk_callback_lock);
++
++		if (skbc->ub != ub && added)
++			charge_beancounter_notop(skbc->ub,
++				       	UB_OTHERSOCKBUF, added);
++		sock_put(sk);
++
++		spin_lock(&ub->ub_lock);
++	}
++}
++
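++/*
++ * The "added = -skbc->poll_reserv; ...; added += skbc->poll_reserv"
++ * idiom used above (and in several functions below) computes the delta
++ * of poll_reserv across ub_sock_makewreserv_locked; with update() as a
++ * placeholder for that recompute step:
++ *
++ *	delta = -x;
++ *	update(&x);
++ *	delta += x;
++ *
++ * leaves delta == new - old, i.e. how much was newly charged to the
++ * top beancounter and must be propagated to the intermediate levels
++ * with charge_beancounter_notop().
++ */
++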
++static void ub_tcp_snd_wakeup(struct user_beancounter *ub)
++{
++	struct list_head *p;
++	struct sock *sk;
++	struct sock_beancounter *skbc;
++	struct socket *sock;
++	unsigned long added;
++
++	while (!list_empty(&ub->ub_tcp_sk_list)) {
++		p = ub->ub_tcp_sk_list.next;
++		skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
++		sk = skbc_sock(skbc);
++
++		added = 0;
++		sock = sk->sk_socket;
++		if (sock == NULL) {
++			/* sk being destroyed */
++			list_del_init(&skbc->ub_sock_list);
++			continue;
++		}
++
++		ub_debug(UBD_NET_SLEEP,
++				"Checking queue, waiting %lu, reserv %lu\n",
++				skbc->ub_waitspc, skbc->poll_reserv);
++		added = -skbc->poll_reserv;
++		if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF,
++					skbc->ub_waitspc))
++			break;
++		added += skbc->poll_reserv;
++
++		list_del_init(&skbc->ub_sock_list);
++
++		/*
++		 * Send async notifications and wake up.
++		 * Locking note: we get callback_lock here because
++		 * tcp_write_space is over-optimistic about calling context
++		 * (socket lock is presumed).  So we get the lock here although
++		 * it belongs to the callback.
++		 */
++		sock_hold(sk);
++		read_lock(&sk->sk_callback_lock);
++		spin_unlock(&ub->ub_lock);
++
++		sk->sk_write_space(sk);
++		read_unlock(&sk->sk_callback_lock);
++
++		if (skbc->ub != ub && added)
++			charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added);
++		sock_put(sk);
++
++		spin_lock(&ub->ub_lock);
++	}
++}
++
++void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size)
++{
++	unsigned long flags;
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long added_reserv;
++
++	if (!sock_has_ubc(sk))
++		return;
++
++	skbc = sock_bc(sk);
++	ub = top_beancounter(skbc->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size);
++	added_reserv = -skbc->poll_reserv;
++	if (!ub_sock_makewreserv_locked(sk, res, size)) {
++		/*
++		 * It looks a bit hackish, but it is compatible with both
++		 * wait_for_xx_ubspace and poll.
++		 * This __set_current_state is equivalent to a wakeup event
++		 * right after spin_unlock_irqrestore.
++		 */
++		__set_current_state(TASK_RUNNING);
++		added_reserv += skbc->poll_reserv;
++		spin_unlock_irqrestore(&ub->ub_lock, flags);
++		if (added_reserv)
++			charge_beancounter_notop(skbc->ub, res, added_reserv);
++		return;
++	}
++
++	ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n");
++	skbc->ub_waitspc = size;
++	if (!list_empty(&skbc->ub_sock_list)) {
++		ub_debug(UBD_NET_SOCKET,
++				"re-adding socket to beancounter %p.\n", ub);
++		goto out;
++	}
++
++	switch (res) {
++		case UB_TCPSNDBUF:
++			list_add_tail(&skbc->ub_sock_list,
++					&ub->ub_tcp_sk_list);
++			break;
++		case UB_OTHERSOCKBUF:
++			list_add_tail(&skbc->ub_sock_list,
++					&ub->ub_other_sk_list);
++			break;
++		default:
++			BUG();
++	}
++out:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++EXPORT_SYMBOL(ub_sock_snd_queue_add);
++
++long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size)
++{
++	DECLARE_WAITQUEUE(wait, current);
++
++	add_wait_queue(sk->sk_sleep, &wait);
++	for (;;) {
++		if (signal_pending(current))
++			break;
++		set_current_state(TASK_INTERRUPTIBLE);
++		if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size))
++			break;
++
++		if (sk->sk_shutdown & SEND_SHUTDOWN)
++			break;
++		if (sk->sk_err)
++			break;
++		ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size);
++		timeo = schedule_timeout(timeo);
++	}
++	__set_current_state(TASK_RUNNING);
++	remove_wait_queue(sk->sk_sleep, &wait);
++	return timeo;
++}
++
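++/*
++ * A hedged sketch of the intended caller pattern (the real callers
++ * live in the datagram send paths; sock_sndtimeo and sock_intr_errno
++ * are the stock kernel helpers):
++ *
++ *	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
++ *	while (ub_sock_getwres_other(sk, size)) {
++ *		if (!timeo)
++ *			return -EAGAIN;
++ *		timeo = ub_sock_wait_for_space(sk, timeo, size);
++ *		if (signal_pending(current))
++ *			return sock_intr_errno(timeo);
++ *	}
++ */
++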
++void ub_sock_sndqueuedel(struct sock *sk)
++{
++	struct user_beancounter *ub;
++	struct sock_beancounter *skbc;
++	unsigned long flags;
++
++	if (!sock_has_ubc(sk))
++		return;
++	skbc = sock_bc(sk);
++
++	/* race with write_space callback of other socket */
++	ub = top_beancounter(skbc->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	list_del_init(&skbc->ub_sock_list);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++/*
++ * Helpers
++ */
++
++static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
++		       unsigned long size, int resource)
++{
++	WARN_ON_ONCE(skb_bc(skb)->ub != NULL);
++
++	skb_bc(skb)->ub = sock_bc(sk)->ub;
++	skb_bc(skb)->charged = size;
++	skb_bc(skb)->resource = resource;
++}
++
++void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
++		       unsigned long size, int resource)
++{
++	if (!sock_has_ubc(sk))
++		return;
++
++	if (sock_bc(sk)->ub == NULL)
++		BUG();
++
++	__ub_skb_set_charge(skb, sk, size, resource);
++
++	/* Ugly, ugly: an skb in the sk write queue can live without a ref to sk */
++	if (skb->sk == NULL)
++		skb->sk = sk;
++}
++
++EXPORT_SYMBOL(ub_skb_set_charge);
++
++static inline void ub_skb_set_uncharge(struct sk_buff *skb)
++{
++	skb_bc(skb)->ub = NULL;
++	skb_bc(skb)->charged = 0;
++	skb_bc(skb)->resource = 0;
++}
++
++static void ub_update_rmem_thres(struct sock_beancounter *skub)
++{
++	struct user_beancounter *ub;
++
++	if (skub && skub->ub) {
++		ub = top_beancounter(skub->ub);
++		ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier /
++			(ub->ub_parms[UB_NUMTCPSOCK].held + 1);
++	}
++}
++
++static inline void ub_sock_wcharge_dec(struct sock *sk,
++		unsigned long chargesize)
++{
++	/* The check sk->sk_family != PF_NETLINK is made because the skb is
++	 * queued to the kernel end of the socket while charged to the user
++	 * one.  Den */
++	if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) {
++		if (sock_bc(sk)->ub_wcharged > chargesize)
++			sock_bc(sk)->ub_wcharged -= chargesize;
++		else
++			sock_bc(sk)->ub_wcharged = 0;
++	}
++}
++
++/*
++ * Charge socket number
++ */
++
++static inline void sk_alloc_beancounter(struct sock *sk)
++{
++	struct sock_beancounter *skbc;
++
++	skbc = sock_bc(sk);
++	memset(skbc, 0, sizeof(struct sock_beancounter));
++}
++
++static inline void sk_free_beancounter(struct sock *sk)
++{
++}
++
++static int __sock_charge(struct sock *sk, int res)
++{
++	struct sock_beancounter *skbc;
++	struct user_beancounter *cub, *ub;
++	unsigned long added_reserv, added_forw;
++	unsigned long flags;
++
++	cub = get_exec_ub();
++	if (unlikely(cub == NULL))
++		return 0;
++
++	sk_alloc_beancounter(sk);
++	skbc = sock_bc(sk);
++	INIT_LIST_HEAD(&skbc->ub_sock_list);
++
++	ub = top_beancounter(cub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0))
++		goto out_limit;
++
++	added_reserv = 0;
++	added_forw = 0;
++	if (res == UB_NUMTCPSOCK) {
++		added_reserv = skb_charge_size(MAX_TCP_HEADER +
++				1500 - sizeof(struct iphdr) -
++					sizeof(struct tcphdr));
++		added_reserv *= 4;
++		ub->ub_parms[UB_TCPSNDBUF].held += added_reserv;
++		if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) {
++			ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv;
++			added_reserv = 0;
++		}
++		skbc->poll_reserv = added_reserv;
++
++		added_forw = SK_MEM_QUANTUM * 4;
++		ub->ub_parms[UB_TCPRCVBUF].held += added_forw;
++		if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
++			ub->ub_parms[UB_TCPRCVBUF].held -= added_forw;
++			added_forw = 0;
++		}
++		skbc->forw_space = added_forw;
++	}
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	charge_beancounter_notop(cub, res, 1);
++	if (added_reserv)
++		charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv);
++	if (added_forw)
++		charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw);
++
++	skbc->ub = get_beancounter(cub);
++	return 0;
++
++out_limit:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	sk_free_beancounter(sk);
++	return -ENOMEM;
++}
++
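++/*
++ * Sizing note (illustrative): the send reserve above is four charge
++ * units of one MTU-sized segment each (MAX_TCP_HEADER plus a 1500-byte
++ * frame minus the IP and TCP headers), and the receive reserve is
++ * 4 * SK_MEM_QUANTUM (four pages on this kernel); both are taken only
++ * while the corresponding buffer resource is far below its barrier
++ * (ub_barrier_farsz) and are given back immediately otherwise.
++ */
++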
++int ub_tcp_sock_charge(struct sock *sk)
++{
++	int ret;
++
++	ret = __sock_charge(sk, UB_NUMTCPSOCK);
++	ub_update_rmem_thres(sock_bc(sk));
++
++	return ret;
++}
++
++int ub_other_sock_charge(struct sock *sk)
++{
++	return __sock_charge(sk, UB_NUMOTHERSOCK);
++}
++
++EXPORT_SYMBOL(ub_other_sock_charge);
++
++int ub_sock_charge(struct sock *sk, int family, int type)
++{
++	return (IS_TCP_SOCK(family, type) ?
++			ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk));
++}
++
++EXPORT_SYMBOL(ub_sock_charge);
++
++/*
++ * Uncharge socket number
++ */
++
++void ub_sock_uncharge(struct sock *sk)
++{
++	int is_tcp_sock;
++	unsigned long flags;
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long reserv, forw;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return;
++
++	is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type);
++	skbc = sock_bc(sk);
++	ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk);
++
++	ub = top_beancounter(skbc->ub);
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (!list_empty(&skbc->ub_sock_list)) {
++		ub_debug(UBD_NET_SOCKET,
++			 "ub_sock_uncharge: removing from ub(%p) queue.\n",
++			 skbc);
++		list_del_init(&skbc->ub_sock_list);
++	}
++
++	reserv = skbc->poll_reserv;
++	forw = skbc->forw_space;
++	__uncharge_beancounter_locked(ub,
++			(is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++			reserv);
++	if (forw)
++		__uncharge_beancounter_locked(ub,
++				(is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
++				forw);
++	__uncharge_beancounter_locked(ub,
++			(is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++	ub_sock_wcharge_dec(sk, reserv);
++	if (unlikely(skbc->ub_wcharged))
++		printk(KERN_WARNING
++		       "ub_sock_uncharge: wch=%lu for ub %p (%d).\n",
++		       skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid);
++	skbc->poll_reserv = 0;
++	skbc->forw_space = 0;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	uncharge_beancounter_notop(skbc->ub,
++			(is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++			reserv);
++	if (forw)
++		uncharge_beancounter_notop(skbc->ub,
++				(is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
++				forw);
++	uncharge_beancounter_notop(skbc->ub,
++			(is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++	put_beancounter(skbc->ub);
++	sk_free_beancounter(sk);
++}
++
++/*
++ * Special case for netlink_dump - (un)charges precalculated size
++ */
++
++int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)
++{
++	int ret;
++	unsigned long chargesize;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	chargesize = skb_charge_fullsize(skb);
++	ret = charge_beancounter(sock_bc(sk)->ub,
++			UB_OTHERSOCKBUF, chargesize, UB_HARD);
++	if (ret < 0)
++		return ret;
++	ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
++	return ret;
++}
++
++/*
++ * Poll reserve accounting
++ *
++ * This is the core of socket buffer management (along with the
++ * queueing/wakeup functions).  The rest of the buffer accounting either
++ * calls these functions or repeats parts of their logic for some simpler
++ * cases.
++ */
++
++static int ub_sock_makewreserv_locked(struct sock *sk,
++		int bufid, unsigned long size)
++{
++	unsigned long wcharge_added;
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++
++	skbc = sock_bc(sk);
++	if (skbc->poll_reserv >= size) /* no work to be done */
++		goto out;
++
++	ub = top_beancounter(skbc->ub);
++	ub->ub_parms[bufid].held += size - skbc->poll_reserv;
++
++	wcharge_added = 0;
++	/*
++	 * Logic:
++	 *  1) when used memory hits barrier, we set wmem_pressure;
++	 *     wmem_pressure is reset under barrier/2;
++	 *     between barrier/2 and barrier we limit per-socket buffer growth;
++	 *  2) each socket is guaranteed to get (limit-barrier)/maxsockets
++	 *     calculated on the basis of memory eaten after the barrier is hit
++	 */
++	skbc = sock_bc(sk);
++#if UB_SOCK_MAINTAIN_WMEMPRESSURE
++	if (!ub_hfbarrier_hit(ub, bufid)) {
++		if (ub->ub_wmem_pressure)
++			ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 "
++				"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++				sk, size, skbc->poll_reserv,
++				ub->ub_parms[bufid].held,
++				skbc->ub_wcharged, sk->sk_sndbuf);
++		ub->ub_wmem_pressure = 0;
++	}
++#endif
++	if (ub_barrier_hit(ub, bufid)) {
++#if UB_SOCK_MAINTAIN_WMEMPRESSURE
++		if (!ub->ub_wmem_pressure)
++			ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 "
++				"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++				sk, size, skbc->poll_reserv,
++				ub->ub_parms[bufid].held,
++				skbc->ub_wcharged, sk->sk_sndbuf);
++		ub->ub_wmem_pressure = 1;
++#endif
++		if (sk->sk_family == PF_NETLINK)
++			goto unroll;
++		wcharge_added = size - skbc->poll_reserv;
++		skbc->ub_wcharged += wcharge_added;
++		if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit +
++				ub->ub_parms[bufid].barrier >
++					ub->ub_parms[bufid].limit)
++			goto unroll_wch;
++	}
++	if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit)
++		goto unroll;
++
++	ub_adjust_maxheld(ub, bufid);
++	skbc->poll_reserv = size;
++out:
++	return 0;
++
++unroll_wch:
++	skbc->ub_wcharged -= wcharge_added;
++unroll:
++	ub_debug(UBD_NET_SEND,
++			"makewres: deny "
++			"sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++			sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held,
++			skbc->ub_wcharged, sk->sk_sndbuf);
++	ub->ub_parms[bufid].failcnt++;
++	ub->ub_parms[bufid].held -= size - skbc->poll_reserv;
++
++	if (sk->sk_socket != NULL) {
++		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
++		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++	}
++	return -ENOMEM;
++}
++
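++/*
++ * The wcharged guarantee check above in numbers (illustrative values):
++ * the deny condition
++ *
++ *	wcharged * socklimit + barrier > limit
++ *
++ * is equivalent to wcharged > (limit - barrier) / socklimit.  With a
++ * TCPSNDBUF barrier of 1 MB, a limit of 2 MB and a NUMTCPSOCK limit of
++ * 128, a socket is cut off once it holds more than 8 KB charged past
++ * the barrier, so each socket is guaranteed its share of the
++ * barrier-to-limit gap.
++ */
++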
++int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size)
++{
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long flags;
++	unsigned long added_reserv;
++	int err;
++
++	skbc = sock_bc(sk);
++
++	/*
++	 * This function provides that there is sufficient reserve upon return
++	 * This function guarantees a sufficient reserve upon return only if
++	 * sk has only one user.  We can check poll_reserv without
++	 */
++	if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size))
++		return 0;
++
++	ub = top_beancounter(skbc->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	added_reserv = -skbc->poll_reserv;
++	err = ub_sock_makewreserv_locked(sk, bufid, size);
++	added_reserv += skbc->poll_reserv;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	if (added_reserv)
++		charge_beancounter_notop(skbc->ub, bufid, added_reserv);
++
++	return err;
++}
++
++EXPORT_SYMBOL(ub_sock_make_wreserv);
++
++int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size)
++{
++	struct sock_beancounter *skbc;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	/* optimize for the case if socket has sufficient reserve */
++	ub_sock_make_wreserv(sk, bufid, size);
++	skbc = sock_bc(sk);
++	if (likely(skbc->poll_reserv >= size)) {
++		skbc->poll_reserv -= size;
++		return 0;
++	}
++	return -ENOMEM;
++}
++
++EXPORT_SYMBOL(ub_sock_get_wreserv);
++
++static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid,
++		unsigned long size, unsigned long ressize)
++{
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long extra;
++	unsigned long flags;
++
++	skbc = sock_bc(sk);
++	ub = top_beancounter(skbc->ub);
++
++	extra = 0;
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	skbc->poll_reserv += size;
++	if (skbc->poll_reserv > ressize) {
++		extra = skbc->poll_reserv - ressize;
++		ub_sock_wcharge_dec(sk, extra);
++		skbc->poll_reserv = ressize;
++
++		__uncharge_beancounter_locked(ub, bufid, extra);
++		if (bufid == UB_TCPSNDBUF)
++			ub_tcp_snd_wakeup(ub);
++		else
++			ub_sock_snd_wakeup(ub);
++	}
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	if (extra)
++		uncharge_beancounter_notop(skbc->ub, bufid, extra);
++}
++
++void ub_sock_ret_wreserv(struct sock *sk, int bufid,
++		unsigned long size, unsigned long ressize)
++{
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return;
++
++	skbc = sock_bc(sk);
++	ub = top_beancounter(skbc->ub);
++	/* check if the reserve can be kept */
++	if (ub_barrier_farsz(ub, bufid)) {
++		skbc->poll_reserv += size;
++		return;
++	}
++	ub_sock_do_ret_wreserv(sk, bufid, size, ressize);
++}
++
++/*
++ * UB_DGRAMRCVBUF
++ */
++
++static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++	unsigned long chargesize;
++
++	chargesize = skb_charge_fullsize(skb);
++	if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF,
++				 chargesize, UB_HARD))
++		return -ENOMEM;
++
++	ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF);
++	return 0;
++}
++
++int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	if (IS_TCP_SOCK(sk->sk_family, sk->sk_type))
++		return ub_tcprcvbuf_charge(sk, skb);
++	else
++		return ub_dgramrcvbuf_charge(sk, skb);
++}
++
++EXPORT_SYMBOL(ub_sockrcvbuf_charge);
++
++static void ub_sockrcvbuf_uncharge(struct sk_buff *skb)
++{
++	uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF,
++			     skb_bc(skb)->charged);
++	ub_skb_set_uncharge(skb);
++}
++
++/*
++ * UB_TCPRCVBUF
++ */
++
++int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb,
++			    enum ub_severity strict)
++{
++	int retval;
++	unsigned long flags;
++	struct user_beancounter *ub;
++	struct sock_beancounter *skbc;
++	unsigned long chargesize;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++	skbc = sock_bc(sk);
++
++	chargesize = skb_charge_fullsize(skb);
++	if (likely(skbc->forw_space >= chargesize)) {
++		skbc->forw_space -= chargesize;
++		__ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
++		return 0;
++	}
++
++	/*
++	 * Memory pressure reactions:
++	 *  1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND)
++	 *  2) set UB_RMEM_SHRINK and tcp_clamp_window()
++	 *     tcp_collapse_queues() if rmem_alloc > rcvbuf
++	 *  3) drop OFO, tcp_purge_ofo()
++	 *  4) drop all.
++	 * Currently, we do #2 and #3 at once (which means that current
++	 * collapsing of OFO queue in tcp_collapse_queues() is a waste of time,
++	 * for example...)
++	 * On memory pressure we jump straight from no reaction (#0) to #3,
++	 * and when the pressure subsides, to #1.
++	 */
++	retval = 0;
++	ub = top_beancounter(sock_bc(sk)->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_parms[UB_TCPRCVBUF].held += chargesize;
++	if (ub->ub_parms[UB_TCPRCVBUF].held >
++			ub->ub_parms[UB_TCPRCVBUF].barrier &&
++			strict != UB_FORCE)
++		goto excess;
++	ub_adjust_maxheld(ub, UB_TCPRCVBUF);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++out:
++	if (retval == 0) {
++		charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF,
++				chargesize);
++		ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
++	}
++	return retval;
++
++excess:
++	ub->ub_rmem_pressure = UB_RMEM_SHRINK;
++	if (strict == UB_HARD)
++		retval = -ENOMEM;
++	if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit)
++		retval = -ENOMEM;
++	/*
++	 * We try to leave numsock*maxadvmss as a reserve for sockets not
++	 * queueing any data yet (if the difference between the barrier and the
++	 * limit is enough for this reserve).
++	 */
++	if (ub->ub_parms[UB_TCPRCVBUF].held +
++			ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss
++			> ub->ub_parms[UB_TCPRCVBUF].limit &&
++			atomic_read(&sk->sk_rmem_alloc))
++		retval = -ENOMEM;
++	if (retval) {
++		ub->ub_parms[UB_TCPRCVBUF].held -= chargesize;
++		ub->ub_parms[UB_TCPRCVBUF].failcnt++;
++	}
++	ub_adjust_maxheld(ub, UB_TCPRCVBUF);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	goto out;
++}
++EXPORT_SYMBOL(ub_sock_tcp_chargerecv);
++
++static void ub_tcprcvbuf_uncharge(struct sk_buff *skb)
++{
++	unsigned long flags;
++	unsigned long held, bar;
++	int prev_pres;
++	struct user_beancounter *ub;
++
++	ub = top_beancounter(skb_bc(skb)->ub);
++	if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
++		sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged;
++		ub_skb_set_uncharge(skb);
++		return;
++	}
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) {
++		printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n",
++				skb_bc(skb)->charged,
++				ub, ub->ub_parms[UB_TCPRCVBUF].held);
++		/* ass-saving bung */
++		skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held;
++	}
++	ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged;
++	held = ub->ub_parms[UB_TCPRCVBUF].held;
++	bar = ub->ub_parms[UB_TCPRCVBUF].barrier;
++	prev_pres = ub->ub_rmem_pressure;
++	if (held <= bar - (bar >> 2))
++		ub->ub_rmem_pressure = UB_RMEM_EXPAND;
++	else if (held <= bar)
++		ub->ub_rmem_pressure = UB_RMEM_KEEP;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF,
++			skb_bc(skb)->charged);
++	ub_skb_set_uncharge(skb);
++}
++
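++/*
++ * The rmem pressure levels in numbers (illustrative): with a 4 MB
++ * TCPRCVBUF barrier, dropping held to 3 MB or below (bar - bar/4)
++ * switches to UB_RMEM_EXPAND, anything up to 4 MB keeps UB_RMEM_KEEP,
++ * and the charge path (ub_sock_tcp_chargerecv) sets UB_RMEM_SHRINK as
++ * soon as held crosses the barrier.
++ */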
++
++/*
++ * UB_OTHERSOCKBUF and UB_TCPSNDBUF
++ */
++
++static void ub_socksndbuf_uncharge(struct sk_buff *skb)
++{
++	unsigned long flags;
++	struct user_beancounter *ub, *cub;
++	unsigned long chargesize;
++
++	cub = skb_bc(skb)->ub;
++	ub = top_beancounter(cub);
++	chargesize = skb_bc(skb)->charged;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize);
++	if (skb->sk != NULL && sock_has_ubc(skb->sk))
++		ub_sock_wcharge_dec(skb->sk, chargesize);
++	ub_sock_snd_wakeup(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize);
++	ub_skb_set_uncharge(skb);
++}
++
++/* expected to be called under socket lock */
++static void ub_tcpsndbuf_uncharge(struct sk_buff *skb)
++{
++	/*
++	 * ub_sock_ret_wreserv call is abused here, we just want to uncharge
++	 * skb size.  However, to reduce duplication of the code doing
++	 * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call
++	 * a function that already does all of this.  2006/04/27  SAW
++	 */
++	ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged,
++			sock_bc(skb->sk)->poll_reserv);
++	ub_skb_set_uncharge(skb);
++}
++
++void ub_skb_uncharge(struct sk_buff *skb)
++{
++	switch (skb_bc(skb)->resource) {
++		case UB_TCPSNDBUF:
++			ub_tcpsndbuf_uncharge(skb);
++			break;
++		case UB_TCPRCVBUF:
++			ub_tcprcvbuf_uncharge(skb);
++			break;
++		case UB_DGRAMRCVBUF:
++			ub_sockrcvbuf_uncharge(skb);
++			break;
++		case UB_OTHERSOCKBUF:
++			ub_socksndbuf_uncharge(skb);
++			break;
++	}
++}
++
++EXPORT_SYMBOL(ub_skb_uncharge);	/* due to skb_orphan()/conntracks */
++
++/*
++ * Other sock reserve management
++ */
++
++int ub_sock_getwres_other(struct sock *sk, unsigned long size)
++{
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long flags;
++	unsigned long added_reserv;
++	int err;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	/*
++	 * Nothing except beancounter lock protects skbc->poll_reserv.
++	 * So, take the lock and do the job.
++	 * Dances with added_reserv repeat ub_sock_make_wreserv.
++	 */
++	skbc = sock_bc(sk);
++	ub = top_beancounter(skbc->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	added_reserv = -skbc->poll_reserv;
++	err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size);
++	added_reserv += skbc->poll_reserv;
++	if (!err)
++		skbc->poll_reserv -= size;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	if (added_reserv)
++		charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv);
++
++	return err;
++}
++EXPORT_SYMBOL(ub_sock_getwres_other);
++
++void ub_sock_retwres_other(struct sock *sk,
++		unsigned long size, unsigned long ressize)
++{
++	if (unlikely(!sock_has_ubc(sk)))
++		return;
++
++	ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize);
++}
++
++/*
++ * TCP send buffer accounting.  Paged part
++ */
++
++int ub_sock_tcp_chargepage(struct sock *sk)
++{
++	struct sock_beancounter *skbc;
++	unsigned long extra;
++	int err;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	skbc = sock_bc(sk);
++	ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE);
++	if (likely(skbc->poll_reserv >= PAGE_SIZE)) {
++		skbc->poll_reserv -= PAGE_SIZE;
++		return 0;
++	}
++
++	/*
++	 * Ok, full page is not available.
++	 * However, this function must succeed if poll previously indicated
++	 * that write is possible.  It is better to make a forced charge
++	 * here than to reserve a whole page in poll.
++	 */
++	err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE);
++	if (unlikely(err < 0))
++		goto out;
++	if (skbc->poll_reserv < PAGE_SIZE) {
++		extra = PAGE_SIZE - skbc->poll_reserv;
++		err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra,
++				UB_FORCE);
++		if (err < 0)
++			goto out;
++		skbc->poll_reserv += extra;
++	}
++	skbc->poll_reserv -= PAGE_SIZE;
++	return 0;
++
++out:
++	return err;
++}
++
++void ub_sock_tcp_detachpage(struct sock *sk)
++{
++	struct sk_buff *skb;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return;
++
++	/* The page has just been detached from the socket.  The last skb in
++	   the queue with a paged part holds a reference to it */
++	skb = skb_peek_tail(&sk->sk_write_queue);
++	if (skb == NULL) {
++		/* If the queue is empty, all data has been sent and the page
++		   is about to be freed */
++		ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE,
++				sock_bc(sk)->poll_reserv);
++	} else {
++		/* The last skb is a good approximation for the last skb with
++		   a paged part */
++		skb_bc(skb)->charged += PAGE_SIZE;
++	}
++}
++
++/*
++ * TCPSNDBUF charge functions below are called in the following cases:
++ *  - sending of SYN, SYN-ACK, FIN, the latter charge is forced by
++ *    some technical reasons in TCP code;
++ *  - fragmentation of TCP packets.
++ * These functions are allowed but not required to use poll_reserv.
++ * Originally they didn't, since it didn't make any sense.  Now that
++ * poll_reserv serves as a general reserve, they use it.
++ */
++int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb,
++			    enum ub_severity strict)
++{
++	int ret;
++	unsigned long chargesize;
++	struct sock_beancounter *skbc;
++	struct user_beancounter *ub;
++	unsigned long flags;
++
++	if (unlikely(!sock_has_ubc(sk)))
++		return 0;
++
++	skbc = sock_bc(sk);
++	chargesize = skb_charge_fullsize(skb);
++	if (likely(skbc->poll_reserv >= chargesize)) {
++		skbc->poll_reserv -= chargesize;
++		__ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
++		/* XXX hack, see ub_skb_set_charge */
++		skb->sk = sk;
++		return 0;
++	}
++
++	ub = top_beancounter(skbc->ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF,
++			chargesize, strict);
++	/*
++	 * Note: this check is not equivalent of the corresponding check
++	 * in makewreserv.  It's similar in spirit, but an equivalent check
++	 * would be too long and complicated here.
++	 */
++	if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF))
++		skbc->ub_wcharged += chargesize;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	if (likely(!ret)) {
++		charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize);
++		ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
++	}
++	return ret;
++}
++EXPORT_SYMBOL(ub_sock_tcp_chargesend);
++
++void ub_sock_tcp_unchargesend(struct sock *sk, unsigned long size)
++{
++	if (unlikely(!sock_has_ubc(sk)))
++		return;
++	/* see ub_tcpsndbuf_uncharge */
++	ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, sock_bc(sk)->poll_reserv);
++}
++
++/*
++ * Initialization
++ */
++
++int __init skbc_cache_init(void)
++{
++	return 0;
++}
+diff --git a/kernel/bc/oom_kill.c b/kernel/bc/oom_kill.c
+new file mode 100644
+index 0000000..c79e826
+--- /dev/null
++++ b/kernel/bc/oom_kill.c
+@@ -0,0 +1,200 @@
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/swap.h>
++#include <linux/cpuset.h>
++#include <linux/module.h>
++
++#include <bc/beancounter.h>
++#include <bc/oom_kill.h>
++#include <bc/hash.h>
++
++#define UB_OOM_TIMEOUT	(5 * HZ)
++
++int oom_generation;
++int oom_kill_counter;
++static DEFINE_SPINLOCK(oom_lock);
++static DECLARE_WAIT_QUEUE_HEAD(oom_wq);
++
++static inline int ub_oom_completed(struct task_struct *tsk)
++{
++	if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
++		/* we were oom killed - just die */
++		return 1;
++	if (tsk->task_bc.oom_generation != oom_generation)
++		/* some task was successfully killed */
++		return 1;
++	return 0;
++}
++
++static void ub_clear_oom(void)
++{
++	struct user_beancounter *ub;
++
++	rcu_read_lock();
++	for_each_beancounter(ub)
++		ub->ub_oom_noproc = 0;
++	rcu_read_unlock();
++}
++
++/* Called with cpuset_lock held */
++int ub_oom_lock(void)
++{
++	int timeout;
++	DEFINE_WAIT(oom_w);
++	struct task_struct *tsk;
++
++	tsk = current;
++
++	spin_lock(&oom_lock);
++	if (!oom_kill_counter)
++		goto out_do_oom;
++
++	timeout = UB_OOM_TIMEOUT;
++	while (1) {
++		if (ub_oom_completed(tsk)) {
++			spin_unlock(&oom_lock);
++			return -EINVAL;
++		}
++
++		if (timeout == 0)
++			break;
++
++		__set_current_state(TASK_UNINTERRUPTIBLE);
++		add_wait_queue(&oom_wq, &oom_w);
++		spin_unlock(&oom_lock);
++		cpuset_unlock();
++
++		timeout = schedule_timeout(timeout);
++
++		cpuset_lock();
++		spin_lock(&oom_lock);
++		remove_wait_queue(&oom_wq, &oom_w);
++	}
++
++out_do_oom:
++	ub_clear_oom();
++	return 0;
++}
++
++static inline long ub_current_overdraft(struct user_beancounter *ub)
++{
++	return ub->ub_parms[UB_OOMGUARPAGES].held +
++		((ub->ub_parms[UB_KMEMSIZE].held
++		  + ub->ub_parms[UB_TCPSNDBUF].held
++		  + ub->ub_parms[UB_TCPRCVBUF].held
++		  + ub->ub_parms[UB_OTHERSOCKBUF].held
++		  + ub->ub_parms[UB_DGRAMRCVBUF].held)
++		 >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier;
++}
++
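++/*
++ * Worked example (illustrative numbers, 4 KB pages): with
++ * OOMGUARPAGES held = 1000 and barrier = 2000, and kmemsize plus the
++ * four socket buffer resources totalling 8 MB (2048 pages), the
++ * overdraft is 1000 + 2048 - 2000 = 1048 pages.  ub_oom_select_worst()
++ * below picks the top-level beancounter with the largest positive
++ * overdraft as the victim.
++ */
++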
++int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk)
++{
++	struct user_beancounter *mm_ub;
++
++	if (ub == NULL)
++		return 0;
++
++	task_lock(tsk);
++	if (tsk->mm == NULL)
++		mm_ub = NULL;
++	else
++		mm_ub = tsk->mm->mm_ub;
++
++	while (mm_ub != NULL && mm_ub != ub)
++		mm_ub = mm_ub->parent;
++	task_unlock(tsk);
++
++	return mm_ub != ub;
++}
++
++struct user_beancounter *ub_oom_select_worst(void)
++{
++	struct user_beancounter *ub, *walkp;
++	long ub_maxover;
++
++	ub_maxover = 0;
++	ub = NULL;
++
++	rcu_read_lock();
++	for_each_beancounter (walkp) {
++		long ub_overdraft;
++
++		if (walkp->parent != NULL)
++			continue;
++		if (walkp->ub_oom_noproc)
++			continue;
++
++		ub_overdraft = ub_current_overdraft(walkp);
++		if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) {
++			put_beancounter(ub);
++			ub = walkp;
++			ub_maxover = ub_overdraft;
++		}
++	}
++
++	if (ub)
++		ub->ub_oom_noproc = 1;
++	rcu_read_unlock();
++
++	return ub;
++}
++
++void ub_oom_mm_killed(struct user_beancounter *ub)
++{
++	static struct ub_rate_info ri = { 5, 60*HZ };
++
++	/* increment is serialized with oom_lock */
++	ub->ub_parms[UB_OOMGUARPAGES].failcnt++;
++
++	if (ub_ratelimit(&ri))
++		show_mem();
++}
++
++void ub_oom_unlock(void)
++{
++	spin_unlock(&oom_lock);
++}
++
++void ub_oom_task_dead(struct task_struct *tsk)
++{
++	spin_lock(&oom_lock);
++	oom_kill_counter = 0;
++	oom_generation++;
++
++	printk("OOM killed process %s (pid=%d, ve=%d) exited, "
++			"free=%lu gen=%d.\n",
++			tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env),
++			nr_free_pages(), oom_generation);
++	/* if there is sleep time left in ub_oom_lock, the sleep will continue */
++	wake_up_all(&oom_wq);
++	spin_unlock(&oom_lock);
++}
++
++void ub_out_of_memory(struct user_beancounter *scope)
++{
++	struct user_beancounter *ub;
++	struct task_struct *p;
++
++	cpuset_lock();
++	spin_lock(&oom_lock);
++	ub_clear_oom();
++	ub = get_beancounter(scope);
++
++	read_lock(&tasklist_lock);
++retry:
++	p = select_bad_process(ub, NULL);
++	if (p == NULL || PTR_ERR(p) == -1UL)
++		goto unlock;
++
++	if (oom_kill_process(p, (gfp_t)-1, -1, NULL, "UB Out of memory"))
++		goto retry;
++
++	put_beancounter(ub);
++
++unlock:
++	read_unlock(&tasklist_lock);
++	spin_unlock(&oom_lock);
++	cpuset_unlock();
++}
++EXPORT_SYMBOL(ub_out_of_memory);
+diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c
+new file mode 100644
+index 0000000..5b1ae4b
+--- /dev/null
++++ b/kernel/bc/proc.c
+@@ -0,0 +1,682 @@
++/*
++ *  kernel/bc/proc.c 
++ *
++ *  Copyright (C) 2006 OpenVZ. SWsoft Inc.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/init.h>
++#include <linux/module.h>
++
++#include <bc/beancounter.h>
++#include <bc/hash.h>
++#include <bc/rss_pages.h>
++#include <bc/proc.h>
++
++/* Generic output formats */
++#if BITS_PER_LONG == 32
++const char *bc_proc_lu_fmt = "\t%-20s %10lu\n";
++const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
++const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
++const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n";
++#else
++const char *bc_proc_lu_fmt = "\t%-20s %21lu\n";
++const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
++const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
++const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n";
++#endif
++
++#if BITS_PER_LONG == 32
++static const char *head_fmt = "%10s  %-12s %10s %10s %10s %10s %10s\n";
++static const char *res_fmt = "%10s  %-12s %10lu %10lu %10lu %10lu %10lu\n";
++#else
++static const char *head_fmt = "%10s  %-12s %20s %20s %20s %20s %20s\n";
++static const char *res_fmt = "%10s  %-12s %20lu %20lu %20lu %20lu %20lu\n";
++#endif
++
++static void ub_show_res(struct seq_file *f, struct user_beancounter *ub,
++		int r, int show_uid)
++{
++	int len;
++	char ub_uid[64];
++
++	if (show_uid && r == 0) {
++		len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2);
++		ub_uid[len] = ':';
++		ub_uid[len + 1] = '\0';
++	} else
++		strcpy(ub_uid, "");
++
++	seq_printf(f, res_fmt, ub_uid, ub_rnames[r],
++			ub->ub_parms[r].held,
++			ub->ub_parms[r].maxheld,
++			ub->ub_parms[r].barrier,
++			ub->ub_parms[r].limit,
++			ub->ub_parms[r].failcnt);
++}
++
++static void __show_resources(struct seq_file *f, struct user_beancounter *ub,
++		int show_uid)
++{
++	int i;
++
++	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
++		if (strcmp(ub_rnames[i], "dummy") != 0)
++			ub_show_res(f, ub, i, show_uid);
++
++	for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++)
++		ub_show_res(f, ub, i, show_uid);
++}
++
++static int bc_resources_show(struct seq_file *f, void *v)
++{
++	__show_resources(f, seq_beancounter(f), 0);
++	return 0;
++}
++
++static struct bc_proc_entry bc_resources_entry = {
++	.name = "resources",
++	.u.show = bc_resources_show,
++};
++
++#ifdef CONFIG_UBC_DEBUG
++static int bc_debug_show(struct seq_file *f, void *v)
++{
++	struct user_beancounter *ub;
++	char buf[64];
++
++	ub = seq_beancounter(f);
++	print_ub_uid(ub, buf, sizeof(buf));
++	seq_printf(f, "uid: %s\n", buf);
++	seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount));
++
++	seq_printf(f, "bc: %p\n", ub);
++	seq_printf(f, "par: %p\n", ub->parent);
++	seq_printf(f, "priv: %p\n", ub->private_data);
++	return 0;
++}
++
++static struct bc_proc_entry bc_debug_entry = {
++	.name = "debug",
++	.u.show = bc_debug_show,
++};
++#endif
++
++static int ub_show(struct seq_file *f, void *v)
++{
++	int i;
++
++	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
++		ub_show_res(f, (struct user_beancounter *)v, i, 1);
++	return 0;
++}
++
++static int res_show(struct seq_file *f, void *v)
++{
++	__show_resources(f, (struct user_beancounter *)v, 1);
++	return 0;
++}
++
++static int ub_accessible(struct user_beancounter *exec,
++		struct user_beancounter *target)
++{
++	struct user_beancounter *p, *q;
++
++	p = top_beancounter(exec);
++	q = top_beancounter(target);
++
++	return (p == get_ub0() || p == q);
++}
++
++static void ub_show_header(struct seq_file *f)
++{
++	seq_printf(f, "Version: 2.5\n");
++	seq_printf(f, head_fmt, "uid", "resource",
++			"held", "maxheld", "barrier", "limit", "failcnt");
++}
++
++static void *ub_start(struct seq_file *f, loff_t *ppos)
++{
++	struct user_beancounter *ub;
++	struct user_beancounter *exec_ub; 
++	unsigned long pos;
++
++	pos = *ppos;
++	if (pos == 0)
++		ub_show_header(f);
++
++	exec_ub = get_exec_ub();
++
++	rcu_read_lock();
++	for_each_beancounter(ub) {
++		if (ub->parent != NULL)
++			continue;
++		if (!ub_accessible(exec_ub, ub))
++			continue;
++		if (pos-- == 0)
++			return ub;
++	}
++	return NULL;
++}
++
++static void *ub_next(struct seq_file *f, void *v, loff_t *ppos)
++{
++	struct user_beancounter *ub;
++	struct list_head *entry;
++	struct user_beancounter *exec_ub;
++
++	exec_ub = get_exec_ub();
++	ub = (struct user_beancounter *)v;
++
++	entry = &ub->ub_list;
++
++	list_for_each_continue_rcu(entry, &ub_list_head) {
++		ub = list_entry(entry, struct user_beancounter, ub_list);
++		if (ub->parent != NULL)
++			continue;
++		if (!ub_accessible(exec_ub, ub))
++			continue;
++
++		(*ppos)++;
++		return ub;
++	}
++	return NULL;
++}
++
++static void ub_stop(struct seq_file *f, void *v)
++{
++	rcu_read_unlock();
++}
++
++static struct seq_operations ub_seq_ops = {
++	.start = ub_start,
++	.next  = ub_next,
++	.stop  = ub_stop,
++	.show  = ub_show,
++};
++
++static int ub_open(struct inode *inode, struct file *filp)
++{
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return -EACCES;
++
++	return seq_open(filp, &ub_seq_ops);
++}
++
++static struct file_operations ub_file_operations = {
++	.open		= ub_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++
++static struct seq_operations res_seq_ops = {
++	.start = ub_start,
++	.next  = ub_next,
++	.stop  = ub_stop,
++	.show  = res_show,
++};
++
++static int res_open(struct inode *inode, struct file *filp)
++{
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return -EACCES;
++
++	return seq_open(filp, &res_seq_ops);
++}
++
++static struct file_operations resources_operations = {
++	.open		= res_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++
++static struct bc_proc_entry bc_all_resources_entry = {
++	.name = "resources",
++	.u.fops = &resources_operations,
++};
++
++/*
++ * Generic showing stuff
++ */
++
++static int cookies, num_entries;
++static struct bc_proc_entry *bc_entries __read_mostly;
++static struct bc_proc_entry *bc_root_entries __read_mostly;
++static DEFINE_SPINLOCK(bc_entries_lock);
++static struct proc_dir_entry *bc_proc_root;
++
++void bc_register_proc_entry(struct bc_proc_entry *e)
++{
++	spin_lock(&bc_entries_lock);
++	e->cookie = ++cookies;
++	e->next = bc_entries;
++	bc_entries = e;
++	num_entries++;
++	spin_unlock(&bc_entries_lock);
++}
++
++EXPORT_SYMBOL(bc_register_proc_entry);
++
++void bc_register_proc_root_entry(struct bc_proc_entry *e)
++{
++	spin_lock(&bc_entries_lock);
++	e->cookie = ++cookies;
++	e->next = bc_root_entries;
++	bc_root_entries = e;
++	bc_proc_root->nlink++;
++	spin_unlock(&bc_entries_lock);
++}
++
++EXPORT_SYMBOL(bc_register_proc_root_entry);
++
++/*
++ * small helpers
++ */
++
++static inline unsigned long bc_make_ino(struct user_beancounter *ub)
++{
++	unsigned long ret;
++
++	ret = 0xbc000000;
++	if (ub->parent)
++		ret |= ((ub->parent->ub_uid) << 4);
++	ret |= (ub->ub_uid + 1);
++	return ret;
++}
++
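++/*
++ * Example (illustrative): a top-level beancounter with uid 101 gets
++ * ino 0xbc000000 | (101 + 1) = 0xbc000066, and its sub-beancounter
++ * with uid 1 gets 0xbc000000 | (101 << 4) | (1 + 1) = 0xbc000652.
++ * Note the scheme only keeps inodes unique while uids are small enough
++ * not to overlap in the low bits.
++ */
++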
++static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de)
++{
++	return 0xbe000000 + de->cookie;
++}
++
++static int bc_d_delete(struct dentry *d)
++{
++	return 1;
++}
++
++static void bc_d_release(struct dentry *d)
++{
++	put_beancounter((struct user_beancounter *)d->d_fsdata);
++}
++
++static struct inode_operations bc_entry_iops;
++static struct file_operations bc_entry_fops;
++static struct dentry_operations bc_dentry_ops = {
++	.d_delete = bc_d_delete,
++	.d_release = bc_d_release,
++};
++
++/*
++ * common directory operations' helpers
++ */
++
++static int bc_readdir(struct file *file, filldir_t filler, void *data,
++		struct user_beancounter *parent)
++{
++	int err = 0;
++	loff_t pos, filled;
++	struct user_beancounter *ub, *prev;
++	struct bc_proc_entry *pde;
++
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return -EPERM;
++
++	pos = file->f_pos;
++	if (pos == 0) {
++		err = (*filler)(data, ".", 1, pos,
++				file->f_dentry->d_inode->i_ino, DT_DIR);
++		if (err < 0) {
++			err = 0;
++			goto out;
++		}
++		pos++;
++	}
++
++	if (pos == 1) {
++		err = (*filler)(data, "..", 2, pos,
++				parent_ino(file->f_dentry), DT_DIR);
++		if (err < 0) {
++			err = 0;
++			goto out;
++		}
++		pos++;
++	}
++
++	filled = 2;
++	for (pde = (parent == NULL ? bc_root_entries : bc_entries);
++			pde != NULL; pde = pde->next) {
++		if (filled++ < pos)
++			continue;
++
++		err = (*filler)(data, pde->name, strlen(pde->name), pos,
++				bc_make_file_ino(pde), DT_REG);
++		if (err < 0) {
++			err = 0;
++			goto out;
++		}
++		pos++;
++	}
++
++	rcu_read_lock();
++	prev = NULL;
++	ub = list_entry(&ub_list_head, struct user_beancounter, ub_list);
++	while (1) {
++		int len;
++		unsigned long ino;
++		char buf[64];
++
++		ub = list_entry(rcu_dereference(ub->ub_list.next),
++				struct user_beancounter, ub_list);
++		if (&ub->ub_list == &ub_list_head)
++			break;
++
++		if (ub->parent != parent)
++			continue;
++
++		if (filled++ < pos)
++			continue;
++
++		if (!get_beancounter_rcu(ub))
++			continue;
++
++		rcu_read_unlock();
++		put_beancounter(prev);
++
++		len = print_ub_uid(ub, buf, sizeof(buf));
++		ino = bc_make_ino(ub);
++
++		err = (*filler)(data, buf, len, pos, ino, DT_DIR);
++		if (err < 0) {
++			err = 0;
++			put_beancounter(ub);
++			goto out;
++		}
++
++		rcu_read_lock();
++		prev = ub;
++		pos++;
++	}
++	rcu_read_unlock();
++	put_beancounter(prev);
++out:
++	file->f_pos = pos;
++	return err;
++}
++
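++/*
++ * Resulting layout (a hedged illustration): at the /proc/bc root,
++ * bc_readdir() emits ".", "..", the registered root entries (e.g.
++ * "resources"), and then one directory per matching beancounter, so a
++ * tree may look like:
++ *
++ *	/proc/bc/resources
++ *	/proc/bc/0/resources
++ *	/proc/bc/101/resources
++ *	/proc/bc/101/101.1/resources
++ */
++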
++static int bc_looktest(struct inode *ino, void *data)
++{
++	return ino->i_op == &bc_entry_iops && ino->i_private == data;
++}
++
++static int bc_lookset(struct inode *ino, void *data)
++{
++	struct user_beancounter *ub;
++
++	ub = (struct user_beancounter *)data;
++	ino->i_private = data;
++	ino->i_ino = bc_make_ino(ub);
++	ino->i_fop = &bc_entry_fops;
++	ino->i_op = &bc_entry_iops;
++	ino->i_mode = S_IFDIR | S_IRUSR | S_IXUGO;
++	/* subbeancounters are not included, but who cares? */
++	ino->i_nlink = num_entries + 2;
++	ino->i_gid = 0;
++	ino->i_uid = 0;
++	return 0;
++}
++
++static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir,
++		struct dentry *dentry)
++{
++	struct inode *ino;
++
++	ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub);
++	if (ino == NULL)
++		goto out_put;
++
++	unlock_new_inode(ino);
++	dentry->d_op = &bc_dentry_ops;
++	dentry->d_fsdata = ub;
++	d_add(dentry, ino);
++	return NULL;
++
++out_put:
++	put_beancounter(ub);
++	return ERR_PTR(-ENOENT);
++}
++
++/*
++ * files (bc_proc_entry) manipulations
++ */
++
++static struct dentry *bc_lookup_file(struct inode *dir,
++		struct dentry *dentry, struct bc_proc_entry *root,
++		int (*test)(struct inode *, void *),
++		int (*set)(struct inode *, void *))
++{
++	struct bc_proc_entry *pde;
++	struct inode *ino;
++
++	for (pde = root; pde != NULL; pde = pde->next)
++		if (strcmp(pde->name, dentry->d_name.name) == 0)
++			break;
++
++	if (pde == NULL)
++		return ERR_PTR(-ESRCH);
++
++	ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde);
++	if (ino == NULL)
++		return ERR_PTR(-ENOENT);
++
++	unlock_new_inode(ino);
++	dentry->d_op = &bc_dentry_ops;
++	d_add(dentry, ino);
++	return NULL;
++}
++
++static int bc_file_open(struct inode *ino, struct file *filp)
++{
++	struct bc_proc_entry *de;
++	struct user_beancounter *ub;
++
++	de = (struct bc_proc_entry *)ino->i_private;
++	ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata;
++	BUG_ON(ub->ub_magic != UB_MAGIC);
++
++	/*
++	 * ub can't disappear: we hold d_parent, which holds the beancounter
++	 */
++	return single_open(filp, de->u.show, ub);
++}
++
++static struct file_operations bc_file_ops = {
++	.open		= bc_file_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= single_release,
++};
++
++static int bc_looktest_entry(struct inode *ino, void *data)
++{
++	return ino->i_fop == &bc_file_ops && ino->i_private == data;
++}
++
++static int bc_lookset_entry(struct inode *ino, void *data)
++{
++	struct bc_proc_entry *de;
++
++	de = (struct bc_proc_entry *)data;
++	ino->i_private = data;
++	ino->i_ino = bc_make_file_ino(de);
++	ino->i_fop = &bc_file_ops;
++	ino->i_mode = S_IFREG | S_IRUSR;
++	ino->i_nlink = 1;
++	ino->i_gid = 0;
++	ino->i_uid = 0;
++	return 0;
++}
++
++static inline struct dentry *bc_lookup_files(struct inode *dir,
++		struct dentry *de)
++{
++	return bc_lookup_file(dir, de, bc_entries,
++			bc_looktest_entry, bc_lookset_entry);
++}
++
++static int bc_looktest_root_entry(struct inode *ino, void *data)
++{
++	struct bc_proc_entry *de;
++
++	de = (struct bc_proc_entry *)data;
++	return ino->i_fop == de->u.fops && ino->i_private == data;
++}
++
++static int bc_lookset_root_entry(struct inode *ino, void *data)
++{
++	struct bc_proc_entry *de;
++
++	de = (struct bc_proc_entry *)data;
++	ino->i_private = data;
++	ino->i_ino = bc_make_file_ino(de);
++	ino->i_fop = de->u.fops;
++	ino->i_mode = S_IFREG | S_IRUSR;
++	ino->i_nlink = 1;
++	ino->i_gid = 0;
++	ino->i_uid = 0;
++	return 0;
++}
++
++static inline struct dentry *bc_lookup_root_files(struct inode *dir,
++		struct dentry *de)
++{
++	return bc_lookup_file(dir, de, bc_root_entries,
++			bc_looktest_root_entry, bc_lookset_root_entry);
++}
++
++/*
++ * /proc/bc/.../<id> directory operations
++ */
++
++static int bc_entry_readdir(struct file *file, void *data, filldir_t filler)
++{
++	return bc_readdir(file, filler, data,
++			(struct user_beancounter *)file->f_dentry->d_fsdata);
++}
++
++static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry,
++		struct nameidata *nd)
++{
++	int id;
++	char *end;
++	struct user_beancounter *par, *ub;
++	struct dentry *de;
++
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return ERR_PTR(-EPERM);
++
++	de = bc_lookup_files(dir, dentry);
++	if (de != ERR_PTR(-ESRCH))
++		return de;
++
++	id = simple_strtol(dentry->d_name.name, &end, 10);
++	if (*end != '.')
++		return ERR_PTR(-ENOENT);
++
++	par = (struct user_beancounter *)dir->i_private;
++	if (par->ub_uid != id)
++		return ERR_PTR(-ENOENT);
++
++	id = simple_strtol(end + 1, &end, 10);
++	if (*end != '\0')
++		return ERR_PTR(-ENOENT);
++
++	ub = get_subbeancounter_byid(par, id, 0);
++	if (ub == NULL)
++		return ERR_PTR(-ENOENT);
++
++	return bc_lookup(ub, dir, dentry);
++}
++
++static struct file_operations bc_entry_fops = {
++	.read = generic_read_dir,
++	.readdir = bc_entry_readdir,
++};
++
++static struct inode_operations bc_entry_iops = {
++	.lookup = bc_entry_lookup,
++};
++
++/*
++ * /proc/bc directory operations
++ */
++
++static int bc_root_readdir(struct file *file, void *data, filldir_t filler)
++{
++	return bc_readdir(file, filler, data, NULL);
++}
++
++static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry,
++		struct nameidata *nd)
++{
++	int id;
++	char *end;
++	struct user_beancounter *ub;
++	struct dentry *de;
++
++	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
++		return ERR_PTR(-EPERM);
++
++	de = bc_lookup_root_files(dir, dentry);
++	if (de != ERR_PTR(-ESRCH))
++		return de;
++
++	id = simple_strtol(dentry->d_name.name, &end, 10);
++	if (*end != '\0')
++		return ERR_PTR(-ENOENT);
++
++	ub = get_beancounter_byuid(id, 0);
++	if (ub == NULL)
++		return ERR_PTR(-ENOENT);
++
++	return bc_lookup(ub, dir, dentry);
++}
++
++static struct file_operations bc_root_fops = {
++	.read = generic_read_dir,
++	.readdir = bc_root_readdir,
++};
++
++static struct inode_operations bc_root_iops = {
++	.lookup = bc_root_lookup,
++};
++
++static int __init ub_init_proc(void)
++{
++	struct proc_dir_entry *entry;
++
++	bc_proc_root = create_proc_entry("bc",
++			S_IFDIR | S_IRUGO | S_IXUGO, NULL);
++	if (bc_proc_root == NULL)
++		panic("Can't create /proc/bc entry");
++
++	bc_proc_root->proc_fops = &bc_root_fops;
++	bc_proc_root->proc_iops = &bc_root_iops;
++
++	bc_register_proc_entry(&bc_resources_entry);
++#ifdef CONFIG_UBC_DEBUG
++	bc_register_proc_entry(&bc_debug_entry);
++#endif
++	bc_register_proc_root_entry(&bc_all_resources_entry);
++
++	entry = proc_create("user_beancounters",
++			S_IRUGO, &glob_proc_root, &ub_file_operations);
++	return 0;
++}
++
++core_initcall(ub_init_proc);
+diff --git a/kernel/bc/rss_pages.c b/kernel/bc/rss_pages.c
+new file mode 100644
+index 0000000..391585e
+--- /dev/null
++++ b/kernel/bc/rss_pages.c
+@@ -0,0 +1,437 @@
++/*
++ *  kernel/bc/rss_pages.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/spinlock.h>
++#include <linux/slab.h>
++#include <linux/mm.h>
++#include <linux/gfp.h>
++#include <linux/vmalloc.h>
++
++#include <bc/beancounter.h>
++#include <bc/hash.h>
++#include <bc/vmpages.h>
++#include <bc/rss_pages.h>
++#include <bc/io_acct.h>
++
++static struct kmem_cache *pb_cachep;
++spinlock_t pb_lock = SPIN_LOCK_UNLOCKED;
++static struct page_beancounter **pb_hash_table;
++static unsigned int pb_hash_mask;
++
++/*
++ * Auxiliary stuff
++ */
++
++static inline struct page_beancounter *next_page_pb(struct page_beancounter *p)
++{
++	return list_entry(p->page_list.next, struct page_beancounter,
++			page_list);
++}
++
++static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p)
++{
++	return list_entry(p->page_list.prev, struct page_beancounter,
++			page_list);
++}
++
++/*
++ * Held pages manipulation
++ */
++static inline void set_held_pages(struct user_beancounter *bc)
++{
++	/* all three depend on ub_held_pages */
++	__ub_update_physpages(bc);
++	__ub_update_oomguarpages(bc);
++	__ub_update_privvm(bc);
++}
++
++static inline void do_dec_held_pages(struct user_beancounter *ub, int value)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_held_pages -= value;
++	set_held_pages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static void dec_held_pages(struct user_beancounter *ub, int value)
++{
++	for (; ub != NULL; ub = ub->parent)
++		do_dec_held_pages(ub, value);
++}
++
++static inline void do_inc_held_pages(struct user_beancounter *ub, int value)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_held_pages += value;
++	set_held_pages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static void inc_held_pages(struct user_beancounter *ub, int value)
++{
++	for (; ub != NULL; ub = ub->parent)
++		do_inc_held_pages(ub, value);
++}
++
++/*
++ * Alloc - free
++ */
++
++inline int pb_alloc(struct page_beancounter **pbc)
++{
++	*pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL);
++	if (*pbc != NULL) {
++		(*pbc)->next_hash = NULL;
++		(*pbc)->pb_magic = PB_MAGIC;
++	}
++	return (*pbc == NULL);
++}
++
++inline void pb_free(struct page_beancounter **pb)
++{
++	if (*pb != NULL) {
++		kmem_cache_free(pb_cachep, *pb);
++		*pb = NULL;
++	}
++}
++
++void pb_free_list(struct page_beancounter **p_pb)
++{
++	struct page_beancounter *list, *pb;
++	
++	list = *p_pb;
++	if (list == PBC_COPY_SAME)
++		return;
++
++	while (list) {
++		pb = list;
++		list = list->next_hash;
++		pb_free(&pb);
++	}
++	*p_pb = NULL;
++}
++
++/*
++ * head -> <new objs> -> <old objs> -> ...
++ */
++static int __alloc_list(struct page_beancounter **head, int num)
++{
++	struct page_beancounter *pb;
++
++	while (num > 0) {
++		if (pb_alloc(&pb))
++			return -1;
++		pb->next_hash = *head;
++		*head = pb;
++		num--;
++	}
++
++	return num;
++}
++
++/* 
++ * Ensure that the list contains at least num elements.
++ * p_pb points to an initialized list, which may be of zero length.
++ *
++ * mm->page_table_lock should be held
++ */
++int pb_alloc_list(struct page_beancounter **p_pb, int num)
++{
++	struct page_beancounter *list;
++
++	for (list = *p_pb; list != NULL && num; list = list->next_hash, num--);
++	if (!num)
++		return 0;
++
++	/*
++	 *  *p_pb(after)       *p_pb (before)
++	 *     \                  \
++	 *     <new objs> -...-> <old objs> -> ...
++	 */
++	if (__alloc_list(p_pb, num) < 0)
++		goto nomem;
++	return 0;
++
++nomem:
++	pb_free_list(p_pb);
++	return -ENOMEM;
++}
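
The top-up logic of pb_alloc_list() is easier to see outside the kernel. A minimal userspace model, with plain malloc and a bare struct node standing in for the kmem_cache-backed page_beancounter (names here are illustrative, not from the patch):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

/* prepend 'num' fresh nodes, as __alloc_list() does */
static int alloc_list(struct node **head, int num)
{
	while (num > 0) {
		struct node *n = malloc(sizeof(*n));
		if (n == NULL)
			return -1;
		n->next = *head;
		*head = n;
		num--;
	}
	return 0;
}

/* ensure the list holds at least 'num' nodes, as pb_alloc_list() does */
static int alloc_list_atleast(struct node **head, int num)
{
	struct node *l;

	for (l = *head; l != NULL && num; l = l->next, num--)
		;
	return num ? alloc_list(head, num) : 0;
}

int main(void)
{
	struct node *list = NULL;
	int n = 0;

	alloc_list_atleast(&list, 3);	/* allocates three nodes */
	alloc_list_atleast(&list, 5);	/* walks the three, tops up with two */
	for (struct node *l = list; l; l = l->next)
		n++;
	printf("list length: %d\n", n);	/* prints 5 */
	return 0;
}
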
++
++/*
++ * Allocates a page_beancounter for each
++ * user_beancounter in a hash
++ */
++int pb_alloc_all(struct page_beancounter **pbs)
++{
++	int need_alloc;
++	struct user_beancounter *ub;
++
++	need_alloc = 0;
++	rcu_read_lock();
++	for_each_beancounter(ub)
++		need_alloc++;
++	rcu_read_unlock();
++
++	if (!__alloc_list(pbs, need_alloc))
++		return 0;
++
++	pb_free_list(pbs);
++	return -ENOMEM;
++}
++
++/*
++ * Hash routines
++ */
++
++static inline int pb_hash(struct user_beancounter *ub, struct page *page)
++{
++	return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask;
++}
++
++/* pb_lock should be held */
++static inline void insert_pb(struct page_beancounter *p, struct page *page,
++		struct user_beancounter *ub, int hash)
++{
++	p->page = page;
++	p->ub = get_beancounter(ub);
++	p->next_hash = pb_hash_table[hash];
++	pb_hash_table[hash] = p;
++	inc_pbc_count(ub);
++}
++
++/*
++ * Heart
++ */
++
++static int __pb_dup_ref(struct page *page, struct user_beancounter *bc,
++		int hash)
++{
++	struct page_beancounter *p;
++
++	for (p = pb_hash_table[hash];
++			p != NULL && (p->page != page || p->ub != bc);
++			p = p->next_hash);
++	if (p == NULL)
++		return -1;
++
++	PB_COUNT_INC(p->refcount);
++	return 0;
++}
++
++static void __pb_add_ref(struct page *page, struct user_beancounter *bc,
++		struct page_beancounter **ppb, int hash)
++{
++	struct page_beancounter *head, *p, **hp;
++	int shift;
++
++	p = *ppb;
++	*ppb = p->next_hash;
++
++	insert_pb(p, page, bc, hash);
++	hp = page_pblist(page);
++	head = *hp;
++
++	if (head != NULL) {
++		/* 
++		 * Move the first element to the end of the list.
++		 * List head (pb_head) is set to the next entry.
++		 * Note that this code works even if head is the only element
++		 * on the list (because it's cyclic). 
++		 */
++		BUG_ON(head->pb_magic != PB_MAGIC);
++		*hp = next_page_pb(head);
++		PB_SHIFT_INC(head->refcount);
++		shift = PB_SHIFT_GET(head->refcount);
++		/* 
++		 * Update user beancounter, the share of head has been changed.
++		 * Note that the shift counter is taken after increment. 
++		 */
++		dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift);
++		/* add the new page beancounter to the end of the list */
++		head = *hp;
++		list_add_tail(&p->page_list, &head->page_list);
++	} else {
++		*hp = p;
++		shift = 0;
++		INIT_LIST_HEAD(&p->page_list);
++	}
++
++	p->refcount = PB_REFCOUNT_MAKE(shift, 1);
++	/* update user beancounter for the new page beancounter */
++	inc_held_pages(bc, UB_PAGE_WEIGHT >> shift);
++}
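
The net effect of __pb_add_ref() on accounting: one page's worth of charge, UB_PAGE_WEIGHT, is always split exactly among the sharers — each insertion halves the current head's share and hands the newcomer an equal half. A userspace sketch of just the shift arithmetic, with the cyclic-list rotation approximated by a round-robin index, checking this invariant:

#include <assert.h>
#include <stdio.h>

#define PAGE_WEIGHT_SHIFT 24
#define PAGE_WEIGHT (1UL << PAGE_WEIGHT_SHIFT)

int main(void)
{
	unsigned shift[16];
	int n = 0, head = 0;

	shift[n++] = 0;			/* the first mapper owns the full weight */

	for (int add = 1; add < 16; add++) {
		/* as in __pb_add_ref(): halve the head's share ... */
		shift[head]++;
		/* ... give the newcomer the same (post-increment) shift ... */
		shift[n++] = shift[head];
		/* ... and let the head pointer move on */
		head = (head + 1) % n;

		unsigned long sum = 0;
		for (int i = 0; i < n; i++)
			sum += PAGE_WEIGHT >> shift[i];
		assert(sum == PAGE_WEIGHT);	/* shares always add up to one page */
	}
	printf("weight invariant holds for up to 16 sharers\n");
	return 0;
}
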
++
++void pb_add_ref(struct page *page, struct mm_struct *mm,
++		struct page_beancounter **p_pb)
++{
++	int hash;
++	struct user_beancounter *bc;
++
++	bc = mm->mm_ub;
++	if (bc == NULL)
++		return;
++
++	if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++		return;
++
++	hash = pb_hash(bc, page);
++
++	spin_lock(&pb_lock);
++	if (__pb_dup_ref(page, bc, hash))
++		__pb_add_ref(page, bc, p_pb, hash);
++	spin_unlock(&pb_lock);
++}
++
++void pb_dup_ref(struct page *page, struct mm_struct *mm,
++		struct page_beancounter **p_pb)
++{
++	int hash;
++	struct user_beancounter *bc;
++
++	bc = mm->mm_ub;
++	if (bc == NULL)
++		return;
++
++	if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++		return;
++
++	hash = pb_hash(bc, page);
++
++	spin_lock(&pb_lock);
++	if (*page_pblist(page) == NULL)
++		/*
++		 * pages like ZERO_PAGE must not be accounted in pbc,
++		 * so on fork we just skip them
++		 */
++		goto out_unlock;
++
++	if (unlikely(*p_pb != PBC_COPY_SAME))
++		__pb_add_ref(page, bc, p_pb, hash);
++	else if (unlikely(__pb_dup_ref(page, bc, hash)))
++		WARN_ON(1);
++out_unlock:
++	spin_unlock(&pb_lock);
++}
++
++void pb_remove_ref(struct page *page, struct mm_struct *mm)
++{
++	int hash;
++	struct user_beancounter *bc;
++	struct page_beancounter *p, **q, *f;
++	int shift, shiftt;
++
++	bc = mm->mm_ub;
++	if (bc == NULL)
++		return;
++
++	if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++		return;
++
++	hash = pb_hash(bc, page);
++
++	spin_lock(&pb_lock);
++	for (q = pb_hash_table + hash, p = *q;
++			p != NULL && (p->page != page || p->ub != bc);
++			q = &p->next_hash, p = *q);
++	if (p == NULL)
++		goto out_unlock;
++
++	PB_COUNT_DEC(p->refcount);
++	if (PB_COUNT_GET(p->refcount))
++		/* 
++		 * More references from the same user beancounter exist.
++		 * Nothing needs to be done. 
++		 */
++		goto out_unlock;
++
++	/* remove from the hash list */
++	f = p;
++	*q = p->next_hash;
++
++	shift = PB_SHIFT_GET(p->refcount);
++
++	dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift);
++
++	q = page_pblist(page);
++	if (*q == p) {
++		if (list_empty(&p->page_list)) {
++			*q = NULL;
++			goto out_free;
++		}
++
++		*q = next_page_pb(p);
++	}
++	list_del(&p->page_list);
++
++	/* Now balance the list.  Move the tail and adjust its shift counter. */
++	p = prev_page_pb(*q);
++	shiftt = PB_SHIFT_GET(p->refcount);
++	*q = p;
++	PB_SHIFT_DEC(p->refcount);
++
++	inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
++
++	/* 
++	 * If the shift counter of the moved beancounter is different from the
++	 * removed one's, repeat the procedure for one more tail beancounter 
++	 */
++	if (shiftt > shift) {
++		p = prev_page_pb(*q);
++		*q = p;
++		PB_SHIFT_DEC(p->refcount);
++		inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
++	}
++out_free:
++	dec_pbc_count(f->ub);
++	spin_unlock(&pb_lock);
++
++	put_beancounter(f->ub);
++	pb_free(&f);
++	return;
++
++out_unlock:
++	spin_unlock(&pb_lock);
++}
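
A worked pass through the rebalance above, writing W for UB_PAGE_WEIGHT: let three sharers hold shifts (1, 2, 2), i.e. W/2 + W/4 + W/4 = W. Removing the shift-1 sharer uncharges W/2, so the two shift-2 survivors now account for only W/2 between them. The tail entry's shift drops from 2 to 1, doubling its share (inc by W >> 2); since shiftt = 2 exceeds the removed entry's shift = 1, the step repeats for the next tail, which also goes from 2 to 1 (inc by W >> 2 again). The survivors then hold W/2 + W/2 = W, restoring the one-page total. Two steps suffice provided neighbouring shifts differ by at most one, which the cyclic head rotation in __pb_add_ref() is designed to maintain.
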
++
++struct user_beancounter *pb_grab_page_ub(struct page *page)
++{
++	struct page_beancounter *pb;
++	struct user_beancounter *ub;
++
++	spin_lock(&pb_lock);
++	pb = *page_pblist(page);
++	ub = (pb == NULL ? ERR_PTR(-EINVAL) :
++			get_beancounter(pb->ub));
++	spin_unlock(&pb_lock);
++	return ub;
++}
++
++void __init ub_init_pbc(void)
++{
++	unsigned long hash_size;
++
++	pb_cachep = kmem_cache_create("page_beancounter", 
++			sizeof(struct page_beancounter), 0,
++			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
++	hash_size = num_physpages >> 2;
++	for (pb_hash_mask = 1;
++		(hash_size & pb_hash_mask) != hash_size;
++		pb_hash_mask = (pb_hash_mask << 1) + 1);
++	hash_size = pb_hash_mask + 1;
++	printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size);
++	pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *));
++	if (pb_hash_table == NULL)
++		panic("Can't allocate page beancounter hash");
++	memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *));
++
++	ub_init_io(pb_cachep);
++}
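
The for loop in ub_init_pbc() searches for the smallest all-ones mask that covers every bit of hash_size. A standalone check of the arithmetic (the num_physpages value is arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned long num_physpages = 262144;	/* e.g. 1 GiB of 4 KiB pages */
	unsigned long hash_size = num_physpages >> 2;
	unsigned int mask;

	/* same loop as in ub_init_pbc() */
	for (mask = 1; (hash_size & mask) != hash_size; mask = (mask << 1) + 1)
		;
	printf("hash_size %lu -> mask 0x%x -> %u entries\n",
			hash_size, mask, mask + 1);
	/* prints: hash_size 65536 -> mask 0x1ffff -> 131072 entries */
	return 0;
}
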
+diff --git a/kernel/bc/statd.c b/kernel/bc/statd.c
+new file mode 100644
+index 0000000..bf6354b
+--- /dev/null
++++ b/kernel/bc/statd.c
+@@ -0,0 +1,453 @@
++/*
++ *  kernel/bc/statd.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/timer.h>
++#include <linux/sched.h>
++#include <linux/init.h>
++#include <linux/jiffies.h>
++#include <linux/list.h>
++#include <linux/errno.h>
++#include <linux/suspend.h>
++#include <linux/freezer.h>
++
++#include <asm/uaccess.h>
++#include <asm/param.h>
++
++#include <bc/beancounter.h>
++#include <bc/hash.h>
++#include <bc/statd.h>
++
++static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED;
++static LIST_HEAD(ubs_notify_list);
++static long ubs_min_interval;
++static ubstattime_t ubs_start_time, ubs_end_time;
++static struct timer_list ubs_timer;
++
++static int ubstat_get_list(void __user *buf, long size)
++{
++	int retval;
++	struct user_beancounter *ub, *ubp;
++	long *page, *ptr, *end;
++	int len;
++
++	page = (long *)__get_free_page(GFP_KERNEL);
++	if (page == NULL)
++		return -ENOMEM;
++
++	retval = 0;
++	ubp = NULL;
++	ptr = page;
++	end = page + PAGE_SIZE / sizeof(*ptr);
++
++	spin_lock_irq(&ub_hash_lock);
++	for_each_beancounter(ub) {
++		if (ub->parent != NULL)
++			continue;
++		*ptr++ = ub->ub_uid;
++		if (ptr != end)
++			continue;
++
++		get_beancounter(ub);
++		spin_unlock_irq(&ub_hash_lock);
++
++		put_beancounter(ubp);
++		ubp = ub;
++
++		len = min_t(long, (ptr - page) * sizeof(*ptr), size);
++		if (copy_to_user(buf, page, len)) {
++			retval = -EFAULT;
++			goto out_put;
++		}
++		retval += len;
++		if (len < PAGE_SIZE)
++			goto out_put;
++		buf += len;
++		size -= len;
++
++		ptr = page;
++		end = page + PAGE_SIZE / sizeof(*ptr);
++
++		spin_lock_irq(&ub_hash_lock);
++	}
++	spin_unlock_irq(&ub_hash_lock);
++
++	put_beancounter(ubp);
++	ubp = NULL;	/* avoid dropping the reference again at out_put */
++	size = min_t(long, (ptr - page) * sizeof(*ptr), size);
++	if (size > 0 && copy_to_user(buf, page, size)) {
++		retval = -EFAULT;
++		goto out_put;
++	}
++	retval += size;
++
++out_put:
++	put_beancounter(ubp);
++	free_page((unsigned long)page);
++	return retval;
++}
++
++static int ubstat_gettime(void __user *buf, long size)
++{
++	ubgettime_t data;
++	int retval;
++
++	spin_lock(&ubs_notify_lock);
++	data.start_time = ubs_start_time;
++	data.end_time = ubs_end_time;
++	data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
++	spin_unlock(&ubs_notify_lock);
++
++	retval = min_t(long, sizeof(data), size);
++	if (copy_to_user(buf, &data, retval))
++		retval = -EFAULT;
++	return retval;
++}
++
++static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
++{
++	struct {
++		ubstattime_t	start_time;
++		ubstattime_t	end_time;
++		ubstatparm_t	param[1];
++	} *data;
++
++	data = kbuf;
++	data->start_time = ubs_start_time;
++	data->end_time = ubs_end_time;
++
++	data->param[0].maxheld = ub->ub_store[res].maxheld;
++	data->param[0].failcnt = ub->ub_store[res].failcnt;
++
++	return sizeof(*data);
++}
++
++static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
++{
++	int wrote;
++	struct {
++		ubstattime_t	start_time;
++		ubstattime_t	end_time;
++		ubstatparm_t	param[UB_RESOURCES];
++	} *data;
++	int resource;
++
++	data = kbuf;
++	data->start_time = ubs_start_time;
++	data->end_time = ubs_end_time;
++	wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++	for (resource = 0; resource < UB_RESOURCES; resource++) {
++		if (size < wrote + sizeof(data->param[resource]))
++			break;
++		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
++		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
++		wrote += sizeof(data->param[resource]); 
++	}
++
++	return wrote;
++}
++
++static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
++		int size)
++{
++	int wrote;
++	struct {
++		ubstattime_t	start_time;
++		ubstattime_t	end_time;
++		ubstatparmf_t	param[UB_RESOURCES];
++	} *data;
++	int resource;
++
++	data = kbuf;
++	data->start_time = ubs_start_time;
++	data->end_time = ubs_end_time;
++	wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++	for (resource = 0; resource < UB_RESOURCES; resource++) {
++		if (size < wrote + sizeof(data->param[resource]))
++			break;
++		/* The beginning of ubstatparmf_t matches struct ubparm. */
++		memcpy(&data->param[resource], &ub->ub_store[resource],
++				sizeof(ub->ub_store[resource]));
++		data->param[resource].__unused1 = 0;
++		data->param[resource].__unused2 = 0;
++		wrote += sizeof(data->param[resource]);
++	}
++	return wrote;
++}
++
++static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
++		void __user *buf, long size)
++{
++	void *kbuf;
++	int retval;
++
++	kbuf = (void *)__get_free_page(GFP_KERNEL);
++	if (kbuf == NULL)
++		return -ENOMEM;
++
++	spin_lock(&ubs_notify_lock);
++	switch (UBSTAT_CMD(cmd)) {
++		case UBSTAT_READ_ONE:
++			retval = -EINVAL;
++			if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
++				break;
++			retval = ubstat_do_read_one(ub,
++					UBSTAT_PARMID(cmd), kbuf);
++			break;
++		case UBSTAT_READ_ALL:
++			retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
++			break;
++		case UBSTAT_READ_FULL:
++			retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
++			break;
++		default:
++			retval = -EINVAL;
++	}
++	spin_unlock(&ubs_notify_lock);
++
++	if (retval > 0) {
++		retval = min_t(long, retval, size);
++		if (copy_to_user(buf, kbuf, retval))
++			retval = -EFAULT;
++	}
++
++	free_page((unsigned long)kbuf);
++	return retval;
++}
++
++static int ubstat_handle_notifrq(ubnotifrq_t *req)
++{
++	int retval;
++	struct ub_stat_notify *new_notify;
++	struct list_head *entry;
++	struct task_struct *tsk_to_free;
++
++	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
++	if (new_notify == NULL)
++		return -ENOMEM;
++
++	tsk_to_free = NULL;
++	INIT_LIST_HEAD(&new_notify->list);
++
++	spin_lock(&ubs_notify_lock);
++	list_for_each(entry, &ubs_notify_list) {
++		struct ub_stat_notify *notify;
++
++		notify = list_entry(entry, struct ub_stat_notify, list);
++		if (notify->task == current) {
++			kfree(new_notify);
++			new_notify = notify;
++			break;
++		}
++	}
++
++	retval = -EINVAL;
++	if (req->maxinterval < 1)
++		goto out_unlock;
++	if (req->maxinterval > TIME_MAX_SEC)
++		req->maxinterval = TIME_MAX_SEC;
++	if (req->maxinterval < ubs_min_interval) {
++		unsigned long dif;
++
++		ubs_min_interval = req->maxinterval;
++		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
++		if (dif > req->maxinterval)
++			mod_timer(&ubs_timer,
++					ubs_timer.expires -
++					(dif - req->maxinterval) * HZ);
++	}
++
++	if (entry != &ubs_notify_list) {
++		list_del(&new_notify->list);
++		tsk_to_free = new_notify->task;
++	}
++	if (req->signum) {
++		new_notify->task = current;
++		get_task_struct(new_notify->task);
++		new_notify->signum = req->signum;
++		list_add(&new_notify->list, &ubs_notify_list);
++	} else
++		kfree(new_notify);
++	retval = 0;
++out_unlock:
++	spin_unlock(&ubs_notify_lock);
++	if (tsk_to_free != NULL)
++		put_task_struct(tsk_to_free);
++	return retval;
++}
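
A numerical example of the timer adjustment above: suppose the statistics timer is due in 60 seconds and a request arrives with maxinterval = 10, below the recorded ubs_min_interval. Then dif computes to 60, which exceeds 10, so the timer is pulled in by (60 - 10) * HZ jiffies and ubs_min_interval drops to 10. A later request with maxinterval = 30 leaves the timer untouched, because 30 is not below the recorded minimum.
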
++
++/*
++ * former sys_ubstat
++ */
++long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
++		void __user *buf, long size)
++{
++	int retval;
++	struct user_beancounter *ub;
++
++	if (func == UBSTAT_UBPARMNUM)
++		return UB_RESOURCES;
++	if (func == UBSTAT_UBLIST)
++		return ubstat_get_list(buf, size);
++	if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)))
++		return -EPERM;
++
++	if (func == UBSTAT_GETTIME) {
++		retval = ubstat_gettime(buf, size);
++		goto notify;
++	}
++
++	ub = get_exec_ub();
++	if (ub != NULL && ub->ub_uid == arg1)
++		get_beancounter(ub);
++	else /* FIXME must be if (ve_is_super) */
++		ub = get_beancounter_byuid(arg1, 0);
++
++	if (ub == NULL)
++		return -ESRCH;
++
++	retval = ubstat_get_stat(ub, func, buf, size);
++	put_beancounter(ub);
++notify:
++	/* Handle request for notification */
++	if (retval >= 0) {
++		ubnotifrq_t notifrq;
++		int err;
++
++		err = -EFAULT;
++		if (!copy_from_user(&notifrq, (void __user *)arg2,
++					sizeof(notifrq)))
++			err = ubstat_handle_notifrq(&notifrq);
++		if (err)
++			retval = err;
++	}
++
++	return retval;
++}
++
++static void ubstat_save_onestat(struct user_beancounter *ub)
++{
++	int resource;
++
++	/* called with local irq disabled */
++	spin_lock(&ub->ub_lock);
++	for (resource = 0; resource < UB_RESOURCES; resource++) {
++		memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
++			sizeof(struct ubparm));
++		ub->ub_parms[resource].minheld = 
++			ub->ub_parms[resource].maxheld =
++			ub->ub_parms[resource].held;
++	}
++	spin_unlock(&ub->ub_lock);
++}
++
++static void ubstat_save_statistics(void)
++{
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	local_irq_save(flags);
++	for_each_beancounter (ub)
++		ubstat_save_onestat(ub);
++	local_irq_restore(flags);
++}
++
++static void ubstatd_timeout(unsigned long __data)
++{
++	struct task_struct *p;
++
++	p = (struct task_struct *) __data;
++	wake_up_process(p);
++}
++
++/*
++ * Safe wrapper for send_sig. It prevents a race with release_task
++ * for sighand.
++ * Should be called under tasklist_lock.
++ */
++static void task_send_sig(struct ub_stat_notify *notify)
++{
++	if (likely(notify->task->sighand != NULL))
++		send_sig(notify->signum, notify->task, 1);
++}
++
++static inline void do_notifies(void)
++{
++	LIST_HEAD(notif_free_list);
++	struct ub_stat_notify *notify;
++	struct ub_stat_notify *tmp;
++
++	spin_lock(&ubs_notify_lock);
++	ubs_start_time = ubs_end_time;
++	/*
++	 * the expression below relies on time being unsigned long and
++	 * arithmetic promotion rules
++	 */
++	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
++	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
++	ubs_min_interval = TIME_MAX_SEC;
++	/* save statistics accumulated for the interval */
++	ubstat_save_statistics();
++	/* send signals */
++	read_lock(&tasklist_lock);
++	while (!list_empty(&ubs_notify_list)) {
++		notify = list_entry(ubs_notify_list.next,
++				struct ub_stat_notify, list);
++		task_send_sig(notify);
++		list_del(&notify->list);
++		list_add(&notify->list, &notif_free_list);
++	}
++	read_unlock(&tasklist_lock);
++	spin_unlock(&ubs_notify_lock);
++
++	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
++		put_task_struct(notify->task);
++		kfree(notify);
++	}
++}
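
The wraparound safety the comment in do_notifies() relies on can be shown in isolation: with unsigned arithmetic, expires - start gives the true distance even after the tick counter wraps past zero. A toy demonstration with a 32-bit counter:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint32_t start = UINT32_MAX - 99;	/* just before the counter wraps */
	uint32_t expires = start + 300;		/* wraps around: numerically 200 */

	/* modular subtraction still yields the true distance */
	printf("distance: %" PRIu32 " ticks\n", expires - start);	/* 300 */
	return 0;
}
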
++
++/*
++ * Kernel thread
++ */
++static int ubstatd(void *unused)
++{
++	/* daemonize call will take care of signals */
++	daemonize("ubstatd");
++
++	ubs_timer.data = (unsigned long)current;
++	ubs_timer.function = ubstatd_timeout;
++	add_timer(&ubs_timer);
++
++	while (1) {
++		set_task_state(current, TASK_INTERRUPTIBLE);
++		if (time_after(ubs_timer.expires, jiffies)) {
++			schedule();
++			try_to_freeze();
++			continue;
++		}
++
++		__set_task_state(current, TASK_RUNNING);
++		do_notifies();
++	}
++	return 0;
++}
++
++static int __init ubstatd_init(void)
++{
++	init_timer(&ubs_timer);
++	ubs_timer.expires = TIME_MAX_JIF;
++	ubs_min_interval = TIME_MAX_SEC;
++	ubs_start_time = ubs_end_time = 0;
++
++	kernel_thread(ubstatd, NULL, 0);
++	return 0;
++}
++
++module_init(ubstatd_init);
+diff --git a/kernel/bc/sys.c b/kernel/bc/sys.c
+new file mode 100644
+index 0000000..798166b
+--- /dev/null
++++ b/kernel/bc/sys.c
+@@ -0,0 +1,173 @@
++/*
++ *  kernel/bc/sys.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/virtinfo.h>
++#include <linux/compat.h>
++#include <asm/uaccess.h>
++
++#include <bc/beancounter.h>
++
++/*
++ *	The (rather boring) getluid syscall
++ */
++asmlinkage long sys_getluid(void)
++{
++	struct user_beancounter *ub;
++
++	ub = get_exec_ub();
++	if (ub == NULL)
++		return -EINVAL;
++
++	return ub->ub_uid;
++}
++
++/*
++ *	The setluid syscall
++ */
++asmlinkage long sys_setluid(uid_t uid)
++{
++	struct user_beancounter *ub;
++	struct task_beancounter *task_bc;
++	int error;
++
++	task_bc = &current->task_bc;
++
++	/* You may not disown a setluid */
++	error = -EINVAL;
++	if (uid == (uid_t)-1)
++		goto out;
++
++	/* You may only set an ub as root */
++	error = -EPERM;
++	if (!capable(CAP_SETUID))
++		goto out;
++	/*
++	 * Once set, the ub is irrevocable: it can only be
++	 * changed again from inside VE0.
++	 */
++	if (!ve_is_super(get_exec_env()))
++		goto out;
++
++	/* Ok - set up a beancounter entry for this user */
++	error = -ENOBUFS;
++	ub = get_beancounter_byuid(uid, 1);
++	if (ub == NULL)
++		goto out;
++
++	ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) "
++			"for %.20s pid %d\n",
++			ub, atomic_read(&ub->ub_refcount),
++			current->comm, current->pid);
++	/* install bc */
++	error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub);
++	if (!(error & NOTIFY_FAIL)) {
++		put_beancounter(task_bc->exec_ub);
++		task_bc->exec_ub = ub;
++		if (!(error & NOTIFY_OK)) {
++			put_beancounter(task_bc->fork_sub);
++			task_bc->fork_sub = get_beancounter(ub);
++		}
++		error = 0;
++	} else {
++		put_beancounter(ub);
++		error = -ENOBUFS;
++	}
++out:
++	return error;
++}
++
++long do_setublimit(uid_t uid, unsigned long resource,
++		unsigned long *new_limits)
++{
++	int error;
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	error = -EPERM;
++	if (!capable(CAP_SYS_RESOURCE))
++		goto out;
++
++	if (!ve_is_super(get_exec_env()))
++		goto out;
++
++	error = -EINVAL;
++	if (resource >= UB_RESOURCES)
++		goto out;
++
++	error = -EINVAL;
++	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
++		goto out;
++
++	error = -ENOENT;
++	ub = get_beancounter_byuid(uid, 0);
++	if (ub == NULL) {
++		ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid);
++		goto out;
++	}
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_parms[resource].barrier = new_limits[0];
++	ub->ub_parms[resource].limit = new_limits[1];
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++	put_beancounter(ub);
++
++	error = 0;
++out:
++	return error;
++}
++
++/*
++ *	The setbeanlimit syscall
++ */
++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
++		unsigned long __user *limits)
++{
++	unsigned long new_limits[2];
++
++	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
++		return -EFAULT;
++
++	return do_setublimit(uid, resource, new_limits);
++}
++
++extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, 
++		void __user *buf, long size);
++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, 
++		void __user *buf, long size)
++{
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
++	return do_ubstat(func, arg1, arg2, buf, size);
++}
++
++#ifdef CONFIG_COMPAT
++asmlinkage long compat_sys_setublimit(uid_t uid, int resource,
++		unsigned int __user *limits)
++{
++	unsigned int u_new_limits[2];
++	unsigned long new_limits[2];
++
++	if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits)))
++		return -EFAULT;
++
++	new_limits[0] = u_new_limits[0];
++	new_limits[1] = u_new_limits[1];
++
++	return do_setublimit(uid, resource, new_limits);
++}
++
++asmlinkage long compat_sys_ubstat(int func, unsigned int arg1,
++		unsigned int arg2, compat_uptr_t *buf, long size)
++{
++	return sys_ubstat(func, arg1, arg2, buf, size);
++}
++#endif
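
For orientation, a hypothetical userspace sketch of the call sequence these entry points serve. It is not part of the patch: the syscall numbers below are made up, and the real values (plus the resource indices) come from the OpenVZ-patched arch unistd.h and bc headers:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* fictitious syscall numbers -- take the real ones from the OpenVZ headers */
#define NR_setluid	510
#define NR_setublimit	511

int main(void)
{
	uid_t luid = 101;				/* beancounter id to attach to */
	unsigned long limits[2] = { 4096, 8192 };	/* { barrier, limit } */

	/* needs CAP_SETUID and must be issued from VE0 */
	if (syscall(NR_setluid, luid))
		perror("setluid");

	/* needs CAP_SYS_RESOURCE; resource index 0 is illustrative only */
	if (syscall(NR_setublimit, luid, 0UL, limits))
		perror("setublimit");
	return 0;
}
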
+diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
+new file mode 100644
+index 0000000..e98134b
+--- /dev/null
++++ b/kernel/bc/vm_pages.c
+@@ -0,0 +1,549 @@
++/*
++ *  kernel/bc/vm_pages.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/mm.h>
++#include <linux/highmem.h>
++#include <linux/virtinfo.h>
++#include <linux/module.h>
++#include <linux/shmem_fs.h>
++#include <linux/vmalloc.h>
++#include <linux/init.h>
++
++#include <asm/pgtable.h>
++#include <asm/page.h>
++
++#include <bc/beancounter.h>
++#include <bc/vmpages.h>
++#include <bc/proc.h>
++
++static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma,
++		pmd_t *pmd, unsigned long addr, unsigned long end,
++		unsigned long *ret)
++{
++	pte_t *pte;
++	spinlock_t *ptl;
++
++	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
++	do {
++		if (!pte_none(*pte) && pte_present(*pte))
++			(*ret)++;
++	} while (pte++, addr += PAGE_SIZE, (addr != end));
++	pte_unmap_unlock(pte - 1, ptl);
++
++	return addr;
++}
++
++static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma,
++		pud_t *pud, unsigned long addr, unsigned long end,
++		unsigned long *ret)
++{
++	pmd_t *pmd;
++	unsigned long next;
++
++	pmd = pmd_offset(pud, addr);
++	do {
++		next = pmd_addr_end(addr, end);
++		if (pmd_none_or_clear_bad(pmd))
++			continue;
++		next = pages_in_pte_range(vma, pmd, addr, next, ret);
++	} while (pmd++, addr = next, (addr != end));
++
++	return addr;
++}
++
++static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma,
++		pgd_t *pgd, unsigned long addr, unsigned long end,
++		unsigned long *ret)
++{
++	pud_t *pud;
++	unsigned long next;
++
++	pud = pud_offset(pgd, addr);
++	do {
++		next = pud_addr_end(addr, end);
++		if (pud_none_or_clear_bad(pud))
++			continue;
++		next = pages_in_pmd_range(vma, pud, addr, next, ret);
++	} while (pud++, addr = next, (addr != end));
++
++	return addr;
++}
++
++unsigned long pages_in_vma_range(struct vm_area_struct *vma,
++		unsigned long addr, unsigned long end)
++{
++	pgd_t *pgd;
++	unsigned long next;
++	unsigned long ret;
++
++	ret = 0;
++	BUG_ON(addr >= end);
++	pgd = pgd_offset(vma->vm_mm, addr);
++	do {
++		next = pgd_addr_end(addr, end);
++		if (pgd_none_or_clear_bad(pgd))
++			continue;
++		next = pages_in_pud_range(vma, pgd, addr, next, &ret);
++	} while (pgd++, addr = next, (addr != end));
++	return ret;
++}
++
++void __ub_update_physpages(struct user_beancounter *ub)
++{
++	ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages
++		+ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT);
++	ub_adjust_maxheld(ub, UB_PHYSPAGES);
++}
++
++void __ub_update_oomguarpages(struct user_beancounter *ub)
++{
++	ub->ub_parms[UB_OOMGUARPAGES].held =
++		ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages;
++	ub_adjust_maxheld(ub, UB_OOMGUARPAGES);
++}
++
++void __ub_update_privvm(struct user_beancounter *ub)
++{
++	ub->ub_parms[UB_PRIVVMPAGES].held =
++		(ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT)
++		+ ub->ub_unused_privvmpages
++		+ ub->ub_parms[UB_SHMPAGES].held;
++	ub_adjust_maxheld(ub, UB_PRIVVMPAGES);
++}
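
To read these three helpers: ub_held_pages is kept in units of UB_PAGE_WEIGHT fractions, so shifting right by UB_PAGE_WEIGHT_SHIFT converts it back to whole pages — a page mapped by two beancounters contributes UB_PAGE_WEIGHT / 2 to each side's ub_held_pages, i.e. half a page apiece after the shift. Physpages is therefore tmpfs-resident pages plus that (possibly fractional) share of mapped pages; oomguarpages adds swapped-out pages on top of physpages; and privvmpages combines the mapped-page share with not-yet-touched private mappings and SHM pages.
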
++
++static inline int __charge_privvm_locked(struct user_beancounter *ub, 
++		unsigned long s, enum ub_severity strict)
++{
++	if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0)
++		return -ENOMEM;
++
++	ub->ub_unused_privvmpages += s;
++	return 0;
++}
++
++static void __unused_privvm_dec_locked(struct user_beancounter *ub, 
++		long size)
++{
++	/* catch possible underflow */
++	if (ub->ub_unused_privvmpages < size) {
++		uncharge_warn(ub, UB_UNUSEDPRIVVM,
++				size, ub->ub_unused_privvmpages);
++		size = ub->ub_unused_privvmpages;
++	}
++	ub->ub_unused_privvmpages -= size;
++	__ub_update_privvm(ub);
++}
++
++void __ub_unused_privvm_dec(struct mm_struct *mm, long size)
++{
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return;
++
++	ub = top_beancounter(ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__unused_privvm_dec_locked(ub, size);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_unused_privvm_sub(struct mm_struct *mm,
++		struct vm_area_struct *vma, unsigned long count)
++{
++	if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
++		__ub_unused_privvm_dec(mm, count);
++}
++
++void ub_unused_privvm_add(struct mm_struct *mm,
++		struct vm_area_struct *vma, unsigned long size)
++{
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	ub = mm->mm_ub;
++	if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
++		return;
++
++	ub = top_beancounter(ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_unused_privvmpages += size;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++int ub_protected_charge(struct mm_struct *mm, unsigned long size,
++		unsigned long newflags, struct vm_area_struct *vma)
++{
++	unsigned long flags;
++	struct file *file;
++	struct user_beancounter *ub;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return PRIVVM_NO_CHARGE;
++
++	flags = vma->vm_flags;
++	if (!((newflags ^ flags) & VM_WRITE))
++		return PRIVVM_NO_CHARGE;
++
++	file = vma->vm_file;
++	if (!VM_UB_PRIVATE(newflags | VM_WRITE, file))
++		return PRIVVM_NO_CHARGE;
++
++	if (flags & VM_WRITE)
++		return PRIVVM_TO_SHARED;
++
++	ub = top_beancounter(ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (__charge_privvm_locked(ub, size, UB_SOFT) < 0)
++		goto err;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return PRIVVM_TO_PRIVATE;
++
++err:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return PRIVVM_ERROR;
++}
++
++int ub_memory_charge(struct mm_struct *mm, unsigned long size,
++		unsigned vm_flags, struct file *vm_file, int sv)
++{
++	struct user_beancounter *ub, *ubl;
++	unsigned long flags;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return 0;
++
++	size >>= PAGE_SHIFT;
++	if (size > UB_MAXVALUE)
++		return -EINVAL;
++
++	BUG_ON(sv != UB_SOFT && sv != UB_HARD);
++
++	if (vm_flags & VM_LOCKED) {
++		if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
++			goto out_err;
++	}
++	if (VM_UB_PRIVATE(vm_flags, vm_file)) {
++		ubl = top_beancounter(ub);
++		spin_lock_irqsave(&ubl->ub_lock, flags);
++		if (__charge_privvm_locked(ubl, size, sv))
++			goto out_private;
++		spin_unlock_irqrestore(&ubl->ub_lock, flags);
++	}
++	return 0;
++
++out_private:
++	spin_unlock_irqrestore(&ubl->ub_lock, flags);
++	if (vm_flags & VM_LOCKED)
++		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
++out_err:
++	return -ENOMEM;
++}
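
Note the charge order in ub_memory_charge(): UB_LOCKEDPAGES is taken first and rolled back if the UB_PRIVVMPAGES charge fails, so either both resources end up charged or neither does. A userspace model of the pattern, with two limited counters standing in for the beancounter resources:

#include <stdio.h>

struct res { unsigned long held, limit; };

static int charge(struct res *r, unsigned long v)
{
	if (r->held + v > r->limit)
		return -1;
	r->held += v;
	return 0;
}

static void uncharge(struct res *r, unsigned long v)
{
	r->held -= v;
}

/* charge both or neither, as ub_memory_charge() does */
static int charge_both(struct res *locked, struct res *priv, unsigned long v)
{
	if (charge(locked, v))
		return -1;
	if (charge(priv, v)) {
		uncharge(locked, v);	/* roll back the first charge */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct res locked = { 0, 100 }, priv = { 0, 50 };

	printf("first:  %d\n", charge_both(&locked, &priv, 40));	/* 0  */
	printf("second: %d\n", charge_both(&locked, &priv, 40));	/* -1 */
	printf("locked.held = %lu (rolled back)\n", locked.held);	/* 40 */
	return 0;
}
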
++
++void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
++		unsigned vm_flags, struct file *vm_file)
++{
++	struct user_beancounter *ub;
++	unsigned long flags;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return;
++
++	size >>= PAGE_SHIFT;
++
++	if (vm_flags & VM_LOCKED)
++		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
++	if (VM_UB_PRIVATE(vm_flags, vm_file)) {
++		ub = top_beancounter(ub);
++		spin_lock_irqsave(&ub->ub_lock, flags);
++		__unused_privvm_dec_locked(ub, size);
++		spin_unlock_irqrestore(&ub->ub_lock, flags);
++	}
++}
++
++int ub_locked_charge(struct mm_struct *mm, unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return 0;
++
++	return charge_beancounter(ub, UB_LOCKEDPAGES,
++			size >> PAGE_SHIFT, UB_HARD);
++}
++
++void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	ub = mm->mm_ub;
++	if (ub == NULL)
++		return;
++
++	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
++}
++
++int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	ub = shi->shmi_ub;
++	if (ub == NULL)
++		return 0;
++
++	return charge_beancounter(ub, UB_LOCKEDPAGES,
++			size >> PAGE_SHIFT, UB_HARD);
++}
++
++void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	ub = shi->shmi_ub;
++	if (ub == NULL)
++		return;
++
++	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
++}
++
++
++static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_tmpfs_respages++;
++	__ub_update_physpages(ub);
++	__ub_update_oomguarpages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_tmpfs_respages_inc(struct shmem_inode_info *shi)
++{
++	struct user_beancounter *ub;
++
++	for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
++		do_ub_tmpfs_respages_inc(ub);
++}
++
++static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub,
++		unsigned long size)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	/* catch possible underflow */
++	if (ub->ub_tmpfs_respages < size) {
++		uncharge_warn(ub, UB_TMPFSPAGES,
++				size, ub->ub_tmpfs_respages);
++		size = ub->ub_tmpfs_respages;
++	}
++	ub->ub_tmpfs_respages -= size;
++	/* update the derived values; those are what matter most */
++	__ub_update_physpages(ub);
++	__ub_update_oomguarpages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
++		unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
++		do_ub_tmpfs_respages_sub(ub, size);
++}
++
++int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size)
++{
++	int ret;
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	ub = shi->shmi_ub;
++	if (ub == NULL)
++		return 0;
++
++	ub = top_beancounter(ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD);
++	if (ret == 0)
++		__ub_update_privvm(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return ret;
++}
++
++void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size)
++{
++	unsigned long flags;
++	struct user_beancounter *ub;
++
++	ub = shi->shmi_ub;
++	if (ub == NULL)
++		return;
++
++	ub = top_beancounter(ub);
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__uncharge_beancounter_locked(ub, UB_SHMPAGES, size);
++	__ub_update_privvm(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++#ifdef CONFIG_BC_SWAP_ACCOUNTING
++static inline void do_ub_swapentry_inc(struct user_beancounter *ub)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	ub->ub_swap_pages++;
++	__ub_update_oomguarpages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num,
++		struct user_beancounter *ub)
++{
++	si->swap_ubs[num] = get_beancounter(ub);
++	for (; ub != NULL; ub = ub->parent)
++		do_ub_swapentry_inc(ub);
++}
++EXPORT_SYMBOL(ub_swapentry_inc);
++
++static inline void do_ub_swapentry_dec(struct user_beancounter *ub)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (ub->ub_swap_pages <= 0)
++		uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages);
++	else
++		ub->ub_swap_pages--;
++	__ub_update_oomguarpages(ub);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num)
++{
++	struct user_beancounter *ub, *ubp;
++
++	ub = si->swap_ubs[num];
++	si->swap_ubs[num] = NULL;
++	for (ubp = ub; ubp != NULL; ubp = ubp->parent)
++		do_ub_swapentry_dec(ubp);
++	put_beancounter(ub);
++}
++EXPORT_SYMBOL(ub_swapentry_dec);
++
++int ub_swap_init(struct swap_info_struct *si, pgoff_t num)
++{
++	struct user_beancounter **ubs;
++
++	ubs = vmalloc(num * sizeof(struct user_beancounter *));
++	if (ubs == NULL)
++		return -ENOMEM;
++
++	memset(ubs, 0, num * sizeof(struct user_beancounter *));
++	si->swap_ubs = ubs;
++	return 0;
++}
++
++void ub_swap_fini(struct swap_info_struct *si)
++{
++	if (si->swap_ubs) {
++		vfree(si->swap_ubs);
++		si->swap_ubs = NULL;
++	}
++}
++#endif
++
++static int vmguar_enough_memory(struct vnotifier_block *self,
++		unsigned long event, void *arg, int old_ret)
++{
++	struct user_beancounter *ub;
++
++	if (event != VIRTINFO_ENOUGHMEM)
++		return old_ret;
++	/*
++	 * Kernel threads are of no interest here.  This check was
++	 * added so that aufsd runs smoothly over ramfs.
++	 */
++	if (!current->mm)
++		return NOTIFY_DONE;
++
++	ub = top_beancounter(current->mm->mm_ub);
++	if (ub->ub_parms[UB_PRIVVMPAGES].held >
++			ub->ub_parms[UB_VMGUARPAGES].barrier)
++		return old_ret;
++
++	return NOTIFY_OK;
++}
++
++static struct vnotifier_block vmguar_notifier_block = {
++	.notifier_call = vmguar_enough_memory
++};
++
++static int __init init_vmguar_notifier(void)
++{
++	virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block);
++	return 0;
++}
++
++static void __exit fini_vmguar_notifier(void)
++{
++	virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block);
++}
++
++module_init(init_vmguar_notifier);
++module_exit(fini_vmguar_notifier);
++
++#ifdef CONFIG_PROC_FS
++static int bc_vmaux_show(struct seq_file *f, void *v)
++{
++	struct user_beancounter *ub;
++	unsigned long swap, unmap;
++	int i;
++
++	ub = seq_beancounter(f);
++
++	swap = unmap = 0;
++	for_each_online_cpu(i) {
++		swap += per_cpu_ptr(ub->ub_percpu, i)->swapin;
++		unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap;
++	}
++
++	seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM],
++			ub->ub_unused_privvmpages);
++	seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES],
++			ub->ub_tmpfs_respages);
++	seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES],
++			ub->ub_swap_pages);
++
++	seq_printf(f, bc_proc_lu_fmt, "swapin", swap);
++	seq_printf(f, bc_proc_lu_fmt, "unmap", unmap);
++	return 0;
++}
++static struct bc_proc_entry bc_vmaux_entry = {
++	.name = "vmaux",
++	.u.show = bc_vmaux_show,
++};
++
++static int __init bc_vmaux_init(void)
++{
++	bc_register_proc_entry(&bc_vmaux_entry);
++	return 0;
++}
++
++late_initcall(bc_vmaux_init);
++#endif
+diff --git a/kernel/capability.c b/kernel/capability.c
+index 901e0fd..6618a51 100644
+--- a/kernel/capability.c
++++ b/kernel/capability.c
+@@ -19,7 +19,8 @@
+  * This lock protects task->cap_* for all tasks including current.
+  * Locking rule: acquire this prior to tasklist_lock.
+  */
+-static DEFINE_SPINLOCK(task_capability_lock);
++DEFINE_SPINLOCK(task_capability_lock);
++EXPORT_SYMBOL(task_capability_lock);
+ 
+ /*
+  * Leveraged for setting/resetting capabilities
+@@ -242,7 +243,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
+ 	pgrp = find_vpid(pgrp_nr);
+ 	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
+ 		target = g;
+-		while_each_thread(g, target) {
++		while_each_thread_ve(g, target) {
+ 			if (!security_capset_check(target, effective,
+ 							inheritable,
+ 							permitted)) {
+@@ -272,7 +273,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
+      int ret = -EPERM;
+      int found = 0;
+ 
+-     do_each_thread(g, target) {
++     do_each_thread_ve(g, target) {
+              if (target == current || is_container_init(target->group_leader))
+                      continue;
+              found = 1;
+@@ -281,7 +282,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
+ 		     continue;
+ 	     ret = 0;
+ 	     security_capset_set(target, effective, inheritable, permitted);
+-     } while_each_thread(g, target);
++     } while_each_thread_ve(g, target);
+ 
+      if (!found)
+ 	     ret = 0;
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index 15ac0e1..e2735e2 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -1809,7 +1809,7 @@ static void cgroup_enable_task_cg_lists(void)
+ 	struct task_struct *p, *g;
+ 	write_lock(&css_set_lock);
+ 	use_task_css_set_links = 1;
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		task_lock(p);
+ 		/*
+ 		 * We should check if the process is exiting, otherwise
+@@ -1819,7 +1819,7 @@ static void cgroup_enable_task_cg_lists(void)
+ 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
+ 			list_add(&p->cg_list, &p->cgroups->tasks);
+ 		task_unlock(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 	write_unlock(&css_set_lock);
+ }
+ 
+@@ -2894,9 +2894,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+  again:
+ 	root = subsys->root;
+ 	if (root == &rootnode) {
+-		printk(KERN_INFO
+-		       "Not cloning cgroup for unused subsystem %s\n",
+-		       subsys->name);
+ 		mutex_unlock(&cgroup_mutex);
+ 		return 0;
+ 	}
+diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c
+new file mode 100644
+index 0000000..92c0f76
+--- /dev/null
++++ b/kernel/cgroup_lite.c
+@@ -0,0 +1,221 @@
++/*
++ * lite cgroups engine
++ */
++
++#include <linux/cgroup.h>
++#include <linux/seq_file.h>
++#include <linux/fs.h>
++#include <linux/ve.h>
++#include <linux/proc_fs.h>
++#include <linux/module.h>
++
++#define SUBSYS(_x) &_x ## _subsys,
++
++static struct cgroup_subsys *subsys[] = {
++#include <linux/cgroup_subsys.h>
++};
++
++static struct css_set init_css_set;
++static struct cgroup init_cgroup;
++static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT];
++
++static int init_css_set_subsystems(struct cgroup *g, struct css_set *set)
++{
++	int i;
++	struct cgroup_subsys_state *ss;
++
++	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++		struct cgroup_subsys *cs = subsys[i];
++
++		ss = cs->create(cs, g);
++		if (IS_ERR(ss))
++			goto destroy;
++
++		g->subsys[i] = ss;
++		set->subsys[i] = ss;
++		atomic_set(&ss->refcnt, 0);
++		ss->cgroup = g;
++	}
++	return 0;
++
++destroy:
++	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++		struct cgroup_subsys *cs = subsys[i];
++
++		if (g->subsys[i])
++			cs->destroy(cs, g);
++	}
++	return PTR_ERR(ss);
++}
++
++int init_ve_cgroups(struct ve_struct *ve)
++{
++	int err = -ENOMEM;
++	struct cgroup *g;
++	struct css_set *cs;
++
++	g = kzalloc(sizeof(struct cgroup), GFP_KERNEL);
++	if (g == NULL)
++		goto err_galloc;
++
++	cs = kzalloc(sizeof(struct css_set), GFP_KERNEL);
++	if (cs == NULL)
++		goto err_calloc;
++
++	g->parent = &init_cgroup;
++	err = init_css_set_subsystems(g, cs);
++	if (err)
++		goto err_subsys;
++
++	ve->ve_cgroup = g;
++	ve->ve_css_set = cs;
++	return 0;
++
++err_subsys:
++	kfree(cs);
++err_calloc:
++	kfree(g);
++err_galloc:
++	return err;
++}
++EXPORT_SYMBOL(init_ve_cgroups);
++
++void fini_ve_cgroups(struct ve_struct *ve)
++{
++	int i;
++	struct cgroup *g = ve->ve_cgroup;
++	struct css_set *css = ve->ve_css_set;
++
++	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
++		struct cgroup_subsys *cs = subsys[i];
++		struct cgroup_subsys_state *ss = css->subsys[i];
++
++		BUG_ON(ss != g->subsys[i]);
++
++		if (cs->pre_destroy)
++			cs->pre_destroy(cs, g);
++
++		if (atomic_read(&ss->refcnt))
++			printk(KERN_ERR "CG: leaking %d/%s subsys\n",
++					ve->veid, subsys[i]->name);
++		else
++			cs->destroy(cs, g);
++	}
++
++	kfree(g);
++	kfree(css);
++	ve->ve_cgroup = NULL;
++	ve->ve_css_set = NULL;
++}
++EXPORT_SYMBOL(fini_ve_cgroups);
++
++/*
++ * task lifecycle
++ */
++
++void cgroup_fork(struct task_struct *child)
++{
++	child->cgroups = current->cgroups;
++}
++
++void cgroup_fork_callbacks(struct task_struct *child)
++{
++}
++
++void cgroup_post_fork(struct task_struct *child)
++{
++}
++
++void cgroup_exit(struct task_struct *tsk, int dummy)
++{
++	tsk->cgroups = &init_css_set;
++}
++
++int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
++{
++	return -ENODATA;
++}
++
++/*
++ * proc structs
++ */
++
++static int proc_cgroup_show(struct seq_file *m, void *v)
++{
++	struct task_struct *tsk;
++
++	tsk = pid_task((struct pid *)m->private, PIDTYPE_PID);
++	if (tsk == NULL)
++		return -ESRCH;
++	seq_printf(m, "%p\n", tsk->cgroups);
++	return 0;
++}
++
++static int cgroup_open(struct inode *inode, struct file *file)
++{
++	if (!capable(CAP_SYS_ADMIN))
++		return -EPERM;
++
++	return single_open(file, proc_cgroup_show, PROC_I(inode)->pid);
++}
++
++struct file_operations proc_cgroup_operations = {
++	.open		= cgroup_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= single_release,
++};
++
++/*
++ * cgroups misc struts
++ */
++
++int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
++		const struct cftype cft[], int count)
++{
++	int idx = subsys->subsys_id;
++	static DEFINE_SPINLOCK(add_files_lock);
++
++	if (unlikely(subsys_cftypes[idx] == NULL)) {
++		spin_lock(&add_files_lock);
++		if (subsys_cftypes[idx] == NULL)
++			subsys_cftypes[idx] = (struct cftype *)cft;
++		spin_unlock(&add_files_lock);
++	}
++
++	BUG_ON(subsys_cftypes[idx] != cft);
++	return 0;
++}
++
++void cgroup_lock(void)
++{
++}
++
++void cgroup_unlock(void)
++{
++}
++
++
++int cgroup_is_removed(const struct cgroup *cgrp)
++{
++	return 0;
++}
++
++int __init cgroup_init_early(void)
++{
++	int i;
++
++	init_task.cgroups = &init_css_set;
++	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
++		BUG_ON(subsys[i]->early_init);
++
++	return 0;
++}
++
++int __init cgroup_init(void)
++{
++	get_ve0()->ve_cgroup = &init_cgroup;
++	get_ve0()->ve_css_set = &init_css_set;
++	if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0)
++		panic("CG: Can't init initial set\n");
++	return 0;
++}
+diff --git a/kernel/compat.c b/kernel/compat.c
+index 32c254a..58506ef 100644
+--- a/kernel/compat.c
++++ b/kernel/compat.c
+@@ -22,6 +22,7 @@
+ #include <linux/security.h>
+ #include <linux/timex.h>
+ #include <linux/migrate.h>
++#include <linux/module.h>
+ #include <linux/posix-timers.h>
+ 
+ #include <asm/uaccess.h>
+@@ -40,7 +41,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
+ 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+ }
+ 
+-static long compat_nanosleep_restart(struct restart_block *restart)
++long compat_nanosleep_restart(struct restart_block *restart)
+ {
+ 	struct compat_timespec __user *rmtp;
+ 	struct timespec rmt;
+@@ -62,6 +63,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
+ 
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(compat_nanosleep_restart);
+ 
+ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
+ 				     struct compat_timespec __user *rmtp)
+diff --git a/kernel/cpt/Makefile b/kernel/cpt/Makefile
+new file mode 100644
+index 0000000..d97cc31
+--- /dev/null
++++ b/kernel/cpt/Makefile
+@@ -0,0 +1,53 @@
++#
++#
++#  kernel/cpt/Makefile
++#
++#  Copyright (C) 2000-2005  SWsoft
++#  All rights reserved.
++#
++#  Licensing governed by "linux/COPYING.SWsoft" file.
++
++obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o
++
++vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \
++	cpt_mm.o cpt_files.o cpt_kernel.o \
++	cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \
++	cpt_conntrack.o cpt_epoll.o
++
++vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \
++	rst_mm.o rst_files.o \
++	rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \
++	rst_conntrack.o rst_epoll.o
++
++ifeq ($(CONFIG_BEANCOUNTERS), y)
++vzcpt-objs += cpt_ubc.o
++vzrst-objs += rst_ubc.o
++endif
++
++ifeq ($(CONFIG_INOTIFY_USER), y)
++vzcpt-objs += cpt_inotify.o
++vzrst-objs += rst_inotify.o
++endif
++
++vzrst-objs += cpt_exports.o
++
++ifeq ($(CONFIG_VZ_CHECKPOINT), m)
++vzrst-objs += cpt_obj.o cpt_kernel.o
++endif
++
++ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y)
++vzcpt-objs += cpt_iterative.o
++vzrst-objs += rst_iterative.o
++endif
++
++ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y)
++vzcpt-objs += cpt_pagein.o
++vzrst-objs += rst_pagein.o
++endif
++
++ifeq ($(CONFIG_X86_64), y)
++vzcpt-objs += cpt_x8664.o
++ifeq ($(CONFIG_VZ_CHECKPOINT), m)
++vzrst-objs += cpt_x8664.o
++endif
++endif
+diff --git a/kernel/cpt/cpt_conntrack.c b/kernel/cpt/cpt_conntrack.c
+new file mode 100644
+index 0000000..19dcf32
+--- /dev/null
++++ b/kernel/cpt/cpt_conntrack.c
+@@ -0,0 +1,365 @@
++/*
++ *
++ *  kernel/cpt/cpt_conntrack.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/unistd.h>
++#include <linux/ve.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++#include <linux/icmp.h>
++#include <linux/ip.h>
++
++#if defined(CONFIG_VE_IPTABLES) && \
++    (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++
++#include <linux/netfilter.h>
++#include <linux/netfilter_ipv4/ip_conntrack.h>
++#include <linux/netfilter_ipv4/ip_nat.h>
++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
++#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++#include <linux/netfilter_ipv4/ip_conntrack_core.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++
++/* How does it work?
++ *
++ * Network is disabled, so new conntrack entries will not appear.
++ * However, some of them can disappear because of timeouts.
++ *
++ * So, we take the read_lock and collect all required information atomically,
++ * essentially creating parallel "refcount" structures that hold pointers.
++ * We delete conntrack timers as well, so the structures cannot disappear
++ * after releasing the lock. Now, after releasing lock we can dump everything
++ * safely. And on exit we restore timers to their original values.
++ *
++ * Note, this approach is not going to work in VE0.
++ */
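
A toy model of the pinning trick described above: del_timer() doubles as a try-lock, so whoever deletes the timer owns the entry until the timer is restored. Real timers and SMP are replaced by a flag here; the point is only the control flow:

#include <stdio.h>

struct entry {
	int timer_armed;	/* stands in for the conntrack timeout timer */
	int id;
};

/* del_timer() analogue: returns nonzero iff we deactivated the timer */
static int del_timer(struct entry *e)
{
	int was = e->timer_armed;
	e->timer_armed = 0;
	return was;
}

int main(void)
{
	struct entry tbl[3] = { {1, 1}, {0, 2}, {1, 3} };	/* #2 is dying */
	struct entry *pinned[3];
	int n = 0;

	/* collection pass: pin entries by stealing their timer */
	for (int i = 0; i < 3; i++) {
		if (!del_timer(&tbl[i]))
			continue;	/* timer already fired: entry is on its way out */
		pinned[n++] = &tbl[i];
	}

	/* now the pinned entries cannot disappear while we dump them */
	for (int i = 0; i < n; i++)
		printf("dumping entry %d\n", pinned[i]->id);	/* 1 and 3 */

	/* exit pass: restore the timers so entries can expire again */
	for (int i = 0; i < n; i++)
		pinned[i]->timer_armed = 1;
	return 0;
}
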
++
++struct ct_holder
++{
++	struct ct_holder *next;
++	struct ip_conntrack_tuple_hash *cth;
++	int index;
++};
++
++static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple)
++{
++	v->cpt_dst = tuple->dst.ip;
++	v->cpt_dstport = tuple->dst.u.all;
++	v->cpt_protonum = tuple->dst.protonum;
++	v->cpt_dir = tuple->dst.dir;
++
++	v->cpt_src = tuple->src.ip;
++	v->cpt_srcport = tuple->src.u.all;
++}
++
++static int dump_one_expect(struct cpt_ip_connexpect_image *v,
++			   struct ip_conntrack_expect *exp,
++			   int sibling, cpt_context_t *ctx)
++{
++	int err = 0;
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	encode_tuple(&v->cpt_tuple, &exp->tuple);
++	encode_tuple(&v->cpt_mask, &exp->mask);
++	v->cpt_sibling_conntrack = sibling;
++	v->cpt_flags = exp->flags;
++	v->cpt_seq = exp->id;
++	v->cpt_dir = 0;
++	v->cpt_manip_proto = 0;
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++	v->cpt_manip_proto = exp->saved_proto.all;
++	v->cpt_dir = exp->dir;
++#endif
++	v->cpt_timeout = 0;
++	if (exp->master->helper->timeout)
++		v->cpt_timeout = exp->timeout.expires - jiffies;
++	return err;
++}
++
++/* NOTE. We use one page to dump the list of expectations. In theory this
++ * may not be enough; in practice there is only one expectation per
++ * conntrack record. Moreover, since _ALL_ expectations are kept on one
++ * global list, which is searched for every incoming/outgoing packet, the
++ * system would be effectively dead if even a single conntrack had that
++ * many expectations. In short, I am not going to fix this.
++ */
++
++static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
++			    cpt_context_t *ctx)
++{
++	int err = 0;
++	unsigned long pg;
++	struct cpt_ip_connexpect_image *v;
++	struct ip_conntrack_expect *exp;
++
++	if (ct->expecting == 0)
++		return err;
++	if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
++		return -ENOBUFS;
++
++	pg = __get_free_page(GFP_KERNEL);
++	if (!pg)
++		return -ENOMEM;
++	v = (struct cpt_ip_connexpect_image *)pg;
++
++	read_lock_bh(&ip_conntrack_lock);
++	list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
++		int sibling;
++
++		if (exp->master != ct)
++			continue;
++
++		if (ct->helper == NULL) {
++			eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
++			err = -EINVAL;
++			break;
++		}
++
++		sibling = 0;
++#if 0
++		/* That's all? No need to calculate sibling? */
++		if (exp->sibling) {
++			struct ct_holder *c;
++			for (c = list; c; c = c->next) {
++				if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
++					sibling = c->index;
++					break;
++				}
++			}
++			/* NOTE: exp->sibling could be not "confirmed" and, hence,
++			 * out of hash table. We should just ignore such a sibling,
++			 * the connection is going to be retried, the packet
++			 * apparently was lost somewhere.
++			 */
++			if (sibling == 0)
++				dprintk_ctx("sibling conntrack is not found\n");
++		}
++#endif
++
++		/* If the expectation still does not have exp->sibling
++		 * and timer is not running, it is about to die on another
++		 * cpu. Skip it. */
++		if (!sibling &&
++		    ct->helper->timeout &&
++		    !timer_pending(&exp->timeout)) {
++			dprintk_ctx("conntrack: expectation: no timer\n");
++			continue;
++		}
++
++		err = dump_one_expect(v, exp, sibling, ctx);
++		if (err)
++			break;
++
++		v++;
++	}
++	read_unlock_bh(&ip_conntrack_lock);
++
++	if (err == 0 && (unsigned long)v != pg)
++		ctx->write((void*)pg, (unsigned long)v - pg, ctx);
++
++	free_page(pg);
++	return err;
++}
++
++static int dump_one_ct(struct ct_holder *c, struct ct_holder *list,
++		       cpt_context_t *ctx)
++{
++	struct ip_conntrack_tuple_hash *h = c->cth;
++	struct ip_conntrack *ct = tuplehash_to_ctrack(h);
++	struct cpt_ip_conntrack_image v;
++	int err = 0;
++
++	if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) {
++		eprintk_ctx("conntrack module ct->proto version mismatch\n");
++		return -EINVAL;
++	}
++
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NET_CONNTRACK;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_ARRAY;
++
++	read_lock_bh(&ip_conntrack_lock);
++	v.cpt_status = ct->status;
++	v.cpt_timeout = ct->timeout.expires - jiffies;
++	v.cpt_ct_helper = (ct->helper != NULL);
++	v.cpt_index = c->index;
++	v.cpt_id = ct->id;
++	v.cpt_mark = 0;
++#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
++	v.cpt_mark = ct->mark;
++#endif
++	encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple);
++	encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple);
++	memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data));
++	memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data));
++
++	v.cpt_masq_index = 0;
++	v.cpt_initialized = 0;
++	v.cpt_num_manips = 0;
++	v.cpt_nat_helper = 0;
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
++	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
++	v.cpt_masq_index = ct->nat.masq_index;
++#endif
++	/* "help" data is used by pptp, difficult to support */
++	v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos;
++	v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before;
++	v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after;
++	v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos;
++	v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before;
++	v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after;
++#endif
++	read_unlock_bh(&ip_conntrack_lock);
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	err = dump_expect_list(ct, list, ctx);
++
++	cpt_close_object(ctx);
++	return err;
++}
++
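++/*
++ * Dump all conntracks of the current VE. Holder entries are
++ * preallocated so that nothing is allocated under ip_conntrack_lock;
++ * del_timer() pins each entry while it is dumped, and the timers are
++ * re-armed in the "done:" path below.
++ */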
++int cpt_dump_ip_conntrack(cpt_context_t * ctx)
++{
++	struct ct_holder *ct_list = NULL;
++	struct ct_holder *c, **cp;
++	int err = 0;
++	int index = 0;
++	int idx;
++
++	if (get_exec_env()->_ip_conntrack == NULL)
++		return 0;
++
++	for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) {
++		c = kzalloc(sizeof(struct ct_holder), GFP_KERNEL);
++		if (c == NULL) {
++			err = -ENOMEM;
++			goto done;
++		}
++		c->next = ct_list;
++		ct_list = c;
++	}
++
++	c = ct_list;
++
++	read_lock_bh(&ip_conntrack_lock);
++	for (idx = 0; idx < ip_conntrack_htable_size; idx++) {
++		struct ip_conntrack_tuple_hash *h;
++		list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) {
++			/* Skip reply tuples; they are covered by the
++			 * original direction. */
++			if (DIRECTION(h))
++				continue;
++
++			/* Oops, we do not have enough holders...
++			 * This should be impossible. */
++			if (unlikely(c == NULL)) {
++				read_unlock_bh(&ip_conntrack_lock);
++				eprintk_ctx("unexpected conntrack appeared\n");
++				err = -ENOMEM;
++				goto done;
++			}
++
++			/* If the timer is not pending, its handler has just
++			 * been scheduled on another cpu. Skip this conntrack:
++			 * it is about to be destroyed. */
++			if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) {
++				dprintk_ctx("conntrack: no timer\n");
++				continue;
++			}
++
++			/* The timer is deleted, but the refcnt is _not_
++			 * decreased. We restore the timer on exit from
++			 * this function. */
++			c->cth = h;
++			c->index = ++index;
++			c = c->next;
++		}
++	}
++	read_unlock_bh(&ip_conntrack_lock);
++
++	/* No conntracks? Good. */
++	if (index == 0)
++		goto done;
++
++	/* Comb the list a little. */
++	cp = &ct_list;
++	while ((c = *cp) != NULL) {
++		/* Discard unused entries; they can appear if some
++		 * conntracks timed out after we preallocated the list.
++		 */
++		if (c->cth == NULL) {
++			*cp = c->next;
++			kfree(c);
++			continue;
++		}
++
++		/* Move conntracks attached to expectations to the beginning
++		 * of the list. */
++		if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) {
++			*cp = c->next;
++			c->next = ct_list;
++			ct_list = c;
++			dprintk_ctx("conntrack: %d moved in list\n", c->index);
++			continue;
++		}
++		cp = &c->next;
++	}
++
++	cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK);
++
++	for (c = ct_list; c; c = c->next) {
++		err = dump_one_ct(c, ct_list, ctx);
++		if (err)
++			goto done;
++	}
++
++	cpt_close_section(ctx);
++
++done:
++	while ((c = ct_list) != NULL) {
++		ct_list = c->next;
++		if (c->cth) {
++			/* Restore timer. refcnt is preserved. */
++			add_timer(&tuplehash_to_ctrack(c->cth)->timeout);
++		}
++		kfree(c);
++	}
++	return err;
++}
++
++#endif
+diff --git a/kernel/cpt/cpt_context.c b/kernel/cpt/cpt_context.c
+new file mode 100644
+index 0000000..58a8069
+--- /dev/null
++++ b/kernel/cpt/cpt_context.c
+@@ -0,0 +1,257 @@
++/*
++ *
++ *  kernel/cpt/cpt_context.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++
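++/*
++ * All image output funnels through these helpers: set_fs(KERNEL_DS)
++ * lets f_op->write() accept a kernel-space buffer, and the first
++ * failure is latched in ctx->write_error for cpt_close_dumpfile()
++ * to report.
++ */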
++static void file_write(const void *addr, size_t count, struct cpt_context *ctx)
++{
++	mm_segment_t oldfs;
++	ssize_t err = -EBADF;
++	struct file *file = ctx->file;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if (file)
++		err = file->f_op->write(file, addr, count, &file->f_pos);
++	set_fs(oldfs);
++	if (err != count && !ctx->write_error)
++		ctx->write_error = err < 0 ? err : -EIO;
++}
++
++static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
++{
++	mm_segment_t oldfs;
++	ssize_t err = -EBADF;
++	struct file *file = ctx->file;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if (file)
++		err = file->f_op->write(file, addr, count, &pos);
++	set_fs(oldfs);
++	if (err != count && !ctx->write_error)
++		ctx->write_error = err < 0 ? err : -EIO;
++}
++
++static void file_align(struct cpt_context *ctx)
++{
++	struct file *file = ctx->file;
++
++	if (file)
++		file->f_pos = CPT_ALIGN(file->f_pos);
++}
++
++void cpt_context_init(struct cpt_context *ctx)
++{
++	int i;
++
++	memset(ctx, 0, sizeof(*ctx));
++
++	init_MUTEX(&ctx->main_sem);
++	ctx->refcount = 1;
++
++	ctx->current_section = -1;
++	ctx->current_object = -1;
++	ctx->pagesize = PAGE_SIZE;
++	ctx->write = file_write;
++	ctx->pwrite = file_pwrite;
++	ctx->align = file_align;
++	for (i=0; i < CPT_SECT_MAX; i++)
++		ctx->sections[i] = CPT_NULL;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	init_completion(&ctx->pgin_notify);
++#endif
++	cpt_object_init(ctx);
++}
++
++int cpt_open_dumpfile(struct cpt_context *ctx)
++{
++	ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
++	if (ctx->tmpbuf == NULL)
++		return -ENOMEM;
++	__cpt_release_buf(ctx);
++	return 0;
++}
++
++int cpt_close_dumpfile(struct cpt_context *ctx)
++{
++	if (ctx->file) {
++		fput(ctx->file);
++		ctx->file = NULL;
++	}
++	if (ctx->tmpbuf) {
++		free_page((unsigned long)ctx->tmpbuf);
++		ctx->tmpbuf = NULL;
++	}
++	if (ctx->write_error)
++		eprintk_ctx("error while writing dump file: %d\n", ctx->write_error);
++	return ctx->write_error;
++}
++
++int cpt_major_hdr_out(struct cpt_context *ctx)
++{
++	struct cpt_major_hdr hdr;
++
++	if (ctx->file == NULL)
++		return 0;
++
++	memset(&hdr, 0, sizeof(hdr));
++	hdr.cpt_signature[0] = CPT_SIGNATURE0;
++	hdr.cpt_signature[1] = CPT_SIGNATURE1;
++	hdr.cpt_signature[2] = CPT_SIGNATURE2;
++	hdr.cpt_signature[3] = CPT_SIGNATURE3;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_image_version = CPT_VERSION_20;
++#ifdef CONFIG_X86_64
++	hdr.cpt_os_arch = CPT_OS_ARCH_EMT64;
++#elif defined(CONFIG_X86_32)
++	hdr.cpt_os_arch = CPT_OS_ARCH_I386;
++#elif defined(CONFIG_IA64)
++	hdr.cpt_os_arch = CPT_OS_ARCH_IA64;
++#else
++#error	Arch is not supported
++#endif
++	hdr.cpt_ve_features = (__u32)ctx->features;
++	hdr.cpt_ve_features2 = (__u32)(ctx->features>>32);
++	hdr.cpt_pagesize = (__u16)PAGE_SIZE;
++	hdr.cpt_hz = HZ;
++	hdr.cpt_start_jiffies64 = ctx->virt_jiffies64;
++	hdr.cpt_start_sec = ctx->start_time.tv_sec;
++	hdr.cpt_start_nsec = ctx->start_time.tv_nsec;
++	hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags;
++	hdr.cpt_kernel_config[0] = ctx->kernel_config_flags;
++	hdr.cpt_iptables_mask = ctx->iptables_mask;
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	return 0;
++}
++
++int cpt_close_section(struct cpt_context *ctx)
++{
++	if (ctx->file && ctx->current_section >= 0) {
++		__u64 next = ctx->file->f_pos - ctx->current_section;
++		ctx->pwrite(&next, 8, ctx, ctx->current_section);
++		ctx->current_section = -1;
++	}
++	return 0;
++}
++EXPORT_SYMBOL(cpt_close_section);
++
++int cpt_open_section(struct cpt_context *ctx, __u32 type)
++{
++	struct cpt_section_hdr hdr;
++
++	if (ctx->file == NULL)
++		return 0;
++
++	cpt_close_section(ctx);
++
++	ctx->current_section = ctx->file->f_pos;
++	ctx->sections[type] = ctx->current_section;
++
++	hdr.cpt_next = 0;
++	hdr.cpt_section = type;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_align = 0;
++	ctx->write(&hdr, sizeof(hdr), ctx);
++
++	return 0;
++}
++EXPORT_SYMBOL(cpt_open_section);
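++
++/*
++ * A typical dump routine brackets its payload the way cpt_dump_veinfo()
++ * in cpt_dump.c does ("img" below standing for whatever image struct
++ * is being written):
++ *
++ *	cpt_open_section(ctx, CPT_SECT_VEINFO);
++ *	cpt_open_object(NULL, ctx);
++ *	ctx->write(&img, sizeof(img), ctx);
++ *	cpt_close_object(ctx);
++ *	cpt_close_section(ctx);
++ *
++ * cpt_close_section() then backpatches the section header's cpt_next
++ * field with the section length via ctx->pwrite().
++ */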
++
++
++int cpt_close_object(struct cpt_context *ctx)
++{
++	if (ctx->file && ctx->current_object >= 0) {
++		__u64 next = ctx->file->f_pos - ctx->current_object;
++		ctx->pwrite(&next, 8, ctx, ctx->current_object);
++		ctx->current_object = -1;
++	}
++	return 0;
++}
++EXPORT_SYMBOL(cpt_close_object);
++
++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	if (ctx->file == NULL)
++		return 0;
++
++	cpt_close_object(ctx);
++
++	ctx->current_object = ctx->file->f_pos;
++	if (obj)
++		cpt_obj_setpos(obj, ctx->current_object, ctx);
++
++	return 0;
++}
++EXPORT_SYMBOL(cpt_open_object);
++
++int cpt_push_object(loff_t *saved, struct cpt_context *ctx)
++{
++	if (ctx->file) {
++		*saved = ctx->current_object;
++		ctx->current_object = ctx->file->f_pos;
++	}
++	return 0;
++}
++EXPORT_SYMBOL(cpt_push_object);
++
++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx)
++{
++	ctx->current_object = *saved;
++	return 0;
++}
++EXPORT_SYMBOL(cpt_pop_object);
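++
++/*
++ * cpt_push_object()/cpt_pop_object() let a dump routine emit a nested
++ * object without losing its place in the enclosing one; see
++ * cpt_dump_epolldev() in cpt_epoll.c for the pattern:
++ *
++ *	cpt_push_object(&saved_obj, ctx);
++ *	cpt_open_object(NULL, ctx);
++ *	ctx->write(&efi, sizeof(efi), ctx);
++ *	cpt_close_object(ctx);
++ *	cpt_pop_object(&saved_obj, ctx);
++ */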
++
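++/*
++ * The trailer repeats the image signature and records the offset of
++ * every section, so the restore side can locate a section directly
++ * instead of scanning the whole image.
++ */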
++int cpt_dump_tail(struct cpt_context *ctx)
++{
++	struct cpt_major_tail hdr;
++	int i;
++
++	if (ctx->file == NULL)
++		return 0;
++
++	cpt_open_section(ctx, CPT_SECT_TRAILER);
++	memset(&hdr, 0, sizeof(hdr));
++	hdr.cpt_next = sizeof(hdr);
++	hdr.cpt_object = CPT_OBJ_TRAILER;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_content = CPT_CONTENT_VOID;
++	hdr.cpt_lazypages = 0;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	hdr.cpt_lazypages = ctx->lazypages;
++#endif
++	hdr.cpt_64bit = ctx->tasks64;
++	hdr.cpt_signature[0] = CPT_SIGNATURE0;
++	hdr.cpt_signature[1] = CPT_SIGNATURE1;
++	hdr.cpt_signature[2] = CPT_SIGNATURE2;
++	hdr.cpt_signature[3] = CPT_SIGNATURE3;
++	hdr.cpt_nsect = CPT_SECT_MAX_INDEX;
++	for (i = 0; i < CPT_SECT_MAX_INDEX; i++)
++		hdr.cpt_sections[i] = ctx->sections[i];
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	cpt_close_section(ctx);
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_context.h b/kernel/cpt/cpt_context.h
+new file mode 100644
+index 0000000..e4f82f9
+--- /dev/null
++++ b/kernel/cpt/cpt_context.h
+@@ -0,0 +1,215 @@
++#include <linux/fs.h>
++#include <asm/uaccess.h>
++#include <bc/beancounter.h>
++
++#define	CPT_CTX_ERROR		-1
++#define	CPT_CTX_IDLE		0
++#define CPT_CTX_SUSPENDING	1
++#define	CPT_CTX_SUSPENDED	2
++#define CPT_CTX_DUMPING		3
++#define CPT_CTX_UNDUMPING	4
++#define CPT_CTX_UNDUMPED	5
++
++#define CPT_TID(tsk)   task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm
++#define CPT_FID		"%d,%d(%s)"
++
++
++typedef struct cpt_context
++{
++	struct list_head ctx_list;
++	int	refcount;
++	int	ctx_state;
++	int	objcount;
++	int	sticky;
++	struct semaphore main_sem;
++
++	struct file *errorfile;
++	struct file *statusfile;
++	struct file *lockfile;
++
++	int	errno;
++	char	*error_msg;
++	loff_t	err_offset;
++
++	struct file	*file;
++	char		*tmpbuf;
++	int		pagesize;
++#ifdef CONFIG_VZ_CHECKPOINT_ITER
++	int		iter_done;
++	void		*iter_dir;
++	struct user_beancounter *iter_ub;
++#endif
++	loff_t		current_section;
++	loff_t		current_object;
++
++	loff_t		sections[CPT_SECT_MAX];
++
++	__u32		errormask;
++	__u32		write_error;
++
++	struct list_head object_array[CPT_OBJ_MAX];
++
++	void		(*write)(const void *addr, size_t count, struct cpt_context *ctx);
++	void		(*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
++	ssize_t		(*read)(void *addr, size_t count, struct cpt_context *ctx);
++	ssize_t		(*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
++	void		(*align)(struct cpt_context *ctx);
++	int		ve_id;
++	int		contextid;
++	struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst
++					     * corresponding to start_time */
++	__u64		virt_jiffies64;	/* Virtual jiffies64. It is == cpt_jiffies64 when
++					 * the VE has not migrated. */
++	struct timespec	start_time;
++	struct timespec delta_time;
++	__s64		delta_nsec;
++	int		image_version;
++	__u16		image_arch;
++	__u64		iptables_mask;
++	__u64		features;
++
++#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9)
++#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS)
++	struct hlist_head *anonvmas;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	struct file	*pagein_file_in;
++	struct file	*pagein_file_out;
++	int		lazy_vm;
++	int		lazypages;
++	int		lazytype;
++	struct task_struct	*pgin_task;
++	unsigned long	last_pagein;
++	struct pagein_desc	**pgin_dir;
++	struct pgin_device	*pagein_dev;
++	struct completion	pgin_notify;
++	struct completion	*pgind_completion;
++	struct swap_info_struct	*pgin_swp;
++#endif
++	int		tasks64;
++	__u32		src_cpu_flags;
++	__u32		dst_cpu_flags;
++	__u32		kernel_config_flags;
++
++	__u32		last_vpid;
++
++	struct filejob  *filejob_queue;
++
++	int		slm_count;
++
++	char		*vdso;
++
++#ifdef CONFIG_BEANCOUNTERS
++	/* UBC limits and barriers are stored here during undumping
++	   and restored before resuming */
++	struct ubparm	saved_ubc[UB_RESOURCES];
++#endif
++} cpt_context_t;
++
++typedef struct {
++	int pid;
++	cpt_context_t *ctx;
++	struct completion done;
++} pagein_info_t;
++
++int pagein_info_printf(char *buf, cpt_context_t *ctx);
++
++int cpt_open_dumpfile(struct cpt_context *);
++int cpt_close_dumpfile(struct cpt_context *);
++int rst_open_dumpfile(struct cpt_context *);
++void rst_close_dumpfile(struct cpt_context *);
++void cpt_context_init(struct cpt_context *);
++void rst_context_init(struct cpt_context *);
++void cpt_context_destroy(struct cpt_context *);
++
++void rst_report_error(int err, cpt_context_t *ctx);
++
++
++int cpt_major_hdr_out(struct cpt_context *ctx);
++int cpt_dump_tail(struct cpt_context *ctx);
++int cpt_close_section(struct cpt_context *ctx);
++int cpt_open_section(struct cpt_context *ctx, __u32 type);
++int cpt_close_object(struct cpt_context *ctx);
++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx);
++int cpt_push_object(loff_t *saved, struct cpt_context *ctx);
++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx);
++
++int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *);
++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx);
++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx);
++void rst_put_name(__u8 *name, struct cpt_context *ctx);
++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx);
++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx);
++
++pid_t vpid_to_pid(pid_t);
++
++#define rst_get_object(type, pos, tmp, ctx) \
++ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx))
++
++extern int debug_level;
++
++#define cpt_printk(lvl, fmt, args...)	do {	\
++		if (lvl <= debug_level)		\
++			printk(fmt, ##args);	\
++	} while (0)
++
++#define dprintk(a...) cpt_printk(3, "CPT DBG: " a)
++#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
++
++#define wprintk(a...) cpt_printk(2, "CPT WRN: " a)
++#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
++
++#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
++#define eprintk_ctx(f, arg...)						\
++do {									\
++	eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg);			\
++	if (ctx->error_msg && ctx->err_offset < PAGE_SIZE)		\
++		ctx->err_offset += snprintf((char*)(ctx->error_msg +	\
++				ctx->err_offset),			\
++			       	PAGE_SIZE - ctx->err_offset,		\
++				"Error: " f, ##arg);			\
++} while(0)
++
++#define CPT_TMPBUF_FREE 0x789adf12
++#define CPT_TMPBUF_BUSY 0xabcd9876
++
++static inline void *cpt_get_buf(cpt_context_t *ctx)
++{
++	void *buf = ctx->tmpbuf;
++
++	BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE);
++	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY;
++	return buf;
++}
++
++static inline void __cpt_release_buf(cpt_context_t *ctx)
++{
++	void *buf = ctx->tmpbuf;
++
++	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
++}
++
++static inline void cpt_release_buf(cpt_context_t *ctx)
++{
++	void *buf = ctx->tmpbuf;
++
++	BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY);
++	*(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
++}
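++
++/*
++ * The guard word in the last 4 bytes of the one-page tmpbuf catches
++ * nested or unbalanced buffer use via the BUG_ON checks above. The
++ * intended pairing, as in cpt_dump_veinfo():
++ *
++ *	struct cpt_veinfo_image *i = cpt_get_buf(ctx);
++ *	...fill *i and ctx->write() it...
++ *	cpt_release_buf(ctx);
++ */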
++
++static inline void cpt_flush_error(cpt_context_t *ctx)
++{
++	mm_segment_t oldfs;
++
++	if (ctx->errorfile && ctx->error_msg && ctx->err_offset) {
++		if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) {
++			oldfs = get_fs();
++			set_fs(KERNEL_DS);
++			ctx->errorfile->f_op->write(ctx->errorfile,
++				ctx->error_msg, ctx->err_offset,
++				&ctx->errorfile->f_pos);
++			set_fs(oldfs);
++		}
++		ctx->error_msg[0] = 0;
++		ctx->err_offset = 0;
++	}
++}
+diff --git a/kernel/cpt/cpt_dump.c b/kernel/cpt/cpt_dump.c
+new file mode 100644
+index 0000000..de2364b
+--- /dev/null
++++ b/kernel/cpt/cpt_dump.c
+@@ -0,0 +1,1247 @@
++/*
++ *
++ *  kernel/cpt/cpt_dump.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/ptrace.h>
++#include <linux/smp_lock.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/virtinfo.h>
++#include <linux/virtinfoscp.h>
++#include <bc/task.h>
++#include <linux/cpt_image.h>
++#include <linux/nsproxy.h>
++#include <linux/mnt_namespace.h>
++#include <linux/netdevice.h>
++#include <linux/nfcalls.h>
++#include <linux/dcache.h>
++#include <linux/if_tun.h>
++#include <linux/utsname.h>
++#include <linux/pid_namespace.h>
++#include <linux/ipc_namespace.h>
++#include <linux/netdevice.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_net.h"
++#include "cpt_socket.h"
++#include "cpt_ubc.h"
++#include "cpt_kernel.h"
++
++
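++/*
++ * Return the depth of task @c below @root inside the same VE, walking
++ * up through group leaders and parents; -1 if @root is not reached
++ * before leaving the VE.
++ */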
++static int vps_child_level(struct task_struct *root, struct task_struct *c)
++{
++	int level = 0;
++	int veid = VE_TASK_INFO(c)->owner_env->veid;
++
++	while (VE_TASK_INFO(c)->owner_env->veid == veid) {
++		if (c->pid != c->tgid)
++			c = c->group_leader;
++		if (c == root)
++			return level;
++
++		c = c->parent;
++		level++;
++	}
++	return -1;
++}
++
++static inline int freezable(struct task_struct * p)
++{
++	if (p->exit_state)
++		return 0;
++
++	switch (p->state) {
++	case EXIT_ZOMBIE:
++	case EXIT_DEAD:
++	case TASK_STOPPED:
++#if TASK_TRACED != TASK_STOPPED
++	case TASK_TRACED:
++#endif
++		return 0;
++	default:
++		return 1;
++	}
++}
++
++static void wake_ve(cpt_context_t *ctx)
++{
++	struct task_struct *p, *g;
++
++	do_each_thread_ve(g, p) {
++		spin_lock_irq(&p->sighand->siglock);
++		if (p->flags & PF_FROZEN) {
++			p->flags &= ~PF_FROZEN;
++			wake_up_process(p);
++		}
++		spin_unlock_irq(&p->sighand->siglock);
++	} while_each_thread_ve(g, p);
++}
++
++/*
++ * Some comment is necessary about PF_FREEZE, PF_FROZEN, TIF_FREEZE...
++ *
++ * SWSUSP uses the PF_FREEZE flag in tsk->flags, raising it in the
++ * context of another process. Apparently, this is unacceptable on SMP.
++ * Take freeze_processes() in kernel/power/process.c as an example.
++ * Unserialized modification of tsk->flags easily
++ * (believe it or not, it happens with a probability of almost 100% :-))
++ * creates the situation where setting PF_FREEZE in freeze_processes(),
++ * which quickly spins raising PF_FREEZE on all the processes,
++ * _clears_ the PF_FROZEN just set in refrigerator(), so that suspend
++ * deadlocks.
++ *
++ * So, to make things clean, we require that these flags may be modified
++ * only under tsk->sighand->siglock, which is quite natural because
++ * PF_FREEZE is just a kind of signal.
++ *
++ * This is not enough, because we are still not allowed to change
++ * tsk->flags in the context of another process: we could corrupt other
++ * flags when the process running on another cpu modifies them. So, we
++ * use TIF_FREEZE in the thread flags, which can be changed atomically.
++ *
++ * PF_FROZEN also changes in the context of another process, but this
++ * happens only when the process is already in refrigerator(), which
++ * does not modify tsk->flags.
++ */
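++
++/*
++ * Concretely, the only pattern used below (in vps_stop_tasks()) to
++ * request freezing is:
++ *
++ *	spin_lock_irq(&p->sighand->siglock);
++ *	if (!(p->flags & PF_FROZEN)) {
++ *		set_tsk_thread_flag(p, TIF_FREEZE);
++ *		signal_wake_up(p, 0);
++ *	}
++ *	spin_unlock_irq(&p->sighand->siglock);
++ */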
++
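++/*
++ * A struct pid with level 0 lives in the initial pid namespace, i.e.
++ * the task (or its pgrp/session) is also visible from the host (the
++ * typical leftover of vzctl enter/exec), which blocks checkpointing.
++ */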
++static int check_process_external(struct task_struct *p)
++{
++	if (pid_alive(p)) {
++		if (p->pids[PIDTYPE_PID].pid->level == 0)
++			return PIDTYPE_PID;
++		if (p->pids[PIDTYPE_PGID].pid->level == 0)
++			return PIDTYPE_PGID;
++		if (p->pids[PIDTYPE_SID].pid->level == 0)
++			return PIDTYPE_SID;
++	}
++
++	return PIDTYPE_MAX;
++}
++
++enum
++{
++	OBSTACLE_NOGO = -1,
++	OBSTACLE_TIMEOUT = -2,
++	OBSTACLE_TRYAGAIN = -3,
++};
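++
++/*
++ * In the freeze loop below, a positive "todo" counts tasks still
++ * running in the current pass, while these negative values select the
++ * abort/retry path once the tasklist scan finishes.
++ */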
++
++#define SUSPEND_TIMEOUT	(10UL*HZ)
++
++static int vps_stop_tasks(struct cpt_context *ctx)
++{
++	unsigned long start_time = jiffies;
++	unsigned long target, timeout;
++	struct task_struct *p, *g;
++	int todo;
++	int round = 0;
++
++	do_gettimespec(&ctx->start_time); 
++	do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
++	ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup;
++
++	read_lock(&tasklist_lock);
++
++	atomic_inc(&get_exec_env()->suspend);
++	timeout = HZ/5;
++	target = jiffies + timeout;
++
++	for(;;) {
++		struct task_struct *root;
++		todo = 0;
++
++		root = find_task_by_vpid(1);
++		if (!root) {
++			read_unlock(&tasklist_lock);
++			eprintk_ctx("cannot find ve init\n");
++			atomic_dec(&get_exec_env()->suspend);
++			return -ESRCH;
++		}
++
++		do_each_thread_ve(g, p) {
++			if (vps_child_level(root, p) >= 0) {
++				switch (check_process_external(p)) {
++				case PIDTYPE_PID:
++					eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
++							task_pid_vnr(p), p->pid, p->comm);
++					todo = OBSTACLE_NOGO;
++					goto out;
++				case PIDTYPE_PGID:
++					eprintk_ctx("external process group %d/%d(%s) inside CT "
++							"(e.g. vzctl enter or vzctl exec).\n",
++							task_pgrp_vnr(p), p->pid, p->comm);
++					todo = OBSTACLE_NOGO;
++					goto out;
++				case PIDTYPE_SID:
++					eprintk_ctx("external process session %d/%d(%s) inside CT "
++							"(e.g. vzctl enter or vzctl exec).\n",
++							task_session_vnr(p), p->pid, p->comm);
++					todo = OBSTACLE_NOGO;
++					goto out;
++				}
++				if (p->vfork_done) {
++					/* A task between vfork()...exec()
++					 * cannot be frozen, because its
++					 * parent waits in uninterruptible
++					 * state. So, we do nothing and
++					 * wait for exec(), unless:
++					 */
++					if (p->state == TASK_STOPPED ||
++					    p->state == TASK_TRACED) {
++						eprintk_ctx("task " CPT_FID " is stopped while vfork(). "
++								"Checkpointing is impossible.\n",
++								CPT_TID(p));
++						todo = OBSTACLE_NOGO;
++						/* This is fatal: the _user_
++						 * stopped a vfork()ing task,
++						 * so we cannot suspend now.
++						 */
++					} else {
++						todo = OBSTACLE_TRYAGAIN;
++					}
++					goto out;
++				}
++				if (p->signal->group_exit_task &&
++				    p->signal->notify_count) {
++					/* exec() waits for threads' death */
++					wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p));
++					todo = OBSTACLE_TRYAGAIN;
++					goto out;
++				}
++				if (p->state == TASK_TRACED
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
++				    && !p->stopped_state
++#endif
++				    ) {
++					int ptrace_id = p->pn_state;
++					/* Debugger waits for signal. */
++					switch (ptrace_id) {
++					case PN_STOP_TF:
++					case PN_STOP_TF_RT:
++					case PN_STOP_ENTRY:
++					case PN_STOP_FORK:
++					case PN_STOP_VFORK:
++					case PN_STOP_SIGNAL:
++					case PN_STOP_EXIT:
++					case PN_STOP_LEAVE:
++						break;
++					default:
++						eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id);
++						todo = OBSTACLE_NOGO;
++						goto out;
++					}
++				}
++#ifdef CONFIG_UTRACE
++				if (check_utrace(p, root, ctx)) {
++					eprintk_ctx("task " CPT_FID " is utraced. Checkpointing is impossible.\n", CPT_TID(p));
++					todo = OBSTACLE_NOGO;
++					goto out;
++				}
++#endif
++				if (p->flags & PF_NOFREEZE) {
++					eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p));
++					todo = OBSTACLE_NOGO;
++					goto out;
++				}
++
++				if (!freezable(p))
++					continue;
++
++				spin_lock_irq(&p->sighand->siglock);
++				if (!(p->flags & PF_FROZEN)) {
++					set_tsk_thread_flag(p, TIF_FREEZE);
++					signal_wake_up(p, 0);
++				}
++				spin_unlock_irq(&p->sighand->siglock);
++
++				if (p->flags & PF_FROZEN) {
++					if (p->state != TASK_UNINTERRUPTIBLE)
++						printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p));
++					continue;
++				}
++
++				if (round == 10)
++					wprintk_ctx(CPT_FID " is running\n", CPT_TID(p));
++
++				todo++;
++			} else {
++				if (p != current) {
++					eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
++							task_pid_vnr(p), task_pid_nr(p), p->comm);
++					todo = OBSTACLE_NOGO;
++					goto out;
++				}
++			}
++		} while_each_thread_ve(g, p);
++
++		if (todo > 0) {
++			/* No visible obstacles, but the VE did not freeze
++			 * within the timeout. Interrupt suspend on a major
++			 * timeout or a signal; on a minor timeout wake the
++			 * VE and restart suspend.
++			 */
++			if (time_after(jiffies, start_time + SUSPEND_TIMEOUT)
++			    || signal_pending(current))
++				todo = OBSTACLE_TIMEOUT;
++			else if (time_after(jiffies, target))
++				todo = OBSTACLE_TRYAGAIN;
++		}
++
++out:
++		if (todo < 0) {
++			atomic_dec(&get_exec_env()->suspend);
++
++			wake_ve(ctx);
++
++#if 0
++			/* This is a sign of a printk() failure, which is
++			 * not ours. So, no prefixes. */
++			printk(">\n");
++#endif
++		}
++
++		read_unlock(&tasklist_lock);
++
++		if (!todo) {
++			atomic_dec(&get_exec_env()->suspend);
++			return 0;
++		}
++
++		switch (todo) {
++		case OBSTACLE_NOGO:
++			eprintk_ctx("suspend is impossible now.\n");
++			return -EAGAIN;
++
++		case OBSTACLE_TIMEOUT:
++			eprintk_ctx("interrupted or timed out.\n");
++			return -EINTR;
++
++		case OBSTACLE_TRYAGAIN:
++			if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) ||
++			    signal_pending(current)) {
++				wprintk_ctx("suspend timed out\n");
++				return -EAGAIN;
++			}
++
++			wprintk_ctx("minor suspend timeout (%lu) expired, "
++				    "trying again\n", timeout);
++
++			/* Try again. VE is awake, give it some time to run. */
++			current->state = TASK_INTERRUPTIBLE;
++			schedule_timeout(HZ);
++
++			/* After a short wait restart suspend
++			 * with longer timeout */
++			atomic_inc(&get_exec_env()->suspend);
++			timeout = min(timeout<<1, SUSPEND_TIMEOUT);
++			target = jiffies + timeout;
++			break;
++
++		default:
++			if (round > 0) {
++				/* The VE is partially frozen; give processes
++				 * a chance to enter refrigerator(). */
++				current->state = TASK_INTERRUPTIBLE;
++				schedule_timeout(HZ/20);
++			} else {
++				yield();
++			}
++		}
++
++		read_lock(&tasklist_lock);
++		round++;
++	}
++}
++
++static int cpt_unlock_ve(struct cpt_context *ctx)
++{
++	struct ve_struct *env;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++	down_write(&env->op_sem);
++	env->is_locked = 0;
++	up_write(&env->op_sem);
++	put_ve(env);
++	return 0;
++}
++
++int cpt_resume(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
++
++	cpt_unlock_sockets(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	if (ctx->pgin_task) {
++		wait_for_completion(&ctx->pgin_notify);
++		put_task_struct(ctx->pgin_task);
++		ctx->pgin_task = NULL;
++	}
++#endif
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++
++		spin_lock_irq(&tsk->sighand->siglock);
++		if (tsk->flags & PF_FROZEN) {
++			tsk->flags &= ~PF_FROZEN;
++			wake_up_process(tsk);
++		} else if (freezable(tsk)) {
++			eprintk_ctx("strange, %s not frozen\n", tsk->comm );
++		}
++		spin_unlock_irq(&tsk->sighand->siglock);
++		put_task_struct(tsk);
++	}
++
++	cpt_resume_network(ctx);
++
++	cpt_unlock_ve(ctx);
++
++	cpt_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++	return 0;
++}
++
++int cpt_kill(struct cpt_context *ctx)
++{
++	int err = 0;
++	struct ve_struct *env;
++	cpt_object_t *obj;
++	struct task_struct *root_task = NULL;
++	long delay;
++
++	if (!ctx->ve_id)
++		return -EINVAL;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++
++	/* from here on, cpt_kill() succeeds */
++	virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
++
++	if (current->ve_task_info.owner_env == env) {
++		wprintk_ctx("attempt to kill ve from inside, escaping...\n");
++		ve_move_task(current, get_ve0());
++	}
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	if (ctx->pgin_task) {
++		wait_for_completion(&ctx->pgin_notify);
++		put_task_struct(ctx->pgin_task);
++		ctx->pgin_task = NULL;
++	}
++#endif
++
++	cpt_kill_sockets(ctx);
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++
++		if (tsk->exit_state) {
++			put_task_struct(tsk);
++			continue;
++		}
++
++		if (task_pid_vnr(tsk) == 1) {
++			root_task = tsk;
++			continue;
++		}
++
++		tsk->robust_list = NULL;
++#ifdef CONFIG_COMPAT
++		tsk->compat_robust_list = NULL;
++#endif
++		tsk->clear_child_tid = NULL;
++
++		if (tsk->ptrace) {
++			write_lock_irq(&tasklist_lock);
++			tsk->ptrace = 0;
++			if (!list_empty(&tsk->ptrace_list)) {
++				list_del_init(&tsk->ptrace_list);
++				remove_parent(tsk);
++				tsk->parent = tsk->real_parent;
++				add_parent(tsk);
++			}
++			write_unlock_irq(&tasklist_lock);
++		}
++
++		send_sig(SIGKILL, tsk, 1);
++
++		spin_lock_irq(&tsk->sighand->siglock);
++		sigfillset(&tsk->blocked);
++		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++		set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++		if (tsk->flags & PF_FROZEN)
++			tsk->flags &= ~PF_FROZEN;
++		spin_unlock_irq(&tsk->sighand->siglock);
++
++		wake_up_process(tsk);
++		put_task_struct(tsk);
++	}
++
++	yield();
++
++	if (root_task != NULL) {
++		send_sig(SIGKILL, root_task, 1);
++
++		spin_lock_irq(&root_task->sighand->siglock);
++		sigfillset(&root_task->blocked);
++		sigdelsetmask(&root_task->blocked, sigmask(SIGKILL));
++		set_tsk_thread_flag(root_task, TIF_SIGPENDING);
++		clear_tsk_thread_flag(root_task, TIF_FREEZE);
++		if (root_task->flags & PF_FROZEN)
++			root_task->flags &= ~PF_FROZEN;
++		spin_unlock_irq(&root_task->sighand->siglock);
++
++		wake_up_process(root_task);
++		put_task_struct(root_task);
++	}
++
++	cpt_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++
++	delay = 1;
++	while (atomic_read(&env->counter) != 1) {
++		if (signal_pending(current))
++			break;
++		current->state = TASK_INTERRUPTIBLE;
++		delay = (delay < HZ) ? (delay << 1) : HZ;
++		schedule_timeout(delay);
++	}
++	put_ve(env);
++
++	return err;
++}
++
++#ifdef CONFIG_BEANCOUNTERS
++static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx)
++{
++	struct task_beancounter *tbc;
++
++	tbc = &(t->task_bc);
++	cpt_add_ubc(tbc->exec_ub, ctx);
++	cpt_add_ubc(tbc->task_ub, ctx);
++	cpt_add_ubc(tbc->fork_sub, ctx);
++}
++#else
++static inline void collect_task_ubc(struct task_struct *t,
++		struct cpt_context *ctx)
++{ }
++#endif
++
++static cpt_object_t * remember_task(struct task_struct * child,
++		cpt_object_t * head, cpt_context_t * ctx)
++{
++	cpt_object_t *cobj;
++
++	if (freezable(child) && !(child->flags&PF_FROZEN)) {
++		eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child));
++		put_task_struct(child);
++		return NULL;
++	}
++
++	BUG_ON(lookup_cpt_object(CPT_OBJ_TASK, child, ctx));
++	if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++		put_task_struct(child);
++		return NULL;
++	}
++	cobj->o_count = 1;
++	cpt_obj_setobj(cobj, child, ctx);
++	insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx);
++	collect_task_ubc(child, ctx);
++	return cobj;
++}
++
++static int vps_collect_tasks(struct cpt_context *ctx)
++{
++	int err = -ESRCH;
++	cpt_object_t *obj;
++	struct task_struct *root;
++	read_lock(&tasklist_lock);
++	root = find_task_by_vpid(1);
++	if (root)
++		get_task_struct(root);
++	read_unlock(&tasklist_lock);
++
++	if (!root) {
++		err = -ESRCH;
++		eprintk_ctx("vps_collect_tasks: cannot find root\n");
++		goto out;
++	}
++
++	if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++		put_task_struct(root);
++		return -ENOMEM;
++	}
++	obj->o_count = 1;
++	cpt_obj_setobj(obj, root, ctx);
++	intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
++	collect_task_ubc(root, ctx);
++
++	/* Collect process subtree recursively */
++	for_each_object(obj, CPT_OBJ_TASK) {
++		cpt_object_t *head = obj;
++		struct task_struct *tsk = obj->o_obj;
++		struct task_struct *child;
++
++		if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) {
++			eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk));
++			err = -EINVAL;
++			goto out;
++		}
++
++		if (tsk->state == TASK_RUNNING)
++			printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk));
++
++		wait_task_inactive(tsk);
++
++		err = check_task_state(tsk, ctx);
++		if (err)
++			goto out;
++
++		if (tsk->pid == tsk->tgid) {
++			child = tsk;
++			for (;;) {
++				read_lock(&tasklist_lock);
++				child = next_thread(child);
++				if (child != tsk)
++					get_task_struct(child);
++				read_unlock(&tasklist_lock);
++
++				if (child == tsk)
++					break;
++
++				if (child->parent != tsk->parent) {
++					put_task_struct(child);
++					eprintk_ctx("illegal thread structure, kernel bug\n");
++					err = -EINVAL;
++					goto out;
++				}
++
++				if ((head = remember_task(child, head, ctx)) == NULL) {
++					eprintk_ctx("task obj allocation failure\n");
++					err = -ENOMEM;
++					goto out;
++				}
++			}
++		}
++
++		/* About locking: the VE is frozen, but lists of children
++		 * may still change, at least for init, when an entered task
++		 * reparents to init and when a reparented task exits. If we
++		 * take care of this case, we can still unlock while scanning
++		 * the task lists.
++		 */
++		read_lock(&tasklist_lock);
++		list_for_each_entry(child, &tsk->children, sibling) {
++			if (child->parent != tsk)
++				continue;
++			if (child->pid != child->tgid)
++				continue;
++			get_task_struct(child);
++			read_unlock(&tasklist_lock);
++
++			if ((head = remember_task(child, head, ctx)) == NULL) {
++				eprintk_ctx("task obj allocation failure\n");
++				err = -ENOMEM;
++				goto out;
++			}
++
++			read_lock(&tasklist_lock);
++		}
++
++		list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) {
++			if (child->parent != tsk)
++				continue;
++			if (child->pid != child->tgid)
++				continue;
++			get_task_struct(child);
++			read_unlock(&tasklist_lock);
++
++			if ((head = remember_task(child, head, ctx)) == NULL) {
++				eprintk_ctx("task obj allocation failure\n");
++				err = -ENOMEM;
++				goto out;
++			}
++
++			read_lock(&tasklist_lock);
++		}
++		read_unlock(&tasklist_lock);
++	}
++
++	return 0;
++
++out:
++	while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) {
++		struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next;
++		cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
++		struct task_struct *tsk;
++
++		list_del(head);
++		tsk = obj->o_obj;
++		put_task_struct(tsk);
++		free_cpt_object(obj, ctx);
++	}
++	return err;
++}
++
++static int cpt_collect(struct cpt_context *ctx)
++{
++	int err;
++
++	if ((err = cpt_collect_mm(ctx)) != 0)
++		return err;
++
++	if ((err = cpt_collect_sysv(ctx)) != 0)
++		return err;
++
++	if ((err = cpt_collect_files(ctx)) != 0)
++		return err;
++
++	if ((err = cpt_collect_fs(ctx)) != 0)
++		return err;
++
++	if ((err = cpt_collect_namespace(ctx)) != 0)
++		return err;
++
++	if ((err = cpt_collect_signals(ctx)) != 0)
++		return err;
++
++	if (virtinfo_notifier_call(VITYPE_SCP,
++				VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL)
++		return -ECHRNG;
++
++	return 0;
++}
++
++static int cpt_dump_veinfo(cpt_context_t *ctx)
++{
++	struct cpt_veinfo_image *i = cpt_get_buf(ctx);
++	struct ve_struct *ve;
++	struct timespec delta;
++	struct ipc_namespace *ns;
++
++	cpt_open_section(ctx, CPT_SECT_VEINFO);
++	cpt_open_object(NULL, ctx);
++
++	memset(i, 0, sizeof(*i));
++
++	i->cpt_next = CPT_NULL;
++	i->cpt_object = CPT_OBJ_VEINFO;
++	i->cpt_hdrlen = sizeof(*i);
++	i->cpt_content = CPT_CONTENT_VOID;
++
++	ve = get_exec_env();
++	ns = ve->ve_ns->ipc_ns;
++
++	i->shm_ctl_all = ns->shm_ctlall;
++	if (ns->shm_ctlall > 0xFFFFFFFFU)
++		i->shm_ctl_all = 0xFFFFFFFFU;
++	i->shm_ctl_max = ns->shm_ctlmax;
++	if (ns->shm_ctlmax > 0xFFFFFFFFU)
++		i->shm_ctl_max = 0xFFFFFFFFU;
++	i->shm_ctl_mni = ns->shm_ctlmni;
++
++	i->msg_ctl_max = ns->msg_ctlmax;
++	i->msg_ctl_mni = ns->msg_ctlmni;
++	i->msg_ctl_mnb = ns->msg_ctlmnb;
++
++	BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
++	i->sem_ctl_arr[0] = ns->sem_ctls[0];
++	i->sem_ctl_arr[1] = ns->sem_ctls[1];
++	i->sem_ctl_arr[2] = ns->sem_ctls[2];
++	i->sem_ctl_arr[3] = ns->sem_ctls[3];
++
++	do_posix_clock_monotonic_gettime(&delta);
++	_set_normalized_timespec(&delta,
++			delta.tv_sec - ve->start_timespec.tv_sec,
++			delta.tv_nsec - ve->start_timespec.tv_nsec);
++	i->start_timespec_delta = cpt_timespec_export(&delta);
++	i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies;
++
++	i->last_pid = ve->ve_ns->pid_ns->last_pid;
++
++	ctx->write(i, sizeof(*i), ctx);
++	cpt_release_buf(ctx);
++	cpt_close_object(ctx);
++	cpt_close_section(ctx);
++	return 0;
++}
++
++static int cpt_dump_utsname(cpt_context_t *ctx)
++{
++	int len;
++	struct cpt_object_hdr o;
++	struct ve_struct *ve;
++	struct uts_namespace *ns;
++
++	cpt_open_section(ctx, CPT_SECT_UTSNAME);
++
++	ve = get_exec_env();
++	ns = ve->ve_ns->uts_ns;
++
++	cpt_open_object(NULL, ctx);
++	len = strlen(ns->name.nodename);
++	o.cpt_next = CPT_NULL;
++	o.cpt_object = CPT_OBJ_NAME;
++	o.cpt_hdrlen = sizeof(o);
++	o.cpt_content = CPT_CONTENT_NAME;
++
++	ctx->write(&o, sizeof(o), ctx);
++	ctx->write(ns->name.nodename, len+1, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++
++	cpt_open_object(NULL, ctx);
++	len = strlen(ns->name.domainname);
++ 	o.cpt_next = CPT_NULL;
++	o.cpt_object = CPT_OBJ_NAME;
++	o.cpt_hdrlen = sizeof(o);
++	o.cpt_content = CPT_CONTENT_NAME;
++
++	ctx->write(&o, sizeof(o), ctx);
++	ctx->write(ns->name.domainname, len+1, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++
++	cpt_close_section(ctx);
++	return 0;
++}
++
++#ifndef CONFIG_IA64
++static int cpt_dump_vsyscall(cpt_context_t *ctx)
++{
++	struct cpt_page_block *pgb = cpt_get_buf(ctx);
++
++	cpt_open_section(ctx, CPT_SECT_VSYSCALL);
++	cpt_open_object(NULL, ctx);
++
++	pgb->cpt_next = CPT_NULL;
++	pgb->cpt_object = CPT_OBJ_VSYSCALL;
++	pgb->cpt_hdrlen = sizeof(*pgb);
++	pgb->cpt_content = CPT_CONTENT_DATA;
++	pgb->cpt_start = cpt_ptr_export(vsyscall_addr);
++	pgb->cpt_end = pgb->cpt_start + PAGE_SIZE;
++
++	ctx->write(pgb, sizeof(*pgb), ctx);
++	cpt_release_buf(ctx);
++
++	ctx->write(vsyscall_addr, PAGE_SIZE, ctx);
++
++	cpt_close_object(ctx);
++	cpt_close_section(ctx);
++	return 0;
++}
++#endif
++
++int cpt_dump(struct cpt_context *ctx)
++{
++	struct ve_struct *oldenv, *env;
++	struct nsproxy *old_ns;
++	int err, err2 = 0;
++
++	if (!ctx->ve_id)
++		return -EINVAL;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++
++	down_read(&env->op_sem);
++	err = -ESRCH;
++	if (!env->is_running)
++		goto out_noenv;
++	if (!env->is_locked)
++		goto out_noenv;
++	err = -EINVAL;
++	if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) {
++		printk(KERN_WARNING "CT: checkpointing not supported yet"
++				" for hidden pid namespaces.\n");
++		goto out_noenv;
++	}
++
++	oldenv = set_exec_env(env);
++	old_ns = current->nsproxy;
++	current->nsproxy = env->ve_ns;
++
++	/* Phase 2: real checkpointing */
++	err = cpt_open_dumpfile(ctx);
++	if (err)
++		goto out;
++
++	cpt_major_hdr_out(ctx);
++
++	if (!err)
++		err = cpt_dump_veinfo(ctx);
++	if (!err)
++		err = cpt_dump_ubc(ctx);
++	if (!err)
++		err = cpt_dump_files(ctx);
++	if (!err)
++		err = cpt_dump_files_struct(ctx);
++	if (!err)
++		err = cpt_dump_fs_struct(ctx);
++	/* netdevices should be dumped after the open files, as we need to
++	   restore the netdevice binding to the /dev/net/tun file */
++	if (!err)
++		err = cpt_dump_ifinfo(ctx);
++	if (!err)
++		err = cpt_dump_namespace(ctx);
++	if (!err)
++		err = cpt_dump_sighand(ctx);
++	if (!err)
++		err = cpt_dump_vm(ctx);
++	if (!err)
++		err = cpt_dump_sysvsem(ctx);
++	if (!err)
++		err = cpt_dump_sysvmsg(ctx);
++	if (!err)
++		err = cpt_dump_tasks(ctx);
++	if (!err)
++		err = cpt_dump_orphaned_sockets(ctx);
++#if defined(CONFIG_VE_IPTABLES) && \
++    (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++	if (!err)
++		err = cpt_dump_ip_conntrack(ctx);
++#endif
++	if (!err) {
++		if (virtinfo_notifier_call(VITYPE_SCP,
++					VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL)
++			err = -ECHRNG;
++	}
++	if (!err)
++		err = cpt_dump_utsname(ctx);
++
++#ifndef CONFIG_IA64
++	if (!err)
++		err = cpt_dump_vsyscall(ctx);
++#endif
++
++	if (!err)
++		err = cpt_dump_tail(ctx);
++
++	err2 = cpt_close_dumpfile(ctx);
++
++out:
++	current->nsproxy = old_ns;
++	set_exec_env(oldenv);
++out_noenv:
++	up_read(&env->op_sem);
++	put_ve(env);
++	return err ? : err2;
++}
++
++int cpt_vps_suspend(struct cpt_context *ctx)
++{
++	struct ve_struct *oldenv, *env;
++	struct nsproxy *old_ns;
++	int err = 0;
++
++	ctx->kernel_config_flags = test_kernel_config();
++	cpt_object_init(ctx);
++
++	if (!ctx->ve_id) {
++		env = get_exec_env();
++		if (env == get_ve0())
++			return -EINVAL;
++		wprintk("undefined ve_id\n");
++		ctx->ve_id = env->veid;
++		get_ve(env);
++	} else {
++		env = get_ve_by_id(ctx->ve_id);
++		if (!env)
++			return -ESRCH;
++	}
++
++#ifdef CONFIG_VE_IPTABLES
++	ctx->iptables_mask = env->_iptables_modules;
++#endif
++	ctx->features = env->features;
++
++	down_write(&env->op_sem);
++	err = -ESRCH;
++	if (!env->is_running)
++		goto out_noenv;
++
++	err = -EBUSY;
++	if (env->is_locked)
++		goto out_noenv;
++	env->is_locked = 1;
++	downgrade_write(&env->op_sem);
++
++	oldenv = set_exec_env(env);
++	old_ns = current->nsproxy;
++	current->nsproxy = env->ve_ns;
++
++	/* Phase 0: find and stop all the tasks */
++	if ((err = vps_stop_tasks(ctx)) != 0)
++		goto out;
++
++	if ((err = cpt_suspend_network(ctx)) != 0)
++		goto out_wake;
++
++	/* At this point all the state is frozen. We do not need to lock
++	 * it, since it can change only while the tasks are running.
++	 */
++
++	/* Phase 1: collect task tree */
++	if ((err = vps_collect_tasks(ctx)) != 0)
++		goto out_wake;
++
++	/* Phase 1': collect all the resources */
++	if ((err = cpt_collect(ctx)) != 0)
++		goto out;
++
++out:
++	current->nsproxy = old_ns;
++	set_exec_env(oldenv);
++	up_read(&env->op_sem);
++	put_ve(env);
++	return err;
++
++out_noenv:
++	up_write(&env->op_sem);
++	put_ve(env);
++	return err;
++
++out_wake:
++	read_lock(&tasklist_lock);
++	wake_ve(ctx);
++	read_unlock(&tasklist_lock);
++	goto out;
++}
++
++static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps)
++{
++	struct net *net = get_exec_env()->ve_netns;
++	struct net_device *dev;
++
++	read_lock(&dev_base_lock);
++	for_each_netdev(net, dev) {
++		if (dev != net->loopback_dev
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++		    && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open))
++#endif
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++		    && dev != get_exec_env()->_venet_dev
++#endif
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++		    && dev->open != tun_net_open
++#endif
++							) {
++			eprintk_ctx("unsupported netdevice %s\n", dev->name);
++			*caps |= (1<<CPT_UNSUPPORTED_NETDEV);
++		}
++	}
++	read_unlock(&dev_base_lock);
++}
++
++static void check_one_process(struct cpt_context *ctx, __u32 *caps,
++		unsigned int flags, struct ve_struct *env,
++		struct task_struct *root, struct task_struct *p)
++{
++	struct mnt_namespace *ns;
++
++	if (tsk_used_math(p)) {
++		*caps |= flags & ((1<<CPT_CPU_X86_FXSR) |
++				(1<<CPT_CPU_X86_SSE) |
++				(1<<CPT_CPU_X86_SSE2) |
++				(1<<CPT_CPU_X86_MMX) |
++				(1<<CPT_CPU_X86_3DNOW) |
++				(1<<CPT_CPU_X86_3DNOW2));
++	}
++	/* This is not 100% true: a VE could migrate with a vdso using int80,
++	 * in which case we do not need the SEP/SYSCALL32 caps. It is not
++	 * easy to test for, so we do not. */
++#ifdef CONFIG_X86_64
++	if (!(task_thread_info(p)->flags & _TIF_IA32))
++		*caps |= flags & ((1<<CPT_CPU_X86_EMT64)|(1<<CPT_CPU_X86_SYSCALL));
++	else if (p->mm && p->mm->context.vdso) {
++		if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
++			*caps |= flags & (1<<CPT_CPU_X86_SEP);
++		else
++			*caps |= flags & (1<<CPT_CPU_X86_SYSCALL32);
++	}
++#elif defined(CONFIG_X86_32)
++	if (p->mm && p->mm->context.vdso)
++		*caps |= flags & (1<<CPT_CPU_X86_SEP);
++#endif
++#ifdef CONFIG_IA64
++	if (!IS_IA32_PROCESS(task_pt_regs(p)))
++		*caps |= (1<<CPT_CPU_X86_IA64);
++#endif
++	if (vps_child_level(root, p) >= 0) {
++		switch (check_process_external(p)) {
++		case PIDTYPE_PID:
++			eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm);
++			*caps |= (1<<CPT_EXTERNAL_PROCESS);
++			break;
++		case PIDTYPE_PGID:
++			eprintk_ctx("external process group %d/%d(%s) inside CT "
++					"(e.g. vzctl enter or vzctl exec).\n",
++					task_pgrp_vnr(p), p->pid, p->comm);
++			*caps |= (1<<CPT_EXTERNAL_PROCESS);
++			break;
++		case PIDTYPE_SID:
++			eprintk_ctx("external process session %d/%d(%s) inside CT "
++					"(e.g. vzctl enter or vzctl exec).\n",
++					task_session_vnr(p), p->pid, p->comm);
++			*caps |= (1<<CPT_EXTERNAL_PROCESS);
++		}
++	} else {
++		eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm);
++		*caps |= (1<<CPT_EXTERNAL_PROCESS);
++	}
++	task_lock(p);
++	ns = NULL;
++	if (p->nsproxy) {
++		ns = p->nsproxy->mnt_ns;
++		if (ns)
++			get_mnt_ns(ns);
++	}
++	task_unlock(p);
++	if (ns) {
++		if (ns != current->nsproxy->mnt_ns) {
++			eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm);
++			*caps |= (1<<CPT_NAMESPACES);
++		}
++		put_mnt_ns(ns);
++	}
++	if (p->policy != SCHED_NORMAL) {
++		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm);
++		*caps |= (1<<CPT_SCHEDULER_POLICY);
++	}
++#ifdef CONFIG_UTRACE
++	if (check_utrace(p, root, ctx)) {
++		eprintk_ctx("task %d/%d(%s) is ptraced from host system\n", p->pid, virt_pid(p), p->comm);
++		*caps |= (1<<CPT_PTRACED_FROM_VE0);
++	}
++#endif
++	if (cpt_check_unsupported(p, ctx)) {
++		*caps |= (1<<CPT_UNSUPPORTED_MISC);
++	}
++}
++
++static void check_unsupported_mounts(struct cpt_context *ctx, __u32 *caps,
++		struct ve_struct *env, struct mnt_namespace *n, char *path_buf)
++{
++	struct list_head *p;
++	char *path;
++
++	down_read(&namespace_sem);
++	list_for_each(p, &n->list) {
++		struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
++		struct path mnt_path;
++
++		mnt_path.dentry = mnt->mnt_root;
++		mnt_path.mnt = mnt;
++		path = __d_path(&mnt_path, &env->root_path,
++				path_buf, PAGE_SIZE);
++		if (IS_ERR(path))
++			continue;
++
++		if (check_one_vfsmount(mnt)) {
++			eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name);
++			*caps |= (1<<CPT_UNSUPPORTED_FSTYPE);
++		}
++	}
++	up_read(&namespace_sem);
++}
++
++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps)
++{
++	struct task_struct *p;
++	struct task_struct *root;
++	struct ve_struct *env;
++	struct ve_struct *old_env;
++	struct nsproxy *old_ns;
++	struct mnt_namespace *n;
++	int err;
++	unsigned int flags = test_cpu_caps();
++
++	if (!ctx->ve_id)
++		return -EINVAL;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (env == NULL)
++		return -ESRCH;
++
++	*caps = flags & (1<<CPT_CPU_X86_CMOV);
++
++	old_env = set_exec_env(env);
++	old_ns = current->nsproxy;
++	current->nsproxy = env->ve_ns;
++
++	check_unsupported_netdevices(ctx, caps);
++
++	read_lock(&tasklist_lock);
++	root = find_task_by_vpid(1);
++	if (!root) {
++		read_unlock(&tasklist_lock);
++		eprintk_ctx("cannot find ve init\n");
++		err = -ESRCH;
++		goto out;
++	}
++	get_task_struct(root);
++	for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p))
++		check_one_process(ctx, caps, flags, env, root, p);
++	read_unlock(&tasklist_lock);
++
++	task_lock(root);
++	n = NULL;
++	if (root->nsproxy) {
++		n = root->nsproxy->mnt_ns;
++		if (n)
++			get_mnt_ns(n);
++	}
++	task_unlock(root);
++	if (n) {
++		char *path_buf;
++
++		path_buf = (char *) __get_free_page(GFP_KERNEL);
++		if (!path_buf) {
++			put_mnt_ns(n);
++			err = -ENOMEM;
++			goto out_root;
++		}
++
++		check_unsupported_mounts(ctx, caps, env, n, path_buf);
++
++		free_page((unsigned long) path_buf);
++		put_mnt_ns(n);
++	}
++
++	err = 0;
++
++out_root:
++	put_task_struct(root);
++out:
++	current->nsproxy = old_ns;
++	set_exec_env(old_env);
++	put_ve(env);
++
++	return err;
++}
+diff --git a/kernel/cpt/cpt_dump.h b/kernel/cpt/cpt_dump.h
+new file mode 100644
+index 0000000..71f6d94
+--- /dev/null
++++ b/kernel/cpt/cpt_dump.h
+@@ -0,0 +1,16 @@
++int cpt_dump(struct cpt_context *cpt);
++int rst_undump(struct cpt_context *cpt);
++int cpt_suspend(struct cpt_context *cpt);
++int cpt_resume(struct cpt_context *cpt);
++int cpt_kill(struct cpt_context *cpt);
++int rst_clean(struct cpt_context *cpt);
++int rst_resume(struct cpt_context *cpt);
++int rst_kill(struct cpt_context *cpt);
++
++int cpt_freeze_one(pid_t pid, int freeze);
++int cpt_vps_suspend(struct cpt_context *ctx);
++int vps_rst_undump(struct cpt_context *ctx);
++
++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps);
++
++int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx);
+diff --git a/kernel/cpt/cpt_epoll.c b/kernel/cpt/cpt_epoll.c
+new file mode 100644
+index 0000000..81d2b98
+--- /dev/null
++++ b/kernel/cpt/cpt_epoll.c
+@@ -0,0 +1,113 @@
++/*
++ *
++ *  kernel/cpt/cpt_epoll.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mnt_namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/eventpoll.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx)
++{
++	int err = 0;
++	struct file *file = obj->o_obj;
++	struct eventpoll *ep;
++	struct rb_node *rbp;
++	struct cpt_epoll_image ei;
++
++	if (file->f_op != &eventpoll_fops) {
++		eprintk_ctx("bad epoll file\n");
++		return -EINVAL;
++	}
++
++	ep = file->private_data;
++
++	/* eventpoll.c does not protect against opening via /proc/N/fd, silly.
++	 * The opener will get an invalid file with uninitialized private_data.
++	 */
++	if (unlikely(ep == NULL)) {
++		eprintk_ctx("bad epoll device\n");
++		return -EINVAL;
++	}
++
++	cpt_open_object(NULL, ctx);
++
++	ei.cpt_next = CPT_NULL;
++	ei.cpt_object = CPT_OBJ_EPOLL;
++	ei.cpt_hdrlen = sizeof(ei);
++	ei.cpt_content = CPT_CONTENT_ARRAY;
++	ei.cpt_file = obj->o_pos;
++
++	ctx->write(&ei, sizeof(ei), ctx);
++
++	mutex_lock(&epmutex);
++	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
++		loff_t saved_obj;
++		cpt_object_t *tobj;
++		struct cpt_epoll_file_image efi;
++		struct epitem *epi;
++		epi = rb_entry(rbp, struct epitem, rbn);
++		tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx);
++		if (tobj == NULL) {
++			eprintk_ctx("epoll device refers to an external file\n");
++			err = -EBUSY;
++			break;
++		}
++		cpt_push_object(&saved_obj, ctx);
++		cpt_open_object(NULL, ctx);
++
++		efi.cpt_next = CPT_NULL;
++		efi.cpt_object = CPT_OBJ_EPOLL_FILE;
++		efi.cpt_hdrlen = sizeof(efi);
++		efi.cpt_content = CPT_CONTENT_VOID;
++		efi.cpt_file = tobj->o_pos;
++		efi.cpt_fd = epi->ffd.fd;
++		efi.cpt_events = epi->event.events;
++		efi.cpt_data = epi->event.data;
++		efi.cpt_revents = 0;
++		efi.cpt_ready = 0;
++		if (!list_empty(&epi->rdllink))
++			efi.cpt_ready = 1;
++
++		ctx->write(&efi, sizeof(efi), ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++	mutex_unlock(&epmutex);
++
++	cpt_close_object(ctx);
++
++	return err;
++}
++
+diff --git a/kernel/cpt/cpt_exports.c b/kernel/cpt/cpt_exports.c
+new file mode 100644
+index 0000000..f492331
+--- /dev/null
++++ b/kernel/cpt/cpt_exports.c
+@@ -0,0 +1,13 @@
++#include <linux/module.h>
++#include <asm/signal.h>
++
++#include "cpt_obj.h"
++
++EXPORT_SYMBOL(alloc_cpt_object);
++EXPORT_SYMBOL(intern_cpt_object);
++EXPORT_SYMBOL(insert_cpt_object);
++EXPORT_SYMBOL(__cpt_object_add);
++EXPORT_SYMBOL(cpt_object_add);
++EXPORT_SYMBOL(cpt_object_get);
++EXPORT_SYMBOL(lookup_cpt_object);
++EXPORT_SYMBOL(lookup_cpt_obj_bypos);
+diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c
+new file mode 100644
+index 0000000..1a2dd15
+--- /dev/null
++++ b/kernel/cpt/cpt_files.c
+@@ -0,0 +1,1634 @@
++/*
++ *
++ *  kernel/cpt/cpt_files.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mnt_namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <linux/pagemap.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/ve_proto.h>
++#include <bc/kmem.h>
++#include <linux/cpt_image.h>
++#include <linux/if_tun.h>
++#include <linux/fdtable.h>
++#include <linux/shm.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt)
++{
++	char *path;
++	struct path p;
++	unsigned long pg = __get_free_page(GFP_KERNEL);
++
++	if (!pg)
++		return;
++
++	p.dentry = d;
++	p.mnt = mnt;
++	path = d_path(&p, (char *)pg, PAGE_SIZE);
++
++	if (!IS_ERR(path))
++		eprintk("<%s>", path);
++	free_page(pg);
++}
++
++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
++			 cpt_context_t *ctx)
++{
++	if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) {
++		struct nameidata nd;
++		if (path_lookup(path, 0, &nd)) {
++			eprintk_ctx("d_path cannot be looked up %s\n", path);
++			return -EINVAL;
++		}
++		if (nd.path.dentry != d || nd.path.mnt != mnt) {
++			eprintk_ctx("d_path is invisible %s\n", path);
++			path_put(&nd.path);
++			return -EINVAL;
++		}
++		path_put(&nd.path);
++	}
++	return 0;
++}
++
++static int
++cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx)
++{
++	int result = 0;
++
++#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE)
++	char *path;
++	unsigned long pg;
++	struct dentry * renamed_dentry;
++	struct path p;
++
++	if (de->d_sb->s_magic != FSMAGIC_VEFS)
++		return 0;
++	if (de->d_inode->i_nlink != 0 ||
++	    atomic_read(&de->d_inode->i_writecount) > 0) 
++		return 0;
++
++	renamed_dentry = vefs_replaced_dentry(de);
++	if (renamed_dentry == NULL)
++		return 0;
++
++	pg = __get_free_page(GFP_KERNEL);
++	if (!pg)
++		return 0;
++
++	p.dentry = de;
++	p.mnt = mnt;
++	path = d_path(&p, (char *)pg, PAGE_SIZE);
++	if (!IS_ERR(path)) {
++		int len;
++		struct nameidata nd;
++
++		len = pg + PAGE_SIZE - 1 - (unsigned long)path;
++		if (len >= sizeof("(deleted) ") - 1 &&
++		    !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
++			len -= sizeof("(deleted) ") - 1;
++			path += sizeof("(deleted) ") - 1;
++		}
++
++		if (path_lookup(path, 0, &nd) == 0) {
++			if (mnt == nd.path.mnt &&
++			    vefs_is_renamed_dentry(nd.path.dentry, renamed_dentry))
++				result = 1;
++			path_put(&nd.path);
++		}
++	}
++	free_page(pg);
++#endif
++	return result;
++}
++
++static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt,
++			   int replaced, cpt_context_t *ctx)
++{
++	int len;
++	char *path;
++	struct path p;
++	char *pg = cpt_get_buf(ctx);
++	loff_t saved;
++
++	p.dentry = d;
++	p.mnt = mnt;
++	path = d_path(&p, pg, PAGE_SIZE);
++	len = PTR_ERR(path);
++
++	if (IS_ERR(path)) {
++		struct cpt_object_hdr o;
++		char tmp[1];
++
++		/* VZ changes d_path() to return EINVAL when the path
++		 * is not supposed to be visible inside the VE.
++		 * This changes the behaviour of d_path() compared
++		 * to the mainstream kernel, e.g. d_path() fails
++		 * on any kind of shared memory. Maybe there are
++		 * other cases, but this is the only one I am aware of.
++		 * So, we just ignore the error on shmem mounts and proceed.
++		 * Otherwise, checkpointing is prohibited because
++		 * of the reference to an invisible file.
++		 */
++		if (len != -EINVAL ||
++		    mnt != get_exec_env()->shmem_mnt)
++			eprintk_ctx("d_path err=%d\n", len);
++		else
++			len = 0;
++
++		cpt_push_object(&saved, ctx);
++		cpt_open_object(NULL, ctx);
++		o.cpt_next = CPT_NULL;
++		o.cpt_object = CPT_OBJ_NAME;
++		o.cpt_hdrlen = sizeof(o);
++		o.cpt_content = CPT_CONTENT_NAME;
++		tmp[0] = 0;
++
++		ctx->write(&o, sizeof(o), ctx);
++		ctx->write(tmp, 1, ctx);
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved, ctx);
++
++		__cpt_release_buf(ctx);
++		return len;
++	} else {
++		struct cpt_object_hdr o;
++
++		len = pg + PAGE_SIZE - 1 - path;
++		if (replaced &&
++		    len >= sizeof("(deleted) ") - 1 &&
++		    !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
++			len -= sizeof("(deleted) ") - 1;
++			path += sizeof("(deleted) ") - 1;
++		}
++		o.cpt_next = CPT_NULL;
++		o.cpt_object = CPT_OBJ_NAME;
++		o.cpt_hdrlen = sizeof(o);
++		o.cpt_content = CPT_CONTENT_NAME;
++		path[len] = 0;
++
++		if (cpt_verify_overmount(path, d, mnt, ctx)) {
++			__cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++
++		cpt_push_object(&saved, ctx);
++		cpt_open_object(NULL, ctx);
++		ctx->write(&o, sizeof(o), ctx);
++		ctx->write(path, len+1, ctx);
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved, ctx);
++		__cpt_release_buf(ctx);
++	}
++	return 0;
++}
++
++int cpt_dump_string(const char *s, struct cpt_context *ctx)
++{
++	int len;
++	struct cpt_object_hdr o;
++
++	cpt_open_object(NULL, ctx);
++	len = strlen(s);
++	o.cpt_next = CPT_NULL;
++	o.cpt_object = CPT_OBJ_NAME;
++	o.cpt_hdrlen = sizeof(o);
++	o.cpt_content = CPT_CONTENT_NAME;
++
++	ctx->write(&o, sizeof(o), ctx);
++	ctx->write(s, len+1, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	return 0;
++}
++
++static int
++cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx)
++{
++	return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx);
++}
++
++int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_inode_image *v = cpt_get_buf(ctx);
++	struct kstat sbuf;
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_INODE;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) {
++		cpt_release_buf(ctx);
++		return err;
++	}
++
++	v->cpt_dev	= d->d_inode->i_sb->s_dev;
++	v->cpt_ino	= d->d_inode->i_ino;
++	v->cpt_mode	= sbuf.mode;
++	v->cpt_nlink	= sbuf.nlink;
++	v->cpt_uid	= sbuf.uid;
++	v->cpt_gid	= sbuf.gid;
++	v->cpt_rdev	= d->d_inode->i_rdev;
++	v->cpt_size	= sbuf.size;
++	v->cpt_atime	= cpt_timespec_export(&sbuf.atime);
++	v->cpt_mtime	= cpt_timespec_export(&sbuf.mtime);
++	v->cpt_ctime	= cpt_timespec_export(&sbuf.ctime);
++	v->cpt_blksize	= sbuf.blksize;
++	v->cpt_blocks	= sbuf.blocks;
++	v->cpt_sb	= d->d_inode->i_sb->s_magic;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++int cpt_collect_files(cpt_context_t * ctx)
++{
++	int err;
++	cpt_object_t *obj;
++	int index = 0;
++
++	/* Collect process fd sets */
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL)
++			return -ENOMEM;
++	}
++
++	/* Collect files from fd sets */
++	for_each_object(obj, CPT_OBJ_FILES) {
++		int fd;
++		struct files_struct *f = obj->o_obj;
++
++		cpt_obj_setindex(obj, index++, ctx);
++
++		if (obj->o_count != atomic_read(&f->count)) {
++			eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count));
++			return -EBUSY;
++		}
++
++		for (fd = 0; fd < f->fdt->max_fds; fd++) {
++			struct file *file = fcheck_files(f, fd);
++			if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL)
++				return -ENOMEM;
++		}
++	}
++
++	/* Collect files queued by AF_UNIX sockets. */
++	if ((err = cpt_collect_passedfds(ctx)) < 0)
++		return err;
++
++	/* OK. At this point we should count all the references. */
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++		struct file *parent;
++		cpt_object_t *ino_obj;
++
++		if (obj->o_count != atomic_read(&file->f_count)) {
++			eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count));
++			cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
++			return -EBUSY;
++		}
++
++		switch (file->f_dentry->d_inode->i_sb->s_magic) {
++		case FSMAGIC_FUTEX:
++		case FSMAGIC_MQUEUE:
++		case FSMAGIC_BDEV:
++#ifndef CONFIG_INOTIFY_USER
++		case FSMAGIC_INOTIFY:
++#endif
++			eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic);
++			return -EBUSY;
++		}
++
++		/* Collect inode. It is necessary mostly to resolve deleted
++		 * hard links. */
++		ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++		if (ino_obj == NULL)
++			return -ENOMEM;
++
++		parent = ino_obj->o_parent;
++		if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry)))
++			ino_obj->o_parent = file;
++
++		if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
++			int maj = imajor(file->f_dentry->d_inode);
++			if (maj == PTY_MASTER_MAJOR ||
++			    (maj >= UNIX98_PTY_MASTER_MAJOR &&
++			     maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
++			    maj == PTY_SLAVE_MAJOR ||
++			    maj == UNIX98_PTY_SLAVE_MAJOR ||
++			    maj == TTYAUX_MAJOR) {
++				err = cpt_collect_tty(file, ctx);
++				if (err)
++					return err;
++			}
++		}
++
++		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
++			err = cpt_collect_socket(file, ctx);
++			if (err)
++				return err;
++		}
++	}
++
++	err = cpt_index_sockets(ctx);
++
++	return err;
++}
++
++/* /dev/ptmx is special: all the files share one inode, but the real tty
++ * backend is attached via file->private_data.
++ */
++
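++/* MKDEV(TTYAUX_MAJOR, 2) is /dev/ptmx (char 5:2). */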
++static inline int is_cloning_inode(struct inode *ino)
++{
++	return S_ISCHR(ino->i_mode) &&
++		ino->i_rdev == MKDEV(TTYAUX_MAJOR,2);
++}
++
++static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx)
++{
++	pid_t pid;
++	struct cpt_flock_image *v = cpt_get_buf(ctx);
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_FLOCK;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	v->cpt_owner = owner;
++
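++	/* Store the pid as seen inside the container.  Locks held by tasks
++	 * outside the VE are tolerated only for flock-style locks (recorded
++	 * with pid 0); a foreign POSIX lock aborts the dump.
++	 */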
++	pid = fl->fl_pid;
++	if (pid) {
++		pid = pid_to_vpid(fl->fl_pid);
++		if (pid == -1) {
++			if (!(fl->fl_flags&FL_FLOCK)) {
++				eprintk_ctx("posix lock from another container?\n");
++				cpt_release_buf(ctx);
++				return -EBUSY;
++			}
++			pid = 0;
++		}
++	}
++
++	v->cpt_pid = pid;
++	v->cpt_start = fl->fl_start;
++	v->cpt_end = fl->fl_end;
++	v->cpt_flags = fl->fl_flags;
++	v->cpt_type = fl->fl_type;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++
++int cpt_dump_flock(struct file *file, struct cpt_context *ctx)
++{
++	int err = 0;
++	struct file_lock *fl;
++
++	lock_kernel();
++	for (fl = file->f_dentry->d_inode->i_flock;
++	     fl; fl = fl->fl_next) {
++		if (file != fl->fl_file)
++			continue;
++		if (fl->fl_flags & FL_LEASE) {
++			eprintk_ctx("lease lock is not supported\n");
++			err = -EINVAL;
++			break;
++		}
++		if (fl->fl_flags & FL_POSIX) {
++			cpt_object_t *obj;
++			obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx);
++			if (obj) {
++				dump_one_flock(fl, obj->o_index, ctx);
++				continue;
++			} else {
++				eprintk_ctx("unknown lock owner %p\n", fl->fl_owner);
++				err = -EINVAL;
++			}
++		}
++		if (fl->fl_flags & FL_FLOCK) {
++			dump_one_flock(fl, -1, ctx);
++			continue;
++		}
++	}
++	unlock_kernel();
++	return err;
++}
++
++static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx)
++{
++	int err = 0;
++	cpt_object_t *iobj;
++	struct cpt_file_image *v = cpt_get_buf(ctx);
++	struct kstat sbuf;
++	int replaced = 0;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_FILE;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_flags = file->f_flags;
++	v->cpt_mode = file->f_mode;
++	v->cpt_pos = file->f_pos;
++	v->cpt_uid = file->f_uid;
++	v->cpt_gid = file->f_gid;
++
++	vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
++
++	v->cpt_i_mode = sbuf.mode;
++	v->cpt_lflags = 0;
++	if (IS_ROOT(file->f_dentry))
++		v->cpt_lflags |= CPT_DENTRY_ROOT;
++	else if (d_unhashed(file->f_dentry)) {
++		if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) {
++			v->cpt_lflags |= CPT_DENTRY_REPLACED;
++			replaced = 1;
++		} else {
++			v->cpt_lflags |= CPT_DENTRY_DELETED;
++		}
++	}
++	if (is_cloning_inode(file->f_dentry->d_inode))
++		v->cpt_lflags |= CPT_DENTRY_CLONING;
++	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC)
++		v->cpt_lflags |= CPT_DENTRY_PROC;
++	v->cpt_inode = CPT_NULL;
++	if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) {
++		iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++		if (iobj)
++			v->cpt_inode = iobj->o_pos;
++	}
++	v->cpt_priv = CPT_NULL;
++	v->cpt_fown_fd = -1;
++	if (S_ISCHR(v->cpt_i_mode)) {
++		iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx);
++		if (iobj) {
++			v->cpt_priv = iobj->o_pos;
++			if (file->f_flags&FASYNC)
++				v->cpt_fown_fd = cpt_tty_fasync(file, ctx);
++		}
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++		if (file->f_op && file->f_op->open == tun_chr_open)
++			v->cpt_lflags |= CPT_DENTRY_TUNTAP;
++#endif
++	}
++	if (S_ISSOCK(v->cpt_i_mode)) {
++		if (obj->o_index < 0) {
++			eprintk_ctx("BUG: no socket index\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		v->cpt_priv = obj->o_index;
++		if (file->f_flags&FASYNC)
++			v->cpt_fown_fd = cpt_socket_fasync(file, ctx);
++	}
++	if (file->f_op == &eventpoll_fops) {
++		v->cpt_priv = file->f_dentry->d_inode->i_ino;
++		v->cpt_lflags |= CPT_DENTRY_EPOLL;
++	}
++	if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
++		v->cpt_priv = file->f_dentry->d_inode->i_ino;
++		v->cpt_lflags |= CPT_DENTRY_INOTIFY;
++	}
++
++	v->cpt_fown_pid = (file->f_owner.pid == NULL ?
++			CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid));
++	v->cpt_fown_uid = file->f_owner.uid;
++	v->cpt_fown_euid = file->f_owner.euid;
++	v->cpt_fown_signo = file->f_owner.signum;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	if (!S_ISSOCK(v->cpt_i_mode)) {
++		err = cpt_dump_filename(file, replaced, ctx);
++		if (err)
++			return err;
++		if ((file->f_mode & FMODE_WRITE) &&
++				file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS)
++			vefs_track_notify(file->f_dentry, 1);
++	}
++
++	if (file->f_dentry->d_inode->i_flock)
++		err = cpt_dump_flock(file, ctx);
++
++	cpt_close_object(ctx);
++
++	return err;
++}
++
++/* About this weird function... The crappy code dealing with SYSV shared
++ * memory defines a TMPFS inode and file whose f_op implements only mmap.
++ * So... Maybe this is wrong and leaks something. Clearly, access to
++ * SYSV shmem via mmap is quite unusual and impossible from user space.
++ */
++static int dump_content_shm(struct file *file, struct cpt_context *ctx)
++{
++	struct cpt_obj_bits *v;
++	loff_t saved_pos;
++	unsigned long addr;
++
++	addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size,
++			     PROT_READ, MAP_SHARED, 0);
++	if (IS_ERR((void*)addr))
++		return PTR_ERR((void*)addr);
++
++	cpt_push_object(&saved_pos, ctx);
++	cpt_open_object(NULL, ctx);
++	v = cpt_get_buf(ctx);
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_BITS;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_DATA;
++	v->cpt_size = file->f_dentry->d_inode->i_size;
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx);
++	ctx->align(ctx);
++	do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size);
++
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_pos, ctx);
++	return 0;
++}
++
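++/* Return 1 if the buffer contains nothing but zero bytes.  Used below to
++ * detect holes, so that all-zero pages need not be stored in the image.
++ */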
++static int data_is_zero(char *addr, int len)
++{
++	int i;
++	unsigned long zerolong = 0;
++
++	for (i=0; i<len/sizeof(unsigned long); i++) {
++		if (((unsigned long*)(addr))[i] != 0)
++			return 0;
++	}
++	i = len % sizeof(unsigned long);
++	if (!i)
++		return 1;
++	return memcmp(addr + len - i, &zerolong, i) == 0;
++}
++
++
++static int dump_content_regular(struct file *file, struct cpt_context *ctx)
++{
++	loff_t saved_pos;
++	loff_t pos = 0;
++	loff_t obj_opened = CPT_NULL;
++	struct cpt_page_block pgb;
++	ssize_t (*do_read)(struct file *, char __user *, size_t, loff_t *);
++
++	if (file->f_op == NULL)
++		return -EINVAL;
++
++	do_read = file->f_op->read;
++	if (file->f_op == &shm_file_operations) {
++		struct shm_file_data *sfd = file->private_data;
++
++		cpt_dump_content_sysvshm(sfd->file, ctx);
++
++		return 0;
++	}
++	if (file->f_op == &shmem_file_operations) {
++		do_read = file->f_dentry->d_inode->i_fop->read;
++		cpt_dump_content_sysvshm(file, ctx);
++		if (!do_read) {
++			wprintk_ctx("TMPFS is not configured?\n");
++			return dump_content_shm(file, ctx);
++		}
++	}
++
++	if (!(file->f_mode & FMODE_READ) ||
++	    (file->f_flags & O_DIRECT)) {
++		struct file *rfile;
++		rfile = dentry_open(dget(file->f_dentry),
++				    mntget(file->f_vfsmnt), O_RDONLY);
++		if (IS_ERR(rfile)) {
++			/* rfile is an ERR_PTR here, so report via the
++			 * original file. */
++			cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
++			eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(rfile));
++			return PTR_ERR(rfile);
++		}
++		file = rfile;
++	} else {
++		atomic_inc(&file->f_count);
++	}
++
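++	/* Read the file page by page and store it as a sequence of
++	 * CPT_OBJ_PAGES blocks, each covering one contiguous non-zero
++	 * extent [cpt_start, cpt_end); all-zero pages are skipped and
++	 * cpt_end is patched in place via ctx->pwrite() when an extent
++	 * is closed.  Roughly, a file with a hole comes out as:
++	 *
++	 *   CPT_OBJ_PAGES { cpt_start=0x0000, cpt_end=0x3000 } <data>
++	 *   CPT_OBJ_PAGES { cpt_start=0x5000, cpt_end=0x6000 } <data>
++	 */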
++	for (;;) {
++		mm_segment_t oldfs;
++		int err;
++
++		(void)cpt_get_buf(ctx);
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos);
++		set_fs(oldfs);
++		if (err < 0) {
++			eprintk_ctx("dump_content_regular: do_read: %d", err);
++			fput(file);
++			__cpt_release_buf(ctx);
++			return err;
++		}
++		if (err == 0) {
++			__cpt_release_buf(ctx);
++			break;
++		}
++		if (data_is_zero(ctx->tmpbuf, err)) {
++			if (obj_opened != CPT_NULL) {
++				ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
++				ctx->align(ctx);
++				cpt_close_object(ctx);
++				cpt_pop_object(&saved_pos, ctx);
++				obj_opened = CPT_NULL;
++			}
++		} else {
++			if (obj_opened == CPT_NULL) {
++				cpt_push_object(&saved_pos, ctx);
++				cpt_open_object(NULL, ctx);
++				obj_opened = ctx->file->f_pos;
++				pgb.cpt_next = CPT_NULL;
++				pgb.cpt_object = CPT_OBJ_PAGES;
++				pgb.cpt_hdrlen = sizeof(pgb);
++				pgb.cpt_content = CPT_CONTENT_DATA;
++				pgb.cpt_start = pos - err;
++				pgb.cpt_end = pgb.cpt_start;
++				ctx->write(&pgb, sizeof(pgb), ctx);
++			}
++			ctx->write(ctx->tmpbuf, err, ctx);
++			pgb.cpt_end += err;
++		}
++		__cpt_release_buf(ctx);
++	}
++
++	fput(file);
++
++	if (obj_opened != CPT_NULL) {
++		ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_pos, ctx);
++		obj_opened = CPT_NULL;
++	}
++	return 0;
++}
++
++
++static int dump_content_chrdev(struct file *file, struct cpt_context *ctx)
++{
++	struct inode *ino = file->f_dentry->d_inode;
++	int maj;
++
++	maj = imajor(ino);
++	if (maj == MEM_MAJOR) {
++		/* Well, OK. */
++		return 0;
++	}
++	if (maj == PTY_MASTER_MAJOR ||
++	    (maj >= UNIX98_PTY_MASTER_MAJOR &&
++	     maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
++	    maj == PTY_SLAVE_MAJOR ||
++	    maj == UNIX98_PTY_SLAVE_MAJOR ||
++	    maj == TTYAUX_MAJOR) {
++		return cpt_dump_content_tty(file, ctx);
++	}
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++	if (file->f_op && file->f_op->open == tun_chr_open)
++		return 0;
++#endif
++	eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino));
++	return -EINVAL;
++}
++
++static int dump_content_blkdev(struct file *file, struct cpt_context *ctx)
++{
++	struct inode *ino = file->f_dentry->d_inode;
++
++	/* We are not going to transfer them. */
++	eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino));
++	return -EINVAL;
++}
++
++static int dump_content_fifo(struct file *file, struct cpt_context *ctx)
++{
++	struct inode *ino = file->f_dentry->d_inode;
++	cpt_object_t *obj;
++	loff_t saved_pos;
++	int readers;
++	int writers;
++	int anon = 0;
++
++	mutex_lock(&ino->i_mutex);
++	readers = ino->i_pipe->readers;
++	writers = ino->i_pipe->writers;
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file1 = obj->o_obj;
++		if (file1->f_dentry->d_inode == ino) {
++			if (file1->f_mode & FMODE_READ)
++				readers--;
++			if (file1->f_mode & FMODE_WRITE)
++				writers--;
++		}
++	}
++	mutex_unlock(&ino->i_mutex);
++	if (readers || writers) {
++		struct dentry *dr = file->f_dentry->d_sb->s_root;
++		if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0)
++			anon = 1;
++
++		if (anon) {
++			eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers);
++			return -EBUSY;
++		}
++		/* If the fifo has external readers/writers, we are in
++		 * trouble. If the buffer is not empty, we must move its
++		 * content. But if the fifo is owned by a service, we cannot
++		 * do this. See?
++		 *
++		 * For now we assume that if the fifo is opened by another
++		 * process, we do not own it and, hence, migrate without
++		 * data.
++		 */
++		return 0;
++	}
++
++	/* OK, we must save fifo state. No semaphores required. */
++
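++	/* Two passes over the pipe ring: first sum up the buffered bytes
++	 * (the ring has PIPE_BUFFERS slots, the index wraps with
++	 * "(buf + 1) & (PIPE_BUFFERS - 1)"), then write the data out as
++	 * one CPT_OBJ_BITS block of exactly that size.
++	 */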
++	if (ino->i_pipe->nrbufs) {
++		struct cpt_obj_bits *v = cpt_get_buf(ctx);
++		struct pipe_inode_info *info;
++		int count, buf, nrbufs;
++
++		mutex_lock(&ino->i_mutex);
++		info =  ino->i_pipe;
++		count = 0;
++		buf = info->curbuf;
++		nrbufs = info->nrbufs;
++		while (--nrbufs >= 0) {
++			if (!info->bufs[buf].ops->can_merge) {
++				mutex_unlock(&ino->i_mutex);
++				eprintk_ctx("unknown format of pipe buffer\n");
++				return -EINVAL;
++			}
++			count += info->bufs[buf].len;
++			buf = (buf+1) & (PIPE_BUFFERS-1);
++		}
++
++		if (!count) {
++			mutex_unlock(&ino->i_mutex);
++			return 0;
++		}
++
++		cpt_push_object(&saved_pos, ctx);
++		cpt_open_object(NULL, ctx);
++		v->cpt_next = CPT_NULL;
++		v->cpt_object = CPT_OBJ_BITS;
++		v->cpt_hdrlen = sizeof(*v);
++		v->cpt_content = CPT_CONTENT_DATA;
++		v->cpt_size = count;
++		ctx->write(v, sizeof(*v), ctx);
++		cpt_release_buf(ctx);
++
++		count = 0;
++		buf = info->curbuf;
++		nrbufs = info->nrbufs;
++		while (--nrbufs >= 0) {
++			struct pipe_buffer *b = info->bufs + buf;
++			/* need to ->pin first? */
++			void * addr = b->ops->map(info, b, 0);
++			ctx->write(addr + b->offset, b->len, ctx);
++			b->ops->unmap(info, b, addr);
++			buf = (buf+1) & (PIPE_BUFFERS-1);
++		}
++
++		mutex_unlock(&ino->i_mutex);
++
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_pos, ctx);
++	}
++
++	return 0;
++}
++
++static int dump_content_socket(struct file *file, struct cpt_context *ctx)
++{
++	return 0;
++}
++
++struct cpt_dirent {
++	unsigned long	ino;
++	char		*name;
++	int		namelen;
++	int		found;
++};
++
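++/* vfs_readdir() callback: capture the name of the directory entry whose
++ * inode number matches the one we are looking for, then stop the scan.
++ */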
++static int cpt_filldir(void * __buf, const char * name, int namelen,
++		loff_t offset, u64 ino, unsigned int d_type)
++{
++	struct cpt_dirent * dirent = __buf;
++
++	if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) {
++		memcpy(dirent->name, name, namelen);
++		dirent->name[namelen] = '\0';
++		dirent->namelen = namelen;
++		dirent->found = 1;
++		return 1;
++	}
++	return 0;
++}
++
++static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt,
++		struct inode *ino, struct cpt_context *ctx)
++{
++	int err = -EBUSY;
++	struct file *f = NULL;
++	struct cpt_dirent entry;
++	struct dentry *de, *found = NULL;
++
++	dprintk_ctx("deleted reference to existing inode, try to find file\n");
++	/* 1. Try to find not deleted dentry in ino->i_dentry list */
++	spin_lock(&dcache_lock);
++	list_for_each_entry(de, &ino->i_dentry, d_alias) {
++		if (!IS_ROOT(de) && d_unhashed(de))
++			continue;
++		found = de;
++		dget_locked(found);
++		break;
++	}
++	spin_unlock(&dcache_lock);
++	if (found) {
++		err = cpt_dump_dentry(found, mnt, 0, ctx);
++		dput(found);
++		if (!err) {
++			dprintk_ctx("dentry found in aliases\n");
++			return 0;
++		}
++	}
++
++	/* 2. Try to find file in current dir */
++	de = dget_parent(d);
++	if (!de)
++		return -EINVAL;
++
++	mntget(mnt);
++	f = dentry_open(de, mnt, O_RDONLY);
++	if (IS_ERR(f))
++		return PTR_ERR(f);
++
++	entry.ino = ino->i_ino;
++	entry.name = cpt_get_buf(ctx);
++	entry.found = 0;
++	err = vfs_readdir(f, cpt_filldir, &entry);
++	if (err || !entry.found) {
++		err = err ? err : -ENOENT;
++		goto err_readdir;
++	}
++
++	found = lookup_one_len(entry.name, de, entry.namelen);
++	if (IS_ERR(found)) {
++		err = PTR_ERR(found);
++		goto err_readdir;
++	}
++
++	err = -ENOENT;
++	if (found->d_inode != ino)
++		goto err_lookup;
++
++	dprintk_ctx("dentry found in dir\n");
++	__cpt_release_buf(ctx);
++	err = cpt_dump_dentry(found, mnt, 0, ctx);
++
++err_lookup:
++	dput(found);
++err_readdir:
++	fput(f);
++	__cpt_release_buf(ctx);
++	return err;
++}
++
++static int dump_one_inode(struct file *file, struct dentry *d,
++			  struct vfsmount *mnt, struct cpt_context *ctx)
++{
++	int err = 0;
++	struct inode *ino = d->d_inode;
++	cpt_object_t *iobj;
++	int dump_it = 0;
++
++	iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx);
++	if (!iobj)
++		return -EINVAL;
++
++	if (iobj->o_pos >= 0)
++		return 0;
++
++	if ((!IS_ROOT(d) && d_unhashed(d)) &&
++	    !cpt_replaced(d, mnt, ctx))
++		dump_it = 1;
++	if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) {
++		if (file->f_op == &eventpoll_fops)
++			return 0;
++		dump_it = 1;
++	}
++
++	if (!dump_it)
++		return 0;
++
++	cpt_open_object(iobj, ctx);
++	cpt_dump_inode(d, mnt, ctx);
++
++	if (!IS_ROOT(d) && d_unhashed(d)) {
++		struct file *parent;
++		parent = iobj->o_parent;
++		if (!parent ||
++		    (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) {
++			/* Inode is not deleted, but it does not
++			 * have references from inside checkpointed
++			 * process group. */
++			if (ino->i_nlink != 0) {
++				err = find_linked_dentry(d, mnt, ino, ctx);
++				if (err) {
++					eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err);
++					return -EBUSY;
++				}
++				if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
++					dump_it = 0;
++			}
++		} else {
++			/* Refer to _another_ file name. */
++			err = cpt_dump_filename(parent, 0, ctx);
++			if (err)
++				return err;
++			if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
++				dump_it = 0;
++		}
++	}
++	if (dump_it) {
++		if (S_ISREG(ino->i_mode)) {
++			if ((err = dump_content_regular(file, ctx)) != 0) {
++				eprintk_ctx("dump_content_regular ");
++				cpt_printk_dentry(d, mnt);
++			}
++		} else if (S_ISDIR(ino->i_mode)) {
++			/* We cannot do anything. The directory should be
++			 * empty, so it is not a big deal.
++			 */
++		} else if (S_ISCHR(ino->i_mode)) {
++			err = dump_content_chrdev(file, ctx);
++		} else if (S_ISBLK(ino->i_mode)) {
++			err = dump_content_blkdev(file, ctx);
++		} else if (S_ISFIFO(ino->i_mode)) {
++			err = dump_content_fifo(file, ctx);
++		} else if (S_ISSOCK(ino->i_mode)) {
++			err = dump_content_socket(file, ctx);
++		} else {
++			eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic);
++			err = -EINVAL;
++		}
++	}
++	cpt_close_object(ctx);
++
++	return err;
++}
++
++int cpt_dump_files(struct cpt_context *ctx)
++{
++	int epoll_nr, inotify_nr;
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_TTY);
++	for_each_object(obj, CPT_OBJ_TTY) {
++		int err;
++
++		if ((err = cpt_dump_tty(obj, ctx)) != 0)
++			return err;
++	}
++	cpt_close_section(ctx);
++
++	cpt_open_section(ctx, CPT_SECT_INODE);
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++		int err;
++
++		if ((err = dump_one_inode(file, file->f_dentry,
++					  file->f_vfsmnt, ctx)) != 0)
++			return err;
++	}
++	for_each_object(obj, CPT_OBJ_FS) {
++		struct fs_struct *fs = obj->o_obj;
++		int err;
++
++		if (fs->root.dentry &&
++		    (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0)
++			return err;
++		if (fs->pwd.dentry &&
++		    (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0)
++			return err;
++		if (fs->altroot.dentry &&
++		    (err = dump_one_inode(NULL, fs->altroot.dentry, fs->altroot.mnt, ctx)) != 0)
++			return err;
++	}
++	cpt_close_section(ctx);
++
++	epoll_nr = 0;
++	inotify_nr = 0;
++	cpt_open_section(ctx, CPT_SECT_FILES);
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++		int err;
++
++		if ((err = dump_one_file(obj, file, ctx)) != 0)
++			return err;
++		if (file->f_op == &eventpoll_fops)
++			epoll_nr++;
++		if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY)
++			inotify_nr++;
++	}
++	cpt_close_section(ctx);
++
++	if (epoll_nr) {
++		cpt_open_section(ctx, CPT_SECT_EPOLL);
++		for_each_object(obj, CPT_OBJ_FILE) {
++			struct file *file = obj->o_obj;
++			if (file->f_op == &eventpoll_fops) {
++				int err;
++				if ((err = cpt_dump_epolldev(obj, ctx)) != 0)
++					return err;
++			}
++		}
++		cpt_close_section(ctx);
++	}
++
++	if (inotify_nr) {
++		cpt_open_section(ctx, CPT_SECT_INOTIFY);
++		for_each_object(obj, CPT_OBJ_FILE) {
++			struct file *file = obj->o_obj;
++			if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
++				int err = -EINVAL;
++#ifdef CONFIG_INOTIFY_USER
++				err = cpt_dump_inotify(obj, ctx);
++#endif
++				if (err)
++					return err;
++			}
++		}
++		cpt_close_section(ctx);
++	}
++
++	cpt_open_section(ctx, CPT_SECT_SOCKET);
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		int err;
++
++		if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0)
++			return err;
++	}
++	cpt_close_section(ctx);
++
++	return 0;
++}
++
++static int dump_filedesc(int fd, struct file *file,
++			 struct files_struct *f, struct cpt_context *ctx)
++{
++	struct cpt_fd_image *v = cpt_get_buf(ctx);
++	cpt_object_t *obj;
++
++	cpt_open_object(NULL, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_FILEDESC;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	v->cpt_fd = fd;
++	obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx);
++	BUG_ON(!obj);
++	v->cpt_file = obj->o_pos;
++	v->cpt_flags = 0;
++	if (FD_ISSET(fd, f->fdt->close_on_exec))
++		v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	cpt_close_object(ctx);
++
++	return 0;
++}
++
++static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct files_struct *f = obj->o_obj;
++	struct cpt_files_struct_image *v = cpt_get_buf(ctx);
++	int fd;
++	loff_t saved_obj;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_FILES;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_index = obj->o_index;
++	v->cpt_max_fds = f->fdt->max_fds;
++	v->cpt_next_fd = f->next_fd;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	for (fd = 0; fd < f->fdt->max_fds; fd++) {
++		struct file *file = fcheck_files(f, fd);
++		if (file)
++			dump_filedesc(fd, file, f, ctx);
++	}
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++
++	return 0;
++}
++
++int cpt_dump_files_struct(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_FILES_STRUCT);
++
++	for_each_object(obj, CPT_OBJ_FILES) {
++		int err;
++
++		if ((err = dump_one_file_struct(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++	return 0;
++}
++
++int cpt_collect_fs(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->fs) {
++			if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL)
++				return -ENOMEM;
++			if (tsk->fs->pwd.dentry &&
++			    cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL)
++				return -ENOMEM;
++			if (tsk->fs->root.dentry &&
++			    cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL)
++				return -ENOMEM;
++			if (tsk->fs->altroot.dentry &&
++			    cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot.dentry->d_inode, ctx) == NULL)
++				return -ENOMEM;
++		}
++	}
++	return 0;
++}
++
++int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
++{
++	struct file file;
++
++	memset(&file, 0, sizeof(file));
++
++	file.f_dentry = d;
++	file.f_vfsmnt = mnt;
++	file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK;
++	return dump_one_file(NULL, &file, ctx);
++}
++
++static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct fs_struct *fs = obj->o_obj;
++	struct cpt_fs_struct_image *v = cpt_get_buf(ctx);
++	loff_t saved_obj;
++	int err;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_FS;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_umask = fs->umask;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx);
++	if (!err)
++		err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx);
++	if (!err && fs->altroot.dentry)
++		err = cpt_dump_dir(fs->altroot.dentry, fs->altroot.mnt, ctx);
++
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++
++	return err;
++}
++
++int cpt_dump_fs_struct(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_FS);
++
++	for_each_object(obj, CPT_OBJ_FS) {
++		int err;
++
++		if ((err = dump_one_fs(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++	return 0;
++}
++
++static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	int err = 0;
++	struct mnt_namespace *n = obj->o_obj;
++	struct list_head *p;
++	char *path_buf, *path;
++
++	path_buf = (char *) __get_free_page(GFP_KERNEL);
++	if (!path_buf)
++		return -ENOMEM;
++
++	down_read(&namespace_sem);
++	list_for_each(p, &n->list) {
++		struct path pt;
++		struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
++
++		pt.dentry = mnt->mnt_root;
++		pt.mnt = mnt;
++		path = d_path(&pt, path_buf, PAGE_SIZE);
++		if (IS_ERR(path))
++			continue;
++
++		if (check_one_vfsmount(mnt)) {
++			eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name);
++			err = -EINVAL;
++			break;
++		}
++	}
++	up_read(&namespace_sem);
++
++	free_page((unsigned long) path_buf);
++
++	return err;
++}
++
++int cpt_collect_namespace(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->nsproxy && tsk->nsproxy->mnt_ns &&
++				cpt_object_add(CPT_OBJ_NAMESPACE,
++					tsk->nsproxy->mnt_ns, ctx) == NULL)
++			return -ENOMEM;
++	}
++
++	for_each_object(obj, CPT_OBJ_NAMESPACE) {
++		int err;
++		if ((err = check_one_namespace(obj, ctx)) != 0)
++			return err;
++	}
++
++	return 0;
++}
++
++struct args_t
++{
++	int* pfd;
++	char* path;
++};
++
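++/* Body of the helper kernel thread spawned by cpt_dump_tmpfs(): enter the
++ * VE, wire stdout to the pipe and stdin/stderr to /dev/null, close all
++ * other descriptors and exec /bin/tar, which streams the tmpfs contents
++ * back to the parent through the pipe.
++ */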
++static int dumptmpfs(void *arg)
++{
++	int i;
++	struct args_t *args = arg;
++	int *pfd = args->pfd;
++	int fd0, fd2;
++	char *path = args->path;
++	char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL };
++
++	i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
++	if (i < 0) {
++		eprintk("cannot enter ve to dump tmpfs\n");
++		module_put(THIS_MODULE);
++		return 255 << 8;
++	}
++
++	if (pfd[1] != 1)
++		sc_dup2(pfd[1], 1);
++	set_fs(KERNEL_DS);
++	fd0 = sc_open("/dev/null", O_RDONLY, 0);
++	fd2 = sc_open("/dev/null", O_WRONLY, 0);
++	if (fd0 < 0 || fd2 < 0) {
++		eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2);
++		module_put(THIS_MODULE);
++		return 255 << 8;
++	}
++	if (fd0 != 0)
++		sc_dup2(fd0, 0);
++	if (fd2 != 2)
++		sc_dup2(fd2, 2);
++
++	for (i = 3; i < current->files->fdt->max_fds; i++) {
++		sc_close(i);
++	}
++
++	module_put(THIS_MODULE);
++
++	i = sc_execve("/bin/tar", argv, NULL);
++	eprintk("failed to exec /bin/tar: %d\n", i);
++	return 255 << 8;
++}
++
++static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx)
++{
++	int err;
++	int pid;
++	int pfd[2];
++	struct file *f;
++	struct cpt_object_hdr v;
++	char buf[16];
++	int n;
++	loff_t saved_obj;
++	struct args_t args;
++	int status;
++	mm_segment_t oldfs;
++	sigset_t ignore, blocked;
++
++	err = sc_pipe(pfd);
++	if (err < 0)
++		return err;
++	args.pfd = pfd;
++	args.path = path;
++	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
++	sigprocmask(SIG_BLOCK, &ignore, &blocked);
++	err = pid = local_kernel_thread(dumptmpfs, (void*)&args,
++			SIGCHLD | CLONE_VFORK, 0);
++	if (err < 0) {
++		eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
++		goto out;
++	}
++	f = fget(pfd[0]);
++	sc_close(pfd[1]);
++	sc_close(pfd[0]);
++
++	cpt_push_object(&saved_obj, ctx);
++	cpt_open_object(NULL, ctx);
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NAME;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_NAME;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	do {
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
++		set_fs(oldfs);
++		if (n > 0)
++			ctx->write(buf, n, ctx);
++	} while (n > 0);
++
++	fput(f);
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if ((err = sc_waitx(pid, 0, &status)) < 0)
++		eprintk_ctx("wait4: %d\n", err);
++	else if ((status & 0x7f) == 0) {
++		err = (status & 0xff00) >> 8;
++		if (err != 0) {
++			eprintk_ctx("tar exited with %d\n", err);
++			err = -EINVAL;
++		}
++	} else {
++		eprintk_ctx("tar terminated\n");
++		err = -EINVAL;
++	}
++	set_fs(oldfs);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++
++	buf[0] = 0;
++	ctx->write(buf, 1, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_obj, ctx);
++	return n ? : err;
++
++out:
++	if (pfd[1] >= 0)
++		sc_close(pfd[1]);
++	if (pfd[0] >= 0)
++		sc_close(pfd[0]);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++	return err;
++}
++
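++/* Return 1 if an earlier mount in the namespace list shares this mount's
++ * superblock, i.e. mnt is effectively a bind of an already-seen mount.
++ */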
++static int loopy_root(struct vfsmount *mnt)
++{
++	struct list_head *p;
++
++	list_for_each(p, &mnt->mnt_ns->list) {
++		struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list);
++		if (m == mnt)
++			return 0;
++		if (m->mnt_sb == mnt->mnt_sb)
++			return 1;
++	}
++	/* Cannot happen */
++	return 0;
++}
++
++static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx)
++{
++	struct list_head *p;
++	int err = -EINVAL;
++
++	/* One special case: mount --bind /a /a */
++	if (mnt->mnt_root == mnt->mnt_mountpoint)
++		return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx);
++
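++	/* Otherwise walk the mounts preceding us in the namespace list and
++	 * dump our root relative to the first one sharing the same
++	 * superblock - the mount this bind was (or could have been) made
++	 * from.
++	 */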
++	list_for_each_prev(p, &mnt->mnt_list) {
++		struct vfsmount * m;
++
++		if (p == &mnt->mnt_ns->list)
++			break;
++
++		m = list_entry(p, struct vfsmount, mnt_list);
++
++		if (m->mnt_sb != mnt->mnt_sb)
++			continue;
++
++		err = cpt_dump_dentry(mnt->mnt_root, m, 0, ctx);
++		if (err == 0)
++			break;
++	}
++	return err;
++}
++
++static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx)
++{
++	int err = 0;
++	struct cpt_vfsmount_image v;
++	loff_t saved_obj;
++	char *path_buf, *path;
++	struct path p;
++
++	path_buf = (char *) __get_free_page(GFP_KERNEL);
++	if (!path_buf)
++		return -ENOMEM;
++
++	p.dentry = mnt->mnt_root;
++	p.mnt = mnt;
++	path = d_path(&p, path_buf, PAGE_SIZE);
++	if (IS_ERR(path)) {
++		free_page((unsigned long) path_buf);
++		return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path);
++	}
++
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_VFSMOUNT;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_ARRAY;
++
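++	/* Classify the mount: one charged to a foreign beancounter came from
++	 * the host and is marked CPT_MNT_EXT; otherwise a mount whose root is
++	 * not the superblock root, or which shares a superblock with an
++	 * earlier mount, is recorded as a bind mount (CPT_MNT_BIND).
++	 */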
++	v.cpt_mntflags = mnt->mnt_flags;
++	if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) {
++		v.cpt_mntflags |= CPT_MNT_EXT;
++	} else {
++		if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt))
++			v.cpt_mntflags |= CPT_MNT_BIND;
++	}
++	v.cpt_flags = mnt->mnt_sb->s_flags;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	cpt_dump_string(mnt->mnt_devname ? : "none", ctx);
++	cpt_dump_string(path, ctx);
++	cpt_dump_string(mnt->mnt_sb->s_type->name, ctx);
++
++	if (v.cpt_mntflags & CPT_MNT_BIND)
++		err = cpt_dump_bind_mnt(mnt, ctx);
++	else if (!(v.cpt_mntflags & CPT_MNT_EXT) &&
++		   strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) {
++		mntget(mnt);
++		up_read(&namespace_sem);
++		err = cpt_dump_tmpfs(path, ctx);
++		down_read(&namespace_sem);
++		if (!err) {
++			if (list_empty(&mnt->mnt_list))
++				err = -EBUSY;
++		}
++		mntput(mnt);
++	}
++
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++	if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS)
++		vefs_track_force_stop(mnt->mnt_sb);
++
++	free_page((unsigned long) path_buf);
++
++	return err;
++}
++
++static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct mnt_namespace *n = obj->o_obj;
++	struct cpt_object_hdr v;
++	struct list_head *p;
++	loff_t saved_obj;
++	int err = 0;
++
++	cpt_open_object(obj, ctx);
++
++	v.cpt_next = -1;
++	v.cpt_object = CPT_OBJ_NAMESPACE;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_ARRAY;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++
++	down_read(&namespace_sem);
++	list_for_each(p, &n->list) {
++		err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx);
++		if (err)
++			break;
++	}
++	up_read(&namespace_sem);
++
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++
++	return err;
++}
++
++int cpt_dump_namespace(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_NAMESPACE);
++
++	for_each_object(obj, CPT_OBJ_NAMESPACE) {
++		int err;
++
++		if ((err = dump_one_namespace(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_files.h b/kernel/cpt/cpt_files.h
+new file mode 100644
+index 0000000..7770ab2
+--- /dev/null
++++ b/kernel/cpt/cpt_files.h
+@@ -0,0 +1,71 @@
++int cpt_collect_files(cpt_context_t *);
++int cpt_collect_fs(cpt_context_t *);
++int cpt_collect_namespace(cpt_context_t *);
++int cpt_collect_sysvsem_undo(cpt_context_t *);
++int cpt_collect_tty(struct file *, cpt_context_t *);
++int cpt_dump_files(struct cpt_context *ctx);
++int cpt_dump_files_struct(struct cpt_context *ctx);
++int cpt_dump_fs_struct(struct cpt_context *ctx);
++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx);
++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx);
++int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx);
++struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx);
++struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx);
++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx);
++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx);
++
++int rst_posix_locks(struct cpt_context *ctx);
++
++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx);
++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_restore_fs(struct cpt_context *ctx);
++
++int cpt_collect_sysv(cpt_context_t *);
++int cpt_dump_sysvsem(struct cpt_context *ctx);
++int cpt_dump_sysvmsg(struct cpt_context *ctx);
++int rst_sysv_ipc(struct cpt_context *ctx);
++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int cpt_dump_namespace(struct cpt_context *ctx);
++int rst_root_namespace(struct cpt_context *ctx);
++
++int rst_stray_files(struct cpt_context *ctx);
++int rst_tty_jobcontrol(struct cpt_context *ctx);
++
++void rst_flush_filejobs(struct cpt_context *);
++int rst_do_filejobs(struct cpt_context *);
++
++extern struct file_operations eventpoll_fops;
++int rst_eventpoll(struct cpt_context *);
++struct file *cpt_open_epolldev(struct cpt_file_image *fi,
++			       unsigned flags,
++			       struct cpt_context *ctx);
++int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *);
++
++int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx);
++int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp,
++		   loff_t *pos, struct cpt_context *ctx);
++
++int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx);
++int rst_inotify(cpt_context_t *ctx);
++struct file *rst_open_inotify(struct cpt_file_image *fi,
++			      unsigned flags,
++			      struct cpt_context *ctx);
++
++
++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
++			 cpt_context_t *ctx);
++
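++/* Nonzero when the mount's filesystem type is not on the list of types we
++ * know how to checkpoint; check_one_namespace() then fails the dump.
++ */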
++#define check_one_vfsmount(mnt) \
++	(strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \
++	 strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0)
+diff --git a/kernel/cpt/cpt_fsmagic.h b/kernel/cpt/cpt_fsmagic.h
+new file mode 100644
+index 0000000..142e539
+--- /dev/null
++++ b/kernel/cpt/cpt_fsmagic.h
+@@ -0,0 +1,16 @@
++/* Collected from kernel sources. */
++
++#define FSMAGIC_TMPFS	0x01021994
++#define FSMAGIC_PIPEFS	0x50495045
++#define FSMAGIC_SOCKFS	0x534F434B
++#define FSMAGIC_PFMFS	0xa0b4d889
++#define FSMAGIC_BDEV	0x62646576
++#define FSMAGIC_FUTEX	0x0BAD1DEA
++#define FSMAGIC_INOTIFY	0x2BAD1DEA
++#define FSMAGIC_MQUEUE	0x19800202
++#define FSMAGIC_PROC	0x9fa0
++#define FSMAGIC_DEVPTS	0x1CD1
++#define FSMAGIC_AUTOFS	0x0187
++#define FSMAGIC_EXT2	0xEF53
++#define FSMAGIC_REISER	0x52654973
++#define FSMAGIC_VEFS    0x565a4653
+diff --git a/kernel/cpt/cpt_inotify.c b/kernel/cpt/cpt_inotify.c
+new file mode 100644
+index 0000000..4d4637e
+--- /dev/null
++++ b/kernel/cpt/cpt_inotify.c
+@@ -0,0 +1,144 @@
++/*
++ *
++ *  kernel/cpt/cpt_inotify.c
++ *
++ *  Copyright (C) 2000-2007  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mnt_namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/inotify.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++extern struct file_operations inotify_fops;
++
++int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx)
++{
++	int err = 0;
++	struct file *file = obj->o_obj;
++	struct inotify_device *dev;
++	struct inotify_watch *watch;
++	struct inotify_kernel_event *kev;
++	struct cpt_inotify_image ii;
++
++	if (file->f_op != &inotify_fops) {
++		eprintk_ctx("bad inotify file\n");
++		return -EINVAL;
++	}
++
++	dev = file->private_data;
++
++	/* inotify_user.c does not protect against opens via /proc/N/fd, silly.
++	 * The opener will get an invalid file with uninitialized private_data.
++	 */
++	if (unlikely(dev == NULL)) {
++		eprintk_ctx("bad inotify dev\n");
++		return -EINVAL;
++	}
++
++	cpt_open_object(NULL, ctx);
++
++	ii.cpt_next = CPT_NULL;
++	ii.cpt_object = CPT_OBJ_INOTIFY;
++	ii.cpt_hdrlen = sizeof(ii);
++	ii.cpt_content = CPT_CONTENT_ARRAY;
++	ii.cpt_file = obj->o_pos;
++	ii.cpt_user = dev->user->uid;
++	ii.cpt_max_events = dev->max_events;
++	ii.cpt_last_wd = dev->ih->last_wd;
++
++	ctx->write(&ii, sizeof(ii), ctx);
++
++	mutex_lock(&dev->ih->mutex);
++	list_for_each_entry(watch, &dev->ih->watches, h_list) {
++		loff_t saved_obj;
++		loff_t saved_obj2;
++		struct cpt_inotify_wd_image wi;
++
++		cpt_push_object(&saved_obj, ctx);
++		cpt_open_object(NULL, ctx);
++
++		wi.cpt_next = CPT_NULL;
++		wi.cpt_object = CPT_OBJ_INOTIFY_WATCH;
++		wi.cpt_hdrlen = sizeof(wi);
++		wi.cpt_content = CPT_CONTENT_ARRAY;
++		wi.cpt_wd = watch->wd;
++		wi.cpt_mask = watch->mask;
++
++		ctx->write(&wi, sizeof(wi), ctx);
++
++		cpt_push_object(&saved_obj2, ctx);
++		err = cpt_dump_dir(watch->path.dentry, watch->path.mnt, ctx);
++		cpt_pop_object(&saved_obj2, ctx);
++		if (err)
++			break;
++
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++	mutex_unlock(&dev->ih->mutex);
++
++	if (err)
++		return err;
++
++	mutex_lock(&dev->ev_mutex);
++	list_for_each_entry(kev, &dev->events, list) {
++		loff_t saved_obj;
++		struct cpt_inotify_ev_image ei;
++
++		cpt_push_object(&saved_obj, ctx);
++		cpt_open_object(NULL, ctx);
++
++		ei.cpt_next = CPT_NULL;
++		ei.cpt_object = CPT_OBJ_INOTIFY_EVENT;
++		ei.cpt_hdrlen = sizeof(ei);
++		ei.cpt_content = CPT_CONTENT_NAME;
++		ei.cpt_wd = kev->event.wd;
++		ei.cpt_mask = kev->event.mask;
++		ei.cpt_cookie = kev->event.cookie;
++		ei.cpt_namelen = kev->name ? strlen(kev->name) : 0;
++
++		ctx->write(&ei, sizeof(ei), ctx);
++
++		if (kev->name) {
++			ctx->write(kev->name, ei.cpt_namelen+1, ctx);
++			ctx->align(ctx);
++		}
++
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++	mutex_unlock(&dev->ev_mutex);
++
++	cpt_close_object(ctx);
++
++	return err;
++}
+diff --git a/kernel/cpt/cpt_kernel.c b/kernel/cpt/cpt_kernel.c
+new file mode 100644
+index 0000000..5eb7f1c
+--- /dev/null
++++ b/kernel/cpt/cpt_kernel.c
+@@ -0,0 +1,177 @@
++/*
++ *
++ *  kernel/cpt/cpt_kernel.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#define __KERNEL_SYSCALLS__ 1
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/kernel.h>
++#ifdef CONFIG_X86
++#include <asm/cpufeature.h>
++#endif
++#include <linux/cpt_image.h>
++
++#include "cpt_kernel.h"
++#include "cpt_syscalls.h"
++
++int debug_level = 1;
++
++#ifdef CONFIG_X86_32
++
++/*
++ * Create a kernel thread
++ */
++extern void kernel_thread_helper(void);
++int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
++{
++	struct pt_regs regs;
++
++	memset(&regs, 0, sizeof(regs));
++
++	regs.bx = (unsigned long) fn;
++	regs.dx = (unsigned long) arg;
++
++	regs.ds = __USER_DS;
++	regs.es = __USER_DS;
++	regs.fs = __KERNEL_PERCPU;
++	regs.orig_ax = -1;
++	regs.ip = (unsigned long) kernel_thread_helper;
++	regs.cs = __KERNEL_CS | get_kernel_rpl();
++	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
++
++	/* Ok, create the new process.. */
++	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid);
++}
++#endif
++
++#ifdef CONFIG_IA64
++pid_t
++asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid)
++{
++	extern void start_kernel_thread (void);
++	unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
++	struct {
++		struct switch_stack sw;
++		struct pt_regs pt;
++	} regs;
++
++	memset(&regs, 0, sizeof(regs));
++	regs.pt.cr_iip = helper_fptr[0];	/* set entry point (IP) */
++	regs.pt.r1 = helper_fptr[1];		/* set GP */
++	regs.pt.r9 = (unsigned long) fn;	/* 1st argument */
++	regs.pt.r11 = (unsigned long) arg;	/* 2nd argument */
++	/* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
++	regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
++	regs.pt.cr_ifs = 1UL << 63;		/* mark as valid, empty frame */
++	regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
++	regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
++	regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/);
++	return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL, pid);
++}
++#endif
++
++int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
++{
++	pid_t ret;
++
++	if (current->fs == NULL) {
++		/* do_fork_pid() hates processes without fs, oopses. */
++		printk("CPT BUG: local_kernel_thread: current->fs==NULL\n");
++		return -EINVAL;
++	}
++	if (!try_module_get(THIS_MODULE))
++		return -EBUSY;
++	ret = asm_kernel_thread(fn, arg, flags, pid);
++	if (ret < 0)
++		module_put(THIS_MODULE);
++	return ret;
++}
++
++#ifdef __i386__
++int __execve(const char *file, char **argv, char **envp)
++{
++	long res;
++	__asm__ volatile ("int $0x80"
++	: "=a" (res)
++	: "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)),
++		  "d" ((long)(envp)) : "memory");
++	return (int)res;
++}
++#endif
++
++int sc_execve(char *cmd, char **argv, char **env)
++{
++	int ret;
++#ifndef __i386__
++	ret = kernel_execve(cmd, argv, env);
++#else
++	ret = __execve(cmd, argv, env);
++#endif
++	return ret;
++}
++
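++/* Record the source CPU's feature bits in the image; on restore they are
++ * compared against the destination CPU (see the test_one_flag() macros in
++ * cpt_kernel.h), so a container is not resumed on hardware lacking features
++ * its tasks may already have used.
++ */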
++unsigned int test_cpu_caps(void)
++{
++	unsigned int flags = 0;
++
++#ifdef CONFIG_X86
++	if (boot_cpu_has(X86_FEATURE_CMOV))
++		flags |= 1 << CPT_CPU_X86_CMOV;
++	if (cpu_has_fxsr)
++		flags |= 1 << CPT_CPU_X86_FXSR;
++	if (cpu_has_xmm)
++		flags |= 1 << CPT_CPU_X86_SSE;
++#ifndef CONFIG_X86_64
++	if (cpu_has_xmm2)
++#endif
++		flags |= 1 << CPT_CPU_X86_SSE2;
++	if (cpu_has_mmx)
++		flags |= 1 << CPT_CPU_X86_MMX;
++	if (boot_cpu_has(X86_FEATURE_3DNOW))
++		flags |= 1 << CPT_CPU_X86_3DNOW;
++	if (boot_cpu_has(X86_FEATURE_3DNOWEXT))
++		flags |= 1 << CPT_CPU_X86_3DNOW2;
++	if (boot_cpu_has(X86_FEATURE_SYSCALL))
++		flags |= 1 << CPT_CPU_X86_SYSCALL;
++#ifdef CONFIG_X86_64
++	if (boot_cpu_has(X86_FEATURE_SYSCALL) &&
++			boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
++		flags |= 1 << CPT_CPU_X86_SYSCALL32;
++#endif
++	if (boot_cpu_has(X86_FEATURE_SEP)
++#ifdef CONFIG_X86_64
++			&& boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
++#endif
++	   )
++		flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32));
++#ifdef CONFIG_X86_64
++	flags |= 1 << CPT_CPU_X86_EMT64;
++#endif
++#endif
++#ifdef CONFIG_IA64
++	flags |= 1 << CPT_CPU_X86_IA64;
++	flags |= 1 << CPT_CPU_X86_FXSR;
++#endif
++	return flags;
++}
++
++unsigned int test_kernel_config(void)
++{
++	unsigned int flags = 0;
++#ifdef CONFIG_X86
++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++	flags |= 1 << CPT_KERNEL_CONFIG_PAE;
++#endif
++#endif
++	return flags;
++}
+diff --git a/kernel/cpt/cpt_kernel.h b/kernel/cpt/cpt_kernel.h
+new file mode 100644
+index 0000000..9254778
+--- /dev/null
++++ b/kernel/cpt/cpt_kernel.h
+@@ -0,0 +1,99 @@
++/* Interface to kernel vars which we had to _add_. */
++
++#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
++#define TASK_TRACED TASK_STOPPED
++#define unix_peer(sk) ((sk)->sk_pair)
++#define page_mapcount(pg) ((pg)->mapcount)
++#else
++#define unix_peer(sk) (unix_sk(sk)->peer)
++#endif
++
++#ifdef CONFIG_IA64
++#define cpu_has_fxsr 1
++#endif
++
++#define CPT_SIG_IGNORE_MASK (\
++        (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \
++	(1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1)))
++
++static inline void do_gettimespec(struct timespec *ts)
++{
++	struct timeval tv;
++	do_gettimeofday(&tv);
++	ts->tv_sec = tv.tv_sec;
++	ts->tv_nsec = tv.tv_usec*1000;
++}
++
++int local_kernel_thread(int (*fn)(void *),
++		void * arg,
++		unsigned long flags,
++		pid_t pid);
++int asm_kernel_thread(int (*fn)(void *),
++		void * arg,
++		unsigned long flags,
++		pid_t pid);
++
++#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE)
++void vefs_track_force_stop(struct super_block *super);
++
++void vefs_track_notify(struct dentry *vdentry, int track_cow);
++
++struct dentry * vefs_replaced_dentry(struct dentry *de);
++int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde);
++#else
++static inline void vefs_track_force_stop(struct super_block *super) { };
++
++static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { };
++#endif
++
++unsigned int test_cpu_caps(void);
++unsigned int test_kernel_config(void);
++
++#define test_one_flag_old(src, dst, flag, message, ret) \
++if (src & (1 << flag)) \
++	if (!(dst & (1 << flag))) { \
++		wprintk("Destination cpu does not have " message "\n"); \
++		ret = 1; \
++	}
++#define test_one_flag(src, dst, flag, message, ret) \
++if (src & (1 << flag)) \
++	if (!(dst & (1 << flag))) { \
++		eprintk_ctx("Destination cpu does not have " message "\n"); \
++		ret = 1; \
++	}
++
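++/* Local copies of set_normalized_timespec()/ns_to_timespec(), presumably
++ * duplicated here so the module does not depend on kernel exports that may
++ * be unavailable.
++ */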
++static inline void
++_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
++{
++	while (nsec >= NSEC_PER_SEC) {
++		nsec -= NSEC_PER_SEC;
++		++sec;
++	}
++	while (nsec < 0) {
++		nsec += NSEC_PER_SEC;
++		--sec;
++	}
++	ts->tv_sec = sec;
++	ts->tv_nsec = nsec;
++}
++
++static inline struct timespec
++_ns_to_timespec(const s64 nsec)
++{
++	struct timespec ts;
++	s32 rem;
++
++	if (!nsec)
++		return (struct timespec) {0, 0};
++
++	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
++	if (unlikely(rem < 0)) {
++		ts.tv_sec--;
++		rem += NSEC_PER_SEC;
++	}
++	ts.tv_nsec = rem;
++
++	return ts;
++}
+diff --git a/kernel/cpt/cpt_mm.c b/kernel/cpt/cpt_mm.c
+new file mode 100644
+index 0000000..a3d8c8e
+--- /dev/null
++++ b/kernel/cpt/cpt_mm.c
+@@ -0,0 +1,918 @@
++/*
++ *
++ *  kernel/cpt/cpt_mm.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/hugetlb.h>
++#include <linux/errno.h>
++#include <linux/ve.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#ifdef CONFIG_X86
++#include <asm/ldt.h>
++#endif
++#include <asm/mmu.h>
++#include <linux/cpt_image.h>
++#include <linux/shm.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++#include "cpt_pagein.h"
++#endif
++#include "cpt_ubc.h"
++
++static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
++			       cpt_context_t *ctx)
++{
++	if (!list_empty(&aio_ctx->run_list)) {
++		/* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */
++		eprintk_ctx("run list is not empty, cannot suspend AIO\n");
++		return -EBUSY;
++	}
++
++	/* Wait for pending IOCBs. Linux AIO is mostly _fake_.
++	 * It is actually synchronous, except for direct IO and
++	 * some funny raw USB things, which cannot happen inside a VE.
++	 * However, we do this for the future.
++	 *
++	 * Later note: in 2.6.16 we may allow O_DIRECT, so this
++	 * code is not meaningless.
++	 */
++	wait_for_all_aios(aio_ctx);
++
++	if (!list_empty(&aio_ctx->run_list) ||
++	    !list_empty(&aio_ctx->active_reqs) ||
++	    aio_ctx->reqs_active) {
++		eprintk_ctx("were not able to suspend AIO\n");
++		return -EBUSY;
++	}
++
++	return 0;
++}
++
++static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
++{
++	struct vm_area_struct *vma;
++
++	for (vma = mm->mmap; vma; vma = vma->vm_next) {
++		if (vma->vm_file) {
++			if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
++				return -ENOMEM;
++		}
++	}
++#ifdef CONFIG_BEANCOUNTERS
++	if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
++		return -ENOMEM;
++#endif
++
++	if (mm->ioctx_list) {
++		struct kioctx *aio_ctx;
++		int err;
++
++		for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
++			if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
++				return err;
++	}
++
++	return 0;
++}
++
++int cpt_collect_mm(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++	int err;
++	int index;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
++			return -ENOMEM;
++	}
++
++	index = 1;
++	for_each_object(obj, CPT_OBJ_MM) {
++		struct mm_struct *mm = obj->o_obj;
++		if (obj->o_count != atomic_read(&mm->mm_users)) {
++			eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users));
++			return -EAGAIN;
++		}
++		cpt_obj_setindex(obj, index++, ctx);
++
++		if ((err = collect_one_mm(mm, ctx)) != 0)
++			return err;
++	}
++
++	return 0;
++}
++
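++/* Dump statistics: zero, shared, cloned and unsupported pages
++ * (see the dprintk summary in cpt_dump_vm() below). */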
++static int zcnt, scnt, scnt0, ucnt;
++
++/* where_is_anon_page() returns the position of an anonymous page in the mm
++ * of an already dumped process. This happens e.g. after fork(). We do not use
++ * this right now, just keep statistics; it is difficult to restore such state,
++ * but the most direct use is to save space in the dumped image. */
++
++
++static inline unsigned long
++vma_address0(struct page *page, struct vm_area_struct *vma)
++{
++	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
++	unsigned long address;
++
++	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
++		address |= 1;
++	return address;
++}
++
++static int really_this_one(struct vm_area_struct *vma, unsigned long address,
++			   struct page *page)
++{
++	struct mm_struct *mm = vma->vm_mm;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *pte;
++	spinlock_t *ptl;
++	int result;
++
++	pgd = pgd_offset(mm, address);
++	if (unlikely(!pgd_present(*pgd)))
++		return 0;
++
++	pud = pud_offset(pgd, address);
++	if (!pud_present(*pud))
++		return 0;
++
++	pmd = pmd_offset(pud, address);
++	if (unlikely(!pmd_present(*pmd)))
++		return 0;
++
++	result = 0;
++	pte = pte_offset_map(pmd, address);
++	if (!pte_present(*pte)) {
++		pte_unmap(pte);
++		return 0;
++	}
++
++	ptl = pte_lockptr(mm, pmd);
++	spin_lock(ptl);
++	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
++		result = 1;
++	pte_unmap_unlock(pte, ptl);
++	return result;
++}
++
++static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
++				 struct page *page, cpt_context_t * ctx)
++{
++	loff_t mmptr = CPT_NULL;
++	struct anon_vma *anon_vma;
++	struct vm_area_struct *vma;
++	int idx = mmobj->o_index;
++
++	if (!PageAnon(page))
++		return CPT_NULL;
++
++	anon_vma = page_lock_anon_vma(page);
++	if (!anon_vma)
++		return CPT_NULL;
++
++	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
++		unsigned long addr = vma_address0(page, vma);
++		cpt_object_t *obj;
++
++		/* We do not try to support mremapped regions (addr != mapaddr),
++		 * only mmaps directly inherited via fork().
++		 * With this limitation we may check self-consistency of
++		 * vmas (vm_start, vm_pgoff, anon_vma) before
++		 * doing __copy_page_range() in rst_mm.
++		 */
++		if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
++			obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
++			if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
++				if (really_this_one(vma, addr, page)) {
++					mmptr = obj->o_pos;
++					idx = obj->o_index;
++				}
++			}
++		}
++	}
++	page_unlock_anon_vma(anon_vma);
++
++	return mmptr;
++}
++
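++/* A page_area describes a run of consecutive pages of the same type, so
++ * that a whole run can be written as a single block; list[] buffers up
++ * to 16 page indices for the iterative (PD_ITER*) cases. */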
++struct page_area
++{
++	int type;
++	unsigned long start;
++	unsigned long end;
++	pgoff_t pgoff;
++	loff_t mm;
++	__u64 list[16];
++};
++
++struct page_desc
++{
++	int	type;
++	pgoff_t	index;
++	loff_t	mm;
++	int	shared;
++};
++
++enum {
++	PD_ABSENT,
++	PD_COPY,
++	PD_ZERO,
++	PD_CLONE,
++	PD_FUNKEY,
++	PD_LAZY,
++	PD_ITER,
++	PD_ITERYOUNG,
++};
++
++/* PD_ABSENT: the page can be obtained from the backing store, is a not yet
++ *     mapped anonymous page, or something else which does not require a copy.
++ * PD_COPY:   the page requires a copy.
++ * PD_ZERO:   the page requires a copy but its content is zero. Quite useless.
++ * PD_CLONE:  a wp page shared after fork(). It is to be COWed when modified.
++ * PD_FUNKEY: the page is something unsupported... We copy it right now.
++ */
++
++
++
++static void page_get_desc(cpt_object_t *mmobj,
++			  struct vm_area_struct *vma, unsigned long addr,
++			  struct page_desc *pdesc, cpt_context_t * ctx)
++{
++	struct mm_struct *mm = vma->vm_mm;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *ptep, pte;
++	spinlock_t *ptl;
++	struct page *pg = NULL;
++	pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
++
++	pdesc->index = linear_index;
++	pdesc->shared = 0;
++	pdesc->mm = CPT_NULL;
++
++	if (vma->vm_flags & VM_IO) {
++		pdesc->type = PD_ABSENT;
++		return;
++	}
++
++	pgd = pgd_offset(mm, addr);
++	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
++		goto out_absent;
++	pud = pud_offset(pgd, addr);
++	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
++		goto out_absent;
++	pmd = pmd_offset(pud, addr);
++	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
++		goto out_absent;
++#ifdef CONFIG_X86
++	if (pmd_huge(*pmd)) {
++		eprintk_ctx("page_huge\n");
++		goto out_unsupported;
++	}
++#endif
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++retry:
++#endif
++	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++	pte = *ptep;
++	pte_unmap(ptep);
++
++	if (pte_none(pte))
++		goto out_absent_unlock;
++
++	if (!pte_present(pte)) {
++		if (pte_file(pte)) {
++			pdesc->index = pte_to_pgoff(pte);
++			goto out_absent_unlock;
++		}
++		if (vma->vm_flags & VM_SHARED) {
++			/* It is impossible: shared mappings cannot be in swap */
++			eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
++			goto out_unsupported_unlock;
++		}
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++		/* Otherwise it is in swap. */
++		if (!ctx->lazy_vm) {
++			int err;
++			/* If lazy transfer is not enabled,
++			 * raise it from swap now, so that we
++			 * save at least when the page is shared.
++			 */
++			spin_unlock(ptl);
++			err = handle_mm_fault(mm, vma, addr, 0);
++			if (err == VM_FAULT_SIGBUS)
++				goto out_absent;
++			if (err == VM_FAULT_OOM)
++				goto out_absent;
++			err = 0;
++			goto retry;
++		}
++#endif
++		pdesc->type = PD_LAZY;
++		goto out_unlock;
++	}
++
++	if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
++		pdesc->type = PD_COPY;
++		goto out_unlock;
++	}
++
++	get_page(pg);
++	spin_unlock(ptl);
++
++	if (pg->mapping && !PageAnon(pg)) {
++		if (vma->vm_file == NULL) {
++			eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
++			goto out_unsupported;
++		}
++		if (vma->vm_file->f_mapping != pg->mapping) {
++			eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n",
++				    addr, vma->vm_file->f_mapping, pg->mapping,
++				    mmobj->o_pos);
++			goto out_unsupported;
++		}
++		pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
++		/* The page is in the backing store. For us it is as if
++		 * it were not present.
++		 */
++		goto out_absent;
++	}
++
++	if (PageReserved(pg)) {
++		/* Special case: ZERO_PAGE is used when an
++		 * anonymous page is accessed but not written. */
++		if (pg == ZERO_PAGE(addr)) {
++			if (pte_write(pte)) {
++				eprintk_ctx("not funny already, writable ZERO_PAGE\n");
++				goto out_unsupported;
++			}
++			zcnt++;
++			goto out_absent;
++		}
++		eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
++			    addr, mmobj->o_pos);
++		goto out_unsupported;
++	}
++
++	if (pg == ZERO_PAGE(addr)) {
++		wprintk_ctx("that's how it works now\n");
++	}
++
++	if (!pg->mapping) {
++		eprintk_ctx("page without mapping at %08lx@%Ld\n", addr,
++			    mmobj->o_pos);
++		goto out_unsupported;
++	}
++
++	if (pg->mapping && page_mapcount(pg) > 1) {
++		pdesc->shared = 1;
++		pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
++		if (pdesc->mm != CPT_NULL) {
++			scnt0++;
++			pdesc->type = PD_CLONE;
++			goto out_put;
++		} else {
++			scnt++;
++		}
++	}
++#ifdef CONFIG_VZ_CHECKPOINT_ITER
++	if (ctx->iter_done &&
++	    test_bit(PG_checkpointed, &pg->flags)) {
++		if (pte_write(pte)) {
++			wprintk_ctx("writable PG_checkpointed page\n");
++		}
++		pdesc->index = page_to_pfn(pg);
++		pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
++		goto out_put;
++	}
++#endif
++	pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY;
++
++out_put:
++	if (pg)
++		put_page(pg);
++	return;
++
++out_unlock:
++	spin_unlock(ptl);
++	goto out_put;
++
++out_absent_unlock:
++	spin_unlock(ptl);
++out_absent:
++	pdesc->type = PD_ABSENT;
++	goto out_put;
++
++out_unsupported_unlock:
++	spin_unlock(ptl);
++out_unsupported:
++	ucnt++;
++	pdesc->type = PD_FUNKEY;
++	goto out_put;
++}
++
++/* ATTN: we pass "current" to get_user_pages(). This is wrong, but
++ * get_user_pages() does not really need it; it just stores some page
++ * fault stats there.
++ *
++ * BUG: some archs (e.g. sparc64, but not Intel) require flushing cache
++ * pages before accessing the vma.
++ */
++void dump_pages(struct vm_area_struct *vma, unsigned long start,
++		unsigned long end, struct cpt_context *ctx)
++{
++#define MAX_PAGE_BATCH 16
++	struct page *pg[MAX_PAGE_BATCH];
++	int npages = (end - start)/PAGE_SIZE;
++	int count = 0;
++
++	while (count < npages) {
++		int copy = npages - count;
++		int n;
++
++		if (copy > MAX_PAGE_BATCH)
++			copy = MAX_PAGE_BATCH;
++		n = get_user_pages(current, vma->vm_mm, start, copy,
++				   0, 1, pg, NULL);
++		if (n == copy) {
++			int i;
++			for (i=0; i<n; i++) {
++				char *maddr = kmap(pg[i]);
++				ctx->write(maddr, PAGE_SIZE, ctx);
++				kunmap(pg[i]);
++			}
++		} else {
++			eprintk_ctx("get_user_pages fault\n");
++			for ( ; n > 0; n--)
++				page_cache_release(pg[n-1]);
++			return;
++		}
++		start += n*PAGE_SIZE;
++		count += n;
++		for ( ; n > 0; n--)
++			page_cache_release(pg[n-1]);
++	}
++	return;
++}
++
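++/* Each dump_*_block() helper below emits one image object describing a
++ * run of pages: raw contents (PAGES/LAZYPAGES), a remapped file offset
++ * (REMAPPAGES), a clone from an already dumped mm (COPYPAGES), or a pfn
++ * list for iterative migration (ITERPAGES). */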
++int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb,
++		    int copy,
++		    struct cpt_context *ctx)
++{
++	loff_t saved_object;
++
++	cpt_push_object(&saved_object, ctx);
++
++	pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES;
++	pgb->cpt_hdrlen = sizeof(*pgb);
++	pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? CPT_CONTENT_DATA : CPT_CONTENT_VOID;
++
++	ctx->write(pgb, sizeof(*pgb), ctx);
++	if (copy == PD_COPY || copy == PD_LAZY)
++		dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
++int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa,
++			 struct cpt_context *ctx)
++{
++	struct cpt_remappage_block pgb;
++	loff_t saved_object;
++
++	cpt_push_object(&saved_object, ctx);
++
++	pgb.cpt_object = CPT_OBJ_REMAPPAGES;
++	pgb.cpt_hdrlen = sizeof(pgb);
++	pgb.cpt_content = CPT_CONTENT_VOID;
++	pgb.cpt_start = pa->start;
++	pgb.cpt_end = pa->end;
++	pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1;
++
++	ctx->write(&pgb, sizeof(pgb), ctx);
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
++int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa,
++			struct cpt_context *ctx)
++{
++	struct cpt_copypage_block pgb;
++	loff_t saved_object;
++
++	cpt_push_object(&saved_object, ctx);
++
++	pgb.cpt_object = CPT_OBJ_COPYPAGES;
++	pgb.cpt_hdrlen = sizeof(pgb);
++	pgb.cpt_content = CPT_CONTENT_VOID;
++	pgb.cpt_start = pa->start;
++	pgb.cpt_end = pa->end;
++	pgb.cpt_source = pa->mm;
++
++	ctx->write(&pgb, sizeof(pgb), ctx);
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
++int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa,
++			cpt_context_t *ctx)
++{
++	struct cpt_lazypage_block pgb;
++	loff_t saved_object;
++
++	cpt_push_object(&saved_object, ctx);
++
++	pgb.cpt_object = CPT_OBJ_LAZYPAGES;
++	pgb.cpt_hdrlen = sizeof(pgb);
++	pgb.cpt_content = CPT_CONTENT_VOID;
++	pgb.cpt_start = pa->start;
++	pgb.cpt_end = pa->end;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start,
++				     (pa->end-pa->start)/PAGE_SIZE, ctx);
++#endif
++	ctx->write(&pgb, sizeof(pgb), ctx);
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
++int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa,
++			cpt_context_t *ctx)
++{
++	struct cpt_iterpage_block pgb;
++	loff_t saved_object;
++
++	cpt_push_object(&saved_object, ctx);
++
++	pgb.cpt_object = pa->type == PD_ITER ? CPT_OBJ_ITERPAGES :
++		CPT_OBJ_ITERYOUNGPAGES;
++	pgb.cpt_hdrlen = sizeof(pgb);
++	pgb.cpt_content = CPT_CONTENT_VOID;
++	pgb.cpt_start = pa->start;
++	pgb.cpt_end = pa->end;
++	ctx->write(&pgb, sizeof(pgb), ctx);
++
++	ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx);
++
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
++
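++/* Can the page described by pd extend the current run pa?  As a side
++ * effect, iterative runs append the page index to pa->list[], which caps
++ * such runs at 16 pages per block. */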
++static int can_expand(struct page_area *pa, struct page_desc *pd)
++{
++	if (pa->start == pa->end)
++		return 1;
++	if (pa->type != pd->type)
++		return 0;
++	if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
++		if (pa->end - pa->start >= PAGE_SIZE*16)
++			return 0;
++		pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index;
++	}
++	if (pa->type == PD_ABSENT)
++		return pd->index == pa->pgoff + 1;
++	if (pa->type == PD_CLONE)
++		return pd->mm == pa->mm;
++	return 1;
++}
++
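++/* Dump one VMA: write its header, then walk it page by page, coalescing
++ * pages of the same type into runs and flushing every completed run with
++ * the matching dump_*_block() helper. */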
++static int dump_one_vma(cpt_object_t *mmobj,
++			struct vm_area_struct *vma, struct cpt_context *ctx)
++{
++	struct cpt_vma_image *v = cpt_get_buf(ctx);
++	unsigned long addr;
++	loff_t saved_object;
++	struct cpt_page_block pgb;
++	struct page_area pa;
++	int cloned_pages = 0;
++
++	cpt_push_object(&saved_object, ctx);
++
++	v->cpt_object = CPT_OBJ_VMA;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_start = vma->vm_start;
++	v->cpt_end = vma->vm_end;
++	v->cpt_flags = vma->vm_flags;
++	if (vma->vm_flags&VM_HUGETLB) {
++		eprintk_ctx("huge TLB VMAs are still not supported\n");
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++	v->cpt_pgprot = vma->vm_page_prot.pgprot;
++	v->cpt_pgoff = vma->vm_pgoff;
++	v->cpt_file = CPT_NULL;
++#ifndef CONFIG_IA64
++	if ((void *)vma->vm_start == vma->vm_mm->context.vdso &&
++			vma->vm_ops == &special_mapping_vmops)
++		v->cpt_type = CPT_VMA_VDSO;
++	else
++#endif
++		v->cpt_type = CPT_VMA_TYPE_0;
++	v->cpt_anonvma = 0;
++
++	/* We have to remember which VMAs are bound to one anon_vma.
++	 * So we store an identifier for the group of VMAs. It is handy
++	 * to use the absolute address of the anon_vma as this identifier. */
++	v->cpt_anonvmaid = (unsigned long)vma->anon_vma;
++
++	if (vma->vm_file) {
++		struct file *filp;
++		cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx);
++		if (obj == NULL) BUG();
++		filp = obj->o_obj;
++		if (filp->f_op == &shm_file_operations) {
++			struct shm_file_data *sfd = filp->private_data;
++
++			v->cpt_type = CPT_VMA_TYPE_SHM;
++			obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx);
++		}
++		v->cpt_file = obj->o_pos;
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	if (v->cpt_type == CPT_VMA_VDSO)
++		goto out;
++
++	pa.type = PD_ABSENT;
++	pa.pgoff = vma->vm_pgoff;
++	pa.mm = CPT_NULL;
++	pa.start = vma->vm_start;
++	pa.end = vma->vm_start;
++
++	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
++		struct page_desc pd;
++
++		page_get_desc(mmobj, vma, addr, &pd, ctx);
++		cloned_pages += pd.shared;
++
++		if (pd.type == PD_FUNKEY) {
++			eprintk_ctx("dump_one_vma: funkey page\n");
++			return -EINVAL;
++		}
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++		if (pd.type == PD_LAZY &&
++		    (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED)))
++			pd.type = PD_COPY;
++#else
++		if (pd.type == PD_LAZY)
++			pd.type = PD_COPY;
++#endif
++
++		if (!can_expand(&pa, &pd)) {
++			if (pa.type == PD_COPY ||
++			    pa.type == PD_ZERO) {
++				pgb.cpt_start = pa.start;
++				pgb.cpt_end = pa.end;
++				dump_page_block(vma, &pgb, pa.type, ctx);
++			} else if (pa.type == PD_CLONE) {
++				dump_copypage_block(vma, &pa, ctx);
++				cloned_pages++;
++			} else if (pa.type == PD_LAZY) {
++				dump_lazypage_block(vma, &pa, ctx);
++			} else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) {
++				dump_iterpage_block(vma, &pa, ctx);
++				cloned_pages++;
++			} else if (pa.type == PD_ABSENT &&
++				   pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
++				dump_remappage_block(vma, &pa, ctx);
++			}
++			pa.start = addr;
++		}
++		pa.type = pd.type;
++		pa.end = addr + PAGE_SIZE;
++		pa.pgoff = pd.index;
++		if (addr == pa.start)
++			pa.list[0] = pd.index;
++		pa.mm = pd.mm;
++	}
++
++	if (pa.end > pa.start) {
++		if (pa.type == PD_COPY ||
++		    pa.type == PD_ZERO) {
++			pgb.cpt_start = pa.start;
++			pgb.cpt_end = pa.end;
++			dump_page_block(vma, &pgb, pa.type, ctx);
++		} else if (pa.type == PD_CLONE) {
++			dump_copypage_block(vma, &pa, ctx);
++			cloned_pages++;
++		} else if (pa.type == PD_LAZY) {
++			dump_lazypage_block(vma, &pa, ctx);
++		} else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) {
++			dump_iterpage_block(vma, &pa, ctx);
++			cloned_pages++;
++		} else if (pa.type == PD_ABSENT &&
++			   pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
++			dump_remappage_block(vma, &pa, ctx);
++		}
++	}
++
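++	/* At least one page was cloned or shared: go back and set the
++	 * cpt_anonvma flag in the VMA header written above. */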
++	if (cloned_pages) {
++		__u32 anonvma = 1;
++		loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma);
++		ctx->pwrite(&anonvma, 4, ctx, anonpos);
++	}
++
++out:
++	cpt_close_object(ctx);
++
++	cpt_pop_object(&saved_object, ctx);
++
++	return 0;
++}
++
++static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
++			    cpt_context_t *ctx)
++{
++	loff_t saved_object;
++	struct cpt_aio_ctx_image aimg;
++
++	if (!list_empty(&aio_ctx->run_list) ||
++	    !list_empty(&aio_ctx->active_reqs) ||
++	    aio_ctx->reqs_active) {
++		eprintk_ctx("AIO is active after suspend\n");
++		return -EBUSY;
++	}
++
++	cpt_push_object(&saved_object, ctx);
++
++	aimg.cpt_next = CPT_ALIGN(sizeof(aimg));
++	aimg.cpt_object = CPT_OBJ_AIO_CONTEXT;
++	aimg.cpt_hdrlen = sizeof(aimg);
++	aimg.cpt_content = CPT_CONTENT_ARRAY;
++
++	aimg.cpt_max_reqs = aio_ctx->max_reqs;
++	aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages;
++	aimg.cpt_nr = aio_ctx->ring_info.nr;
++	aimg.cpt_tail = aio_ctx->ring_info.tail;
++	aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base;
++
++	ctx->write(&aimg, sizeof(aimg), ctx);
++
++	cpt_pop_object(&saved_object, ctx);
++	return 0;
++}
++
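++/* Dump one mm_struct: the mm image header, a nontrivial LDT if one is
++ * present (x86 only), every VMA in order, and any AIO contexts. */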
++static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct mm_struct *mm = obj->o_obj;
++	struct vm_area_struct *vma;
++	struct cpt_mm_image *v = cpt_get_buf(ctx);
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = -1;
++	v->cpt_object = CPT_OBJ_MM;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_start_code = mm->start_code;
++	v->cpt_end_code = mm->end_code;
++	v->cpt_start_data = mm->start_data;
++	v->cpt_end_data = mm->end_data;
++	v->cpt_start_brk = mm->start_brk;
++	v->cpt_brk = mm->brk;
++	v->cpt_start_stack = mm->start_stack;
++	v->cpt_start_arg = mm->arg_start;
++	v->cpt_end_arg = mm->arg_end;
++	v->cpt_start_env = mm->env_start;
++	v->cpt_end_env = mm->env_end;
++	v->cpt_def_flags = mm->def_flags;
++#ifdef CONFIG_BEANCOUNTERS
++	v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx);
++#endif
++	/* FIXME when coredump mask exceeds 8 bits */
++	WARN_ON(mm->flags >> 8);
++	v->cpt_dumpable = mm->flags;
++	v->cpt_vps_dumpable = mm->vps_dumpable;
++	v->cpt_used_hugetlb = 0; /* not used */
++#ifndef CONFIG_IA64
++	v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso;
++#endif
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++#ifdef CONFIG_X86
++	if (mm->context.size) {
++		loff_t saved_object;
++		struct cpt_obj_bits b;
++		int size;
++
++		dprintk_ctx("nontrivial LDT\n");
++
++		cpt_push_object(&saved_object, ctx);
++
++		cpt_open_object(NULL, ctx);
++		b.cpt_next = CPT_NULL;
++		b.cpt_object = CPT_OBJ_BITS;
++		b.cpt_hdrlen = sizeof(b);
++		b.cpt_content = CPT_CONTENT_MM_CONTEXT;
++		b.cpt_size = mm->context.size*LDT_ENTRY_SIZE;
++
++		ctx->write(&b, sizeof(b), ctx);
++
++		size = mm->context.size*LDT_ENTRY_SIZE;
++
++#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \
++			LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19)
++		ctx->write(mm->context.ldt, size, ctx);
++#else
++		{
++			int i;
++
++			for (i = 0; i < size; i += PAGE_SIZE) {
++				int nr = i / PAGE_SIZE, bytes;
++				char *kaddr = kmap(mm->context.ldt_pages[nr]);
++
++				bytes = size - i;
++				if (bytes > PAGE_SIZE)
++					bytes = PAGE_SIZE;
++				ctx->write(kaddr, bytes, ctx);
++				kunmap(mm->context.ldt_pages[nr]);
++			}
++		}
++#endif
++
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_object, ctx);
++	}
++#endif
++
++	for (vma = mm->mmap; vma; vma = vma->vm_next) {
++		int err;
++
++		if ((err = dump_one_vma(obj, vma, ctx)) != 0)
++			return err;
++	}
++
++	if (mm->ioctx_list) {
++		struct kioctx *aio_ctx;
++		int err;
++
++		for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
++			if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
++				return err;
++	}
++
++	cpt_close_object(ctx);
++
++	return 0;
++}
++
++int cpt_dump_vm(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	scnt = scnt0 = zcnt = 0;
++
++	cpt_open_section(ctx, CPT_SECT_MM);
++
++	for_each_object(obj, CPT_OBJ_MM) {
++		int err;
++
++		if ((err = dump_one_mm(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++
++	if (scnt)
++		dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt);
++	if (scnt0)
++		dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0);
++	if (zcnt)
++		dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt);
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_mm.h b/kernel/cpt/cpt_mm.h
+new file mode 100644
+index 0000000..dc2c483
+--- /dev/null
++++ b/kernel/cpt/cpt_mm.h
+@@ -0,0 +1,35 @@
++int cpt_collect_mm(cpt_context_t *);
++
++int cpt_dump_vm(struct cpt_context *ctx);
++
++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int cpt_mm_prepare(unsigned long veid);
++
++int cpt_free_pgin_dir(struct cpt_context *);
++int cpt_start_pagein(struct cpt_context *);
++int rst_setup_pagein(struct cpt_context *);
++int rst_complete_pagein(struct cpt_context *, int);
++int rst_pageind(struct cpt_context *);
++int cpt_iteration(cpt_context_t *ctx);
++int rst_iteration(cpt_context_t *ctx);
++void rst_drop_iter_dir(cpt_context_t *ctx);
++int rst_iter(struct vm_area_struct *vma, u64 pfn,
++	     unsigned long addr, cpt_context_t * ctx);
++
++int rst_swapoff(struct cpt_context *);
++
++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
++struct linux_binprm;
++extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
++				       unsigned long map_address);
++#endif
++
++#ifdef CONFIG_X86
++extern struct page *vdso32_pages[1];
++#define vsyscall_addr page_address(vdso32_pages[0])
++#endif
++
++extern struct vm_operations_struct special_mapping_vmops;
+diff --git a/kernel/cpt/cpt_net.c b/kernel/cpt/cpt_net.c
+new file mode 100644
+index 0000000..373db60
+--- /dev/null
++++ b/kernel/cpt/cpt_net.c
+@@ -0,0 +1,610 @@
++/*
++ *
++ *  kernel/cpt/cpt_net.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/nsproxy.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <net/addrconf.h>
++#include <linux/rtnetlink.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++#include <linux/nfcalls.h>
++#include <linux/if_tun.h>
++#include <linux/veth.h>
++#include <linux/fdtable.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++#include "cpt_syscalls.h"
++
++static void cpt_dump_veth(struct net_device *dev, struct cpt_context * ctx)
++{
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++	struct cpt_veth_image v;
++	struct veth_struct *veth;
++
++	if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open))
++		return;
++
++	veth = veth_from_netdev(dev);
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NET_VETH;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_VOID;
++
++	v.cpt_allow_mac_change = veth->allow_mac_change;
++
++	ctx->write(&v, sizeof(v), ctx);
++	cpt_close_object(ctx);
++#endif
++	return;
++}
++
++static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx)
++{
++	struct cpt_netstats_image *n;
++	struct net_device_stats *stats;
++
++	if (!dev->get_stats)
++		return;
++
++	n = cpt_get_buf(ctx);
++	stats = dev->get_stats(dev);
++	cpt_open_object(NULL, ctx);
++
++	n->cpt_next = CPT_NULL;
++	n->cpt_object = CPT_OBJ_NET_STATS;
++	n->cpt_hdrlen = sizeof(*n);
++	n->cpt_content = CPT_CONTENT_VOID;
++
++	n->cpt_rx_packets = stats->rx_packets;
++	n->cpt_tx_packets = stats->tx_packets;
++	n->cpt_rx_bytes = stats->rx_bytes;
++	n->cpt_tx_bytes = stats->tx_bytes;
++	n->cpt_rx_errors = stats->rx_errors;
++	n->cpt_tx_errors = stats->tx_errors;
++	n->cpt_rx_dropped = stats->rx_dropped;
++	n->cpt_tx_dropped = stats->tx_dropped;
++	n->cpt_multicast = stats->multicast;
++	n->cpt_collisions = stats->collisions;
++	n->cpt_rx_length_errors = stats->rx_length_errors;
++	n->cpt_rx_over_errors = stats->rx_over_errors;
++	n->cpt_rx_crc_errors = stats->rx_crc_errors;
++	n->cpt_rx_frame_errors = stats->rx_frame_errors;
++	n->cpt_rx_fifo_errors = stats->rx_fifo_errors;
++	n->cpt_rx_missed_errors = stats->rx_missed_errors;
++	n->cpt_tx_aborted_errors = stats->tx_aborted_errors;
++	n->cpt_tx_carrier_errors = stats->tx_carrier_errors;
++	n->cpt_tx_fifo_errors = stats->tx_fifo_errors;
++	n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors;
++	n->cpt_tx_window_errors = stats->tx_window_errors;
++	n->cpt_rx_compressed = stats->rx_compressed;
++	n->cpt_tx_compressed = stats->tx_compressed;
++
++	ctx->write(n, sizeof(*n), ctx);
++	cpt_close_object(ctx);
++	cpt_release_buf(ctx);
++	return;
++}
++
++static void cpt_dump_tuntap(struct net_device *dev, struct cpt_context * ctx)
++{
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++	struct cpt_tuntap_image v;
++	struct tun_struct *tun;
++	cpt_object_t *obj;
++
++	if (dev->open != tun_net_open)
++		return;
++
++	tun = netdev_priv(dev);
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NET_TUNTAP;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_VOID;
++
++	v.cpt_owner = tun->owner;
++	v.cpt_flags = tun->flags;
++	v.cpt_attached = tun->attached;
++
++	if (tun->bind_file) {
++		obj = lookup_cpt_object(CPT_OBJ_FILE, tun->bind_file, ctx);
++		BUG_ON(!obj);
++		v.cpt_bindfile = obj->o_pos;
++	}
++
++	v.cpt_if_flags = tun->if_flags;
++	BUG_ON(sizeof(v.cpt_dev_addr) != sizeof(tun->dev_addr));
++	memcpy(v.cpt_dev_addr, tun->dev_addr, sizeof(v.cpt_dev_addr));
++	BUG_ON(sizeof(v.cpt_chr_filter) != sizeof(tun->chr_filter));
++	memcpy(v.cpt_chr_filter, tun->chr_filter, sizeof(v.cpt_chr_filter));
++	BUG_ON(sizeof(v.cpt_net_filter) != sizeof(tun->net_filter));
++	memcpy(v.cpt_net_filter, tun->net_filter, sizeof(v.cpt_net_filter));
++	ctx->write(&v, sizeof(v), ctx);
++	cpt_close_object(ctx);
++#endif
++	return;
++}
++
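++/* Walk the VE's network namespace and dump every device: the generic
++ * header, tuntap/veth specifics where applicable, the hardware address
++ * and the statistics.  Any device other than loopback, venet, veth or
++ * tun/tap is refused as unsupported. */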
++int cpt_dump_link(struct cpt_context * ctx)
++{
++	struct net *net = get_exec_env()->ve_netns;
++	struct net_device *dev;
++
++	cpt_open_section(ctx, CPT_SECT_NET_DEVICE);
++	for_each_netdev(net, dev) {
++		struct cpt_netdev_image v;
++		struct cpt_hwaddr_image hw;
++		loff_t saved_obj;
++
++		cpt_open_object(NULL, ctx);
++
++		v.cpt_next = CPT_NULL;
++		v.cpt_object = CPT_OBJ_NET_DEVICE;
++		v.cpt_hdrlen = sizeof(v);
++		v.cpt_content = CPT_CONTENT_ARRAY;
++
++		v.cpt_index = dev->ifindex;
++		v.cpt_flags = dev->flags;
++		memcpy(v.cpt_name, dev->name, IFNAMSIZ);
++		ctx->write(&v, sizeof(v), ctx);
++
++		cpt_push_object(&saved_obj, ctx);
++
++		cpt_dump_tuntap(dev, ctx);
++
++		cpt_dump_veth(dev, ctx);
++
++		/* Dump hardware address */
++		cpt_open_object(NULL, ctx);
++		hw.cpt_next = CPT_NULL;
++		hw.cpt_object = CPT_OBJ_NET_HWADDR;
++		hw.cpt_hdrlen = sizeof(hw);
++		hw.cpt_content = CPT_CONTENT_VOID;
++		BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr));
++		memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr));
++		ctx->write(&hw, sizeof(hw), ctx);
++		cpt_close_object(ctx);
++
++		cpt_dump_netstats(dev, ctx);
++
++		cpt_pop_object(&saved_obj, ctx);
++
++		cpt_close_object(ctx);
++
++		if (dev != net->loopback_dev
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++		    && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open))
++#endif
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++		     && dev != get_exec_env()->_venet_dev
++#endif
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++		    && dev->open != tun_net_open
++#endif
++							) {
++			eprintk_ctx("unsupported netdevice %s\n", dev->name);
++			cpt_close_section(ctx);
++			return -EBUSY;
++		}
++	}
++	cpt_close_section(ctx);
++	return 0;
++}
++
++int cpt_suspend_network(struct cpt_context *ctx)
++{
++	get_exec_env()->disable_net = 1;
++	synchronize_net();
++	return 0;
++}
++
++int cpt_resume_network(struct cpt_context *ctx)
++{
++	struct ve_struct *env;
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++	env->disable_net = 0;
++	put_ve(env);
++	return 0;
++}
++
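++/* Dump IPv4 and then IPv6 addresses of every device.  The IPv6 loopback
++ * address ::1/128 is skipped, presumably because it is recreated
++ * automatically on restore. */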
++int cpt_dump_ifaddr(struct cpt_context * ctx)
++{
++	struct net *net = get_exec_env()->ve_netns;
++	struct net_device *dev;
++
++	cpt_open_section(ctx, CPT_SECT_NET_IFADDR);
++	for_each_netdev(net, dev) {
++		struct in_device *idev = in_dev_get(dev);
++		struct in_ifaddr *ifa;
++
++		if (!idev)
++			continue;
++
++		for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) {
++			struct cpt_ifaddr_image v;
++			cpt_open_object(NULL, ctx);
++
++			v.cpt_next = CPT_NULL;
++			v.cpt_object = CPT_OBJ_NET_IFADDR;
++			v.cpt_hdrlen = sizeof(v);
++			v.cpt_content = CPT_CONTENT_VOID;
++
++			v.cpt_index = dev->ifindex;
++			v.cpt_family = AF_INET;
++			v.cpt_masklen = ifa->ifa_prefixlen;
++			v.cpt_flags = ifa->ifa_flags;
++			v.cpt_scope = ifa->ifa_scope;
++			memset(&v.cpt_address, 0, sizeof(v.cpt_address));
++			memset(&v.cpt_peer, 0, sizeof(v.cpt_peer));
++			memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
++			v.cpt_address[0] = ifa->ifa_local;
++			v.cpt_peer[0] = ifa->ifa_address;
++			v.cpt_broadcast[0] = ifa->ifa_broadcast;
++			memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ);
++			ctx->write(&v, sizeof(v), ctx);
++			cpt_close_object(ctx);
++		}
++		in_dev_put(idev);
++	}
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++	for_each_netdev(net, dev) {
++		struct inet6_dev *idev = in6_dev_get(dev);
++		struct inet6_ifaddr *ifa;
++
++		if (!idev)
++			continue;
++
++		for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
++			struct cpt_ifaddr_image v;
++
++			if (dev == net->loopback_dev &&
++			    ifa->prefix_len == 128 &&
++			    ifa->addr.s6_addr32[0] == 0 &&
++			    ifa->addr.s6_addr32[1] == 0 &&
++			    ifa->addr.s6_addr32[2] == 0 &&
++			    ifa->addr.s6_addr32[3] == htonl(1))
++				continue;
++
++			cpt_open_object(NULL, ctx);
++
++			v.cpt_next = CPT_NULL;
++			v.cpt_object = CPT_OBJ_NET_IFADDR;
++			v.cpt_hdrlen = sizeof(v);
++			v.cpt_content = CPT_CONTENT_VOID;
++
++			v.cpt_index = dev->ifindex;
++			v.cpt_family = AF_INET6;
++			v.cpt_masklen = ifa->prefix_len;
++			v.cpt_flags = ifa->flags;
++			v.cpt_scope = ifa->scope;
++			v.cpt_valid_lft = ifa->valid_lft;
++			v.cpt_prefered_lft = ifa->prefered_lft;
++			memcpy(&v.cpt_address, &ifa->addr, 16);
++			memcpy(&v.cpt_peer, &ifa->addr, 16);
++			memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
++			memcpy(v.cpt_label, dev->name, IFNAMSIZ);
++			ctx->write(&v, sizeof(v), ctx);
++			cpt_close_object(ctx);
++		}
++		in6_dev_put(idev);
++	}
++#endif
++	cpt_close_section(ctx);
++	return 0;
++}
++
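++/* Routes are dumped by opening an in-kernel netlink socket and issuing
++ * an RTM_GETROUTE dump request; the raw netlink replies are written into
++ * the image verbatim (CPT_CONTENT_NLMARRAY), IPv4 first, then IPv6 via
++ * the restart label. */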
++static int cpt_dump_route(struct cpt_context * ctx)
++{
++	int err;
++	struct socket *sock;
++	struct msghdr msg;
++	struct iovec iov;
++	struct {
++		struct nlmsghdr nlh;
++		struct rtgenmsg g;
++	} req;
++	struct sockaddr_nl nladdr;
++	struct cpt_object_hdr v;
++	mm_segment_t oldfs;
++	char *pg;
++
++	err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
++	if (err)
++		return err;
++
++	memset(&nladdr, 0, sizeof(nladdr));
++	nladdr.nl_family = AF_NETLINK;
++
++	req.nlh.nlmsg_len = sizeof(req);
++	req.nlh.nlmsg_type = RTM_GETROUTE;
++	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
++	req.nlh.nlmsg_pid = 0;
++	req.g.rtgen_family = AF_INET;
++
++	iov.iov_base = &req;
++	iov.iov_len = sizeof(req);
++	msg.msg_name = &nladdr;
++	msg.msg_namelen = sizeof(nladdr);
++	msg.msg_iov = &iov;
++	msg.msg_iovlen = 1;
++	msg.msg_control = NULL;
++	msg.msg_controllen = 0;
++	msg.msg_flags = MSG_DONTWAIT;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	err = sock_sendmsg(sock, &msg, sizeof(req));
++	set_fs(oldfs);
++
++	if (err < 0)
++		goto out_sock;
++
++	pg = (char*)__get_free_page(GFP_KERNEL);
++	if (pg == NULL) {
++		err = -ENOMEM;
++		goto out_sock;
++	}
++
++	cpt_open_section(ctx, CPT_SECT_NET_ROUTE);
++	cpt_open_object(NULL, ctx);
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NET_ROUTE;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_NLMARRAY;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++restart:
++#endif
++	for (;;) {
++		struct nlmsghdr *h;
++
++		iov.iov_base = pg;
++		iov.iov_len = PAGE_SIZE;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
++		set_fs(oldfs);
++
++		if (err < 0)
++			goto out_sock_pg;
++		if (msg.msg_flags & MSG_TRUNC) {
++			err = -ENOBUFS;
++			goto out_sock_pg;
++		}
++
++		h = (struct nlmsghdr*)pg;
++		while (NLMSG_OK(h, err)) {
++			if (h->nlmsg_type == NLMSG_DONE) {
++				err = 0;
++				goto done;
++			}
++			if (h->nlmsg_type == NLMSG_ERROR) {
++				struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h);
++				err = errm->error;
++				eprintk_ctx("NLMSG error: %d\n", errm->error);
++				goto done;
++			}
++			if (h->nlmsg_type != RTM_NEWROUTE) {
++				eprintk_ctx("NLMSG: %d\n", h->nlmsg_type);
++				err = -EINVAL;
++				goto done;
++			}
++			ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx);
++			h = NLMSG_NEXT(h, err);
++		}
++		if (err) {
++			eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
++			err = -EINVAL;
++			break;
++		}
++	}
++done:
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++	if (!err && req.g.rtgen_family == AF_INET) {
++		req.g.rtgen_family = AF_INET6;
++		iov.iov_base = &req;
++		iov.iov_len = sizeof(req);
++		msg.msg_name = &nladdr;
++		msg.msg_namelen = sizeof(nladdr);
++		msg.msg_iov = &iov;
++		msg.msg_iovlen = 1;
++		msg.msg_control = NULL;
++		msg.msg_controllen = 0;
++		msg.msg_flags = MSG_DONTWAIT;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_sendmsg(sock, &msg, sizeof(req));
++		set_fs(oldfs);
++
++		if (err > 0)
++			goto restart;
++	}
++#endif
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	cpt_close_section(ctx);
++
++out_sock_pg:
++	free_page((unsigned long)pg);
++out_sock:
++	sock_release(sock);
++	return err;
++}
++
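++/* iptables rules are captured by spawning a kernel thread which enters
++ * the VE, redirects fd 1 into a pipe and execs iptables-save; the parent
++ * (cpt_dump_iptables below) streams the pipe contents into the image. */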
++static int dumpfn(void *arg)
++{
++	int i;
++	int *pfd = arg;
++	char *argv[] = { "iptables-save", "-c", NULL };
++
++	i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
++	if (i < 0) {
++		eprintk("cannot enter ve to dump iptables\n");
++		module_put(THIS_MODULE);
++		return 255 << 8;
++	}
++
++	if (pfd[1] != 1)
++		sc_dup2(pfd[1], 1);
++
++	for (i=0; i<current->files->fdt->max_fds; i++) {
++		if (i != 1)
++			sc_close(i);
++	}
++
++	module_put(THIS_MODULE);
++
++	set_fs(KERNEL_DS);
++	i = sc_execve("/sbin/iptables-save", argv, NULL);
++	if (i == -ENOENT)
++		i = sc_execve("/usr/sbin/iptables-save", argv, NULL);
++	eprintk("failed to exec iptables-save: %d\n", i);
++	return 255 << 8;
++}
++
++
++static int cpt_dump_iptables(struct cpt_context * ctx)
++{
++	int err = 0;
++#ifdef CONFIG_VE_IPTABLES
++	int pid;
++	int pfd[2];
++	struct file *f;
++	struct cpt_object_hdr v;
++	char buf[16];
++	loff_t pos;
++	int n;
++	int status;
++	mm_segment_t oldfs;
++	sigset_t ignore, blocked;
++
++	if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD))
++		return 0;
++
++	err = sc_pipe(pfd);
++	if (err < 0) {
++		eprintk_ctx("sc_pipe: %d\n", err);
++		return err;
++	}
++	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
++	sigprocmask(SIG_BLOCK, &ignore, &blocked);
++	err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
++	if (err < 0) {
++		eprintk_ctx("local_kernel_thread: %d\n", err);
++		goto out;
++	}
++
++	f = fget(pfd[0]);
++	sc_close(pfd[1]);
++	sc_close(pfd[0]);
++
++	cpt_open_section(ctx, CPT_SECT_NET_IPTABLES);
++
++	cpt_open_object(NULL, ctx);
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NAME;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_NAME;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	pos = ctx->file->f_pos;
++	do {
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
++		set_fs(oldfs);
++		if (n > 0)
++			ctx->write(buf, n, ctx);
++	} while (n > 0);
++
++	if (n < 0)
++		eprintk_ctx("read: %d\n", n);
++
++	fput(f);
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if ((err = sc_waitx(pid, 0, &status)) < 0)
++		eprintk_ctx("wait4: %d\n", err);
++	else if ((status & 0x7f) == 0) {
++		err = (status & 0xff00) >> 8;
++		if (err != 0) {
++			eprintk_ctx("iptables-save exited with %d\n", err);
++			err = -EINVAL;
++		}
++	} else {
++		eprintk_ctx("iptables-save terminated\n");
++		err = -EINVAL;
++	}
++	set_fs(oldfs);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++
++	if (ctx->file->f_pos != pos) {
++		buf[0] = 0;
++		ctx->write(buf, 1, ctx);
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_close_section(ctx);
++	} else {
++		pos = ctx->current_section;
++		cpt_close_object(ctx);
++		cpt_close_section(ctx);
++		ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL;
++		ctx->file->f_pos = pos;
++	}
++	return n ? : err;
++
++out:
++	if (pfd[1] >= 0)
++		sc_close(pfd[1]);
++	if (pfd[0] >= 0)
++		sc_close(pfd[0]);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++#endif
++	return err;
++}
++
++int cpt_dump_ifinfo(struct cpt_context * ctx)
++{
++	int err;
++
++	rtnl_lock();
++	err = cpt_dump_link(ctx);
++	if (!err)
++		err = cpt_dump_ifaddr(ctx);
++	rtnl_unlock();
++	if (!err)
++		err = cpt_dump_route(ctx);
++	if (!err)
++		err = cpt_dump_iptables(ctx);
++	return err;
++}
+diff --git a/kernel/cpt/cpt_net.h b/kernel/cpt/cpt_net.h
+new file mode 100644
+index 0000000..5d33877
+--- /dev/null
++++ b/kernel/cpt/cpt_net.h
+@@ -0,0 +1,7 @@
++int cpt_dump_ifinfo(struct cpt_context *ctx);
++int rst_restore_net(struct cpt_context *ctx);
++int cpt_suspend_network(struct cpt_context *ctx);
++int cpt_resume_network(struct cpt_context *ctx);
++int rst_resume_network(struct cpt_context *ctx);
++int cpt_dump_ip_conntrack(struct cpt_context *ctx);
++int rst_restore_ip_conntrack(struct cpt_context * ctx);
+diff --git a/kernel/cpt/cpt_obj.c b/kernel/cpt/cpt_obj.c
+new file mode 100644
+index 0000000..7ab23d7
+--- /dev/null
++++ b/kernel/cpt/cpt_obj.c
+@@ -0,0 +1,162 @@
++/*
++ *
++ *  kernel/cpt/cpt_obj.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = kmalloc(sizeof(cpt_object_t), gfp);
++	if (obj) {
++		INIT_LIST_HEAD(&obj->o_list);
++		INIT_LIST_HEAD(&obj->o_hash);
++		INIT_LIST_HEAD(&obj->o_alist);
++		obj->o_count = 1;
++		obj->o_pos = CPT_NULL;
++		obj->o_lock = 0;
++		obj->o_parent = NULL;
++		obj->o_index = CPT_NOINDEX;
++		obj->o_obj = NULL;
++		obj->o_image = NULL;
++		ctx->objcount++;
++	}
++	return obj;
++}
++
++void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
++{
++	list_del(&obj->o_alist);
++	kfree(obj);
++	ctx->objcount--;
++}
++
++void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
++{
++	list_add_tail(&obj->o_list, &ctx->object_array[type]);
++}
++
++void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
++			cpt_object_t *head, cpt_context_t *ctx)
++{
++	list_add(&obj->o_list, &head->o_list);
++}
++
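++/* Look up an existing object of the given type and bump its use count,
++ * or allocate and intern a new one.  Returns NULL only on allocation
++ * failure. */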
++cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
++		unsigned gfp_mask, cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_object(type, p, ctx);
++
++	if (obj) {
++		obj->o_count++;
++		return obj;
++	}
++
++	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
++		if (p)
++			cpt_obj_setobj(obj, p, ctx);
++		intern_cpt_object(type, obj, ctx);
++		return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
++}
++
++cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_object(type, p, ctx);
++
++	if (obj)
++		obj->o_count++;
++
++	return obj;
++}
++
++int cpt_object_init(cpt_context_t *ctx)
++{
++	int i;
++
++	for (i=0; i<CPT_OBJ_MAX; i++) {
++		INIT_LIST_HEAD(&ctx->object_array[i]);
++	}
++	return 0;
++}
++
++int cpt_object_destroy(cpt_context_t *ctx)
++{
++	int i;
++
++	for (i=0; i<CPT_OBJ_MAX; i++) {
++		while (!list_empty(&ctx->object_array[i])) {
++			struct list_head *head = ctx->object_array[i].next;
++			cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
++			list_del(head);
++			if (obj->o_image)
++				kfree(obj->o_image);
++			free_cpt_object(obj, ctx);
++		}
++	}
++	if (ctx->objcount != 0)
++		eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
++	return 0;
++}
++
++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_obj == p)
++			return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_pos == pos)
++			return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_index == index)
++			return obj;
++	}
++	return NULL;
++}
+diff --git a/kernel/cpt/cpt_obj.h b/kernel/cpt/cpt_obj.h
+new file mode 100644
+index 0000000..7762623
+--- /dev/null
++++ b/kernel/cpt/cpt_obj.h
+@@ -0,0 +1,62 @@
++#ifndef __CPT_OBJ_H_
++#define __CPT_OBJ_H_ 1
++
++#include <linux/list.h>
++#include <linux/cpt_image.h>
++
++typedef struct _cpt_object
++{
++	struct list_head	o_list;
++	struct list_head	o_hash;
++	int			o_count;
++	int			o_index;
++	int			o_lock;
++	loff_t			o_pos;
++	loff_t			o_ppos;
++	void			*o_obj;
++	void			*o_image;
++	void			*o_parent;
++	struct list_head	o_alist;
++} cpt_object_t;
++
++struct cpt_context;
++
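++/* NB: for_each_object() relies on a local variable named "ctx" being in
++ * scope at the call site. */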
++#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list)
++
++
++extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx);
++extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx);
++
++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx);
++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx);
++
++static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx)
++{
++	cpt->o_pos = pos;
++	/* Add to pos hash table */
++}
++
++static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx)
++{
++	cpt->o_obj = ptr;
++	/* Add to hash table */
++}
++
++static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx)
++{
++	cpt->o_index = index;
++	/* Add to index hash table */
++}
++
++
++extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx);
++extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx);
++extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx);
++extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++
++extern int cpt_object_init(struct cpt_context *ctx);
++extern int cpt_object_destroy(struct cpt_context *ctx);
++
++#endif /* __CPT_OBJ_H_ */
+diff --git a/kernel/cpt/cpt_proc.c b/kernel/cpt/cpt_proc.c
+new file mode 100644
+index 0000000..08d5fd4
+--- /dev/null
++++ b/kernel/cpt/cpt_proc.c
+@@ -0,0 +1,595 @@
++/*
++ *
++ *  kernel/cpt/cpt_proc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/list.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_ioctl.h>
++#include <linux/delay.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++
++MODULE_AUTHOR("Alexey Kuznetsov <alexey at sw.ru>");
++MODULE_LICENSE("GPL");
++
++/* List of contexts and lock protecting the list */
++static struct list_head cpt_context_list;
++static spinlock_t cpt_context_lock;
++
++static int proc_read(char *buffer, char **start, off_t offset,
++		     int length, int *eof, void *data)
++{
++	off_t pos = 0;
++	off_t begin = 0;
++	int len = 0;
++	cpt_context_t *ctx;
++
++	len += sprintf(buffer, "Ctx      Id       VE       State\n");
++
++	spin_lock(&cpt_context_lock);
++
++	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++		len += sprintf(buffer+len,"%p %08x %-8u %d",
++			       ctx,
++			       ctx->contextid,
++			       ctx->ve_id,
++			       ctx->ctx_state
++			       );
++
++		buffer[len++] = '\n';
++
++		pos = begin+len;
++		if (pos < offset) {
++			len = 0;
++			begin = pos;
++		}
++		if (pos > offset+length)
++			goto done;
++	}
++	*eof = 1;
++
++done:
++	spin_unlock(&cpt_context_lock);
++	*start = buffer + (offset - begin);
++	len -= (offset - begin);
++	if(len > length)
++		len = length;
++	if(len < 0)
++		len = 0;
++	return len;
++}
++
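++/* Called with cpt_context_lock held: the lock is dropped around the
++ * teardown work (which may sleep) and re-taken before returning, so that
++ * __cpt_context_put() keeps its locking contract. */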
++void cpt_context_release(cpt_context_t *ctx)
++{
++	list_del(&ctx->ctx_list);
++	spin_unlock(&cpt_context_lock);
++
++	if (ctx->ctx_state > 0)
++		cpt_resume(ctx);
++	ctx->ctx_state = CPT_CTX_ERROR;
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	if (ctx->pgin_task)
++		put_task_struct(ctx->pgin_task);
++	if (ctx->pgin_dir)
++		cpt_free_pgin_dir(ctx);
++	if (ctx->pagein_file_out)
++		fput(ctx->pagein_file_out);
++	if (ctx->pagein_file_in)
++		fput(ctx->pagein_file_in);
++#endif
++	if (ctx->objcount)
++		eprintk_ctx("%d objects leaked\n", ctx->objcount);
++	if (ctx->file)
++		fput(ctx->file);
++	cpt_flush_error(ctx);
++	if (ctx->errorfile) {
++		fput(ctx->errorfile);
++		ctx->errorfile = NULL;
++	}
++	if (ctx->error_msg) {
++		free_page((unsigned long)ctx->error_msg);
++		ctx->error_msg = NULL;
++	}
++	if (ctx->statusfile)
++		fput(ctx->statusfile);
++	if (ctx->lockfile)
++		fput(ctx->lockfile);
++	kfree(ctx);
++
++	spin_lock(&cpt_context_lock);
++}
++
++static void __cpt_context_put(cpt_context_t *ctx)
++{
++	if (!--ctx->refcount)
++		cpt_context_release(ctx);
++}
++
++static void cpt_context_put(cpt_context_t *ctx)
++{
++	spin_lock(&cpt_context_lock);
++	__cpt_context_put(ctx);
++	spin_unlock(&cpt_context_lock);
++}
++
++cpt_context_t * cpt_context_open(void)
++{
++	cpt_context_t *ctx;
++
++	if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
++		cpt_context_init(ctx);
++		spin_lock(&cpt_context_lock);
++		list_add_tail(&ctx->ctx_list, &cpt_context_list);
++		spin_unlock(&cpt_context_lock);
++		ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
++		if (ctx->error_msg != NULL)
++			ctx->error_msg[0] = 0;
++	}
++	return ctx;
++}
++
++static cpt_context_t * cpt_context_lookup(unsigned int contextid)
++{
++	cpt_context_t *ctx;
++
++	spin_lock(&cpt_context_lock);
++	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++		if (ctx->contextid == contextid) {
++			ctx->refcount++;
++			spin_unlock(&cpt_context_lock);
++			return ctx;
++		}
++	}
++	spin_unlock(&cpt_context_lock);
++	return NULL;
++}
++
++int cpt_context_lookup_veid(unsigned int veid)
++{
++	cpt_context_t *ctx;
++
++	spin_lock(&cpt_context_lock);
++	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++		if (ctx->ve_id == veid && ctx->ctx_state > 0) {
++			spin_unlock(&cpt_context_lock);
++			return 1;
++		}
++	}
++	spin_unlock(&cpt_context_lock);
++	return 0;
++}
++
++static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
++{
++	int err = 0;
++	cpt_context_t *ctx;
++	struct file *dfile = NULL;
++	int try;
++
++	unlock_kernel();
++
++	if (cmd == CPT_VMPREP) {
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++		err = cpt_mm_prepare(arg);
++#else
++		err = -EINVAL;
++#endif
++		goto out_lock;
++	}
++
++	if (cmd == CPT_TEST_CAPS) {
++		unsigned int src_flags, dst_flags = arg;
++
++		err = 0;
++		src_flags = test_cpu_caps();
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
++		test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
++		goto out_lock;
++	}
++
++	if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
++		cpt_context_t *old_ctx;
++
++		ctx = NULL;
++		if (cmd == CPT_JOIN_CONTEXT) {
++			err = -ENOENT;
++			ctx = cpt_context_lookup(arg);
++			if (!ctx)
++				goto out_lock;
++		}
++
++		spin_lock(&cpt_context_lock);
++		old_ctx = (cpt_context_t*)file->private_data;
++		file->private_data = ctx;
++
++		if (old_ctx) {
++			if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
++				old_ctx->sticky = 0;
++				old_ctx->refcount--;
++			}
++			__cpt_context_put(old_ctx);
++		}
++		spin_unlock(&cpt_context_lock);
++		err = 0;
++		goto out_lock;
++	}
++
++	spin_lock(&cpt_context_lock);
++	ctx = (cpt_context_t*)file->private_data;
++	if (ctx)
++		ctx->refcount++;
++	spin_unlock(&cpt_context_lock);
++
++	if (!ctx) {
++		cpt_context_t *old_ctx;
++
++		err = -ENOMEM;
++		ctx = cpt_context_open();
++		if (!ctx)
++			goto out_lock;
++
++		spin_lock(&cpt_context_lock);
++		old_ctx = (cpt_context_t*)file->private_data;
++		if (!old_ctx) {
++			ctx->refcount++;
++			file->private_data = ctx;
++		} else {
++			old_ctx->refcount++;
++		}
++		if (old_ctx) {
++			__cpt_context_put(ctx);
++			ctx = old_ctx;
++		}
++		spin_unlock(&cpt_context_lock);
++	}
++
++	if (cmd == CPT_GET_CONTEXT) {
++		unsigned int contextid = (unsigned int)arg;
++
++		if (ctx->contextid && ctx->contextid != contextid) {
++			err = -EINVAL;
++			goto out_nosem;
++		}
++		if (!ctx->contextid) {
++			cpt_context_t *c1 = cpt_context_lookup(contextid);
++			if (c1) {
++				cpt_context_put(c1);
++				err = -EEXIST;
++				goto out_nosem;
++			}
++			ctx->contextid = contextid;
++		}
++		spin_lock(&cpt_context_lock);
++		if (!ctx->sticky) {
++			ctx->sticky = 1;
++			ctx->refcount++;
++		}
++		spin_unlock(&cpt_context_lock);
++		goto out_nosem;
++	}
++
++	down(&ctx->main_sem);
++
++	err = -EBUSY;
++	if (ctx->ctx_state < 0)
++		goto out;
++
++	err = 0;
++	switch (cmd) {
++	case CPT_SET_DUMPFD:
++		if (ctx->ctx_state == CPT_CTX_DUMPING) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			err = -EBADF;
++			dfile = fget(arg);
++			if (dfile == NULL)
++				break;
++			if (dfile->f_op == NULL ||
++			    dfile->f_op->write == NULL) {
++				fput(dfile);
++				break;
++			}
++			err = 0;
++		}
++		if (ctx->file)
++			fput(ctx->file);
++		ctx->file = dfile;
++		break;
++	case CPT_SET_ERRORFD:
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->errorfile)
++			fput(ctx->errorfile);
++		ctx->errorfile = dfile;
++		break;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	case CPT_SET_PAGEINFDIN:
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->pagein_file_in)
++			fput(ctx->pagein_file_in);
++		ctx->pagein_file_in = dfile;
++		break;
++	case CPT_SET_PAGEINFDOUT:
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->pagein_file_out)
++			fput(ctx->pagein_file_out);
++		ctx->pagein_file_out = dfile;
++		break;
++	case CPT_SET_LAZY:
++		ctx->lazy_vm = arg;
++		break;
++	case CPT_ITER:
++		err = cpt_iteration(ctx);
++		break;
++	case CPT_PAGEIND:
++		err = cpt_start_pagein(ctx);
++		break;
++#endif
++	case CPT_SET_VEID:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		ctx->ve_id = arg;
++		break;
++	case CPT_SET_CPU_FLAGS:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		ctx->dst_cpu_flags = arg;
++		ctx->src_cpu_flags = test_cpu_caps();
++		break;
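++	/* Suspend may transiently fail with -EAGAIN; in that case the VE
++	 * is resumed, we sleep for a second and retry, at most 3 times. */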
++	case CPT_SUSPEND:
++		if (cpt_context_lookup_veid(ctx->ve_id) ||
++		    ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		ctx->ctx_state = CPT_CTX_SUSPENDING;
++		try = 0;
++		do {
++			err = cpt_vps_suspend(ctx);
++			if (err)
++				cpt_resume(ctx);
++			if (err == -EAGAIN)
++				msleep(1000);
++			try++;
++		} while (err == -EAGAIN && try < 3);
++		if (err) {
++			ctx->ctx_state = CPT_CTX_IDLE;
++		} else {
++			ctx->ctx_state = CPT_CTX_SUSPENDED;
++		}
++		break;
++	case CPT_DUMP:
++		if (!ctx->ctx_state) {
++			err = -ENOENT;
++			break;
++		}
++		if (!ctx->file) {
++			err = -EBADF;
++			break;
++		}
++		err = cpt_dump(ctx);
++		break;
++	case CPT_RESUME:
++		if (ctx->ctx_state == CPT_CTX_IDLE) {
++			err = -ENOENT;
++			break;
++		}
++		err = cpt_resume(ctx);
++		if (!err)
++			ctx->ctx_state = CPT_CTX_IDLE;
++		break;
++	case CPT_KILL:
++		if (ctx->ctx_state == CPT_CTX_IDLE) {
++			err = -ENOENT;
++			break;
++		}
++		err = cpt_kill(ctx);
++		if (!err)
++			ctx->ctx_state = CPT_CTX_IDLE;
++		break;
++	case CPT_TEST_VECAPS:
++	{
++		__u32 dst_flags = arg;
++		__u32 src_flags;
++
++		err = cpt_vps_caps(ctx, &src_flags);
++		if (err)
++			break;
++
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err);
++		test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err);
++		if (src_flags & CPT_UNSUPPORTED_MASK)
++			err = 2;
++		break;
++	}
++	default:
++		err = -EINVAL;
++		break;
++	}
++
++out:
++	cpt_flush_error(ctx);
++	up(&ctx->main_sem);
++out_nosem:
++	cpt_context_put(ctx);
++out_lock:
++	lock_kernel();
++	if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
++	    err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
++		err = -EINTR;
++	return err;
++}
++
++static int cpt_open(struct inode *inode, struct file *file)
++{
++	if (!try_module_get(THIS_MODULE))
++		return -EBUSY;
++
++	return 0;
++}
++
++static int cpt_release(struct inode * inode, struct file * file)
++{
++	cpt_context_t *ctx;
++
++	spin_lock(&cpt_context_lock);
++	ctx = (cpt_context_t*)file->private_data;
++	file->private_data = NULL;
++
++	if (ctx)
++		__cpt_context_put(ctx);
++	spin_unlock(&cpt_context_lock);
++
++	module_put(THIS_MODULE);
++	return 0;
++}
++
++
++static struct file_operations cpt_fops = {
++	.owner	 = THIS_MODULE,
++	.open    = cpt_open,
++	.release = cpt_release,
++	.ioctl	 = cpt_ioctl,
++};
++
++static struct proc_dir_entry *proc_ent;
++
++static struct ctl_table_header *ctl_header;
++
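++/*
++ * sysctl tree exposing /proc/sys/debug/cpt: a runtime knob for the
++ * module's debug verbosity (debug_level).
++ */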
++static ctl_table debug_table[] = {
++	{
++		.procname	= "cpt",
++		.data		= &debug_level,
++		.maxlen		= sizeof(debug_level),
++		.mode		= 0644,
++		.proc_handler	= &proc_dointvec,
++	},
++	{ .ctl_name = 0 }
++};
++static ctl_table root_table[] = {
++	{
++		.ctl_name	= CTL_DEBUG,
++		.procname	= "debug",
++		.mode		= 0555,
++		.child		= debug_table,
++	},
++	{ .ctl_name = 0 }
++};
++
++static int __init init_cpt(void)
++{
++	int err;
++
++	err = -ENOMEM;
++	ctl_header = register_sysctl_table(root_table);
++	if (!ctl_header)
++		goto err_mon;
++
++	spin_lock_init(&cpt_context_lock);
++	INIT_LIST_HEAD(&cpt_context_list);
++
++	err = -EINVAL;
++	proc_ent = proc_create("cpt", 0600, NULL, NULL);
++	if (!proc_ent)
++		goto err_out;
++
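++	/*
++	 * Borrow the stock proc read/write/llseek handlers, then substitute
++	 * our own file_operations so that open/release/ioctl are served by
++	 * this module.
++	 */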
++	cpt_fops.read = proc_ent->proc_fops->read;
++	cpt_fops.write = proc_ent->proc_fops->write;
++	cpt_fops.llseek = proc_ent->proc_fops->llseek;
++	proc_ent->proc_fops = &cpt_fops;
++
++	proc_ent->read_proc = proc_read;
++	proc_ent->data = NULL;
++	proc_ent->owner = THIS_MODULE;
++	return 0;
++
++err_out:
++	unregister_sysctl_table(ctl_header);
++err_mon:
++	return err;
++}
++module_init(init_cpt);
++
++static void __exit exit_cpt(void)
++{
++	remove_proc_entry("cpt", NULL);
++	unregister_sysctl_table(ctl_header);
++
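++	/*
++	 * Any context still on the list is pinned only by the list itself
++	 * (sticky) or has no references left; normalize the refcount to one
++	 * and drop the final reference.
++	 */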
++	spin_lock(&cpt_context_lock);
++	while (!list_empty(&cpt_context_list)) {
++		cpt_context_t *ctx;
++		ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
++
++		if (!ctx->sticky)
++			ctx->refcount++;
++		ctx->sticky = 0;
++
++		BUG_ON(ctx->refcount != 1);
++
++		__cpt_context_put(ctx);
++	}
++	spin_unlock(&cpt_context_lock);
++}
++module_exit(exit_cpt);
+diff --git a/kernel/cpt/cpt_process.c b/kernel/cpt/cpt_process.c
+new file mode 100644
+index 0000000..4ceb351
+--- /dev/null
++++ b/kernel/cpt/cpt_process.c
+@@ -0,0 +1,1366 @@
++/*
++ *
++ *  kernel/cpt/cpt_process.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/poll.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/compat.h>
++#include <linux/cpt_image.h>
++#include <linux/nsproxy.h>
++#include <linux/futex.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_ubc.h"
++#include "cpt_process.h"
++#include "cpt_kernel.h"
++
++#ifdef CONFIG_X86_32
++#undef task_pt_regs
++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
++#endif
++
++int check_task_state(struct task_struct *tsk, struct cpt_context *ctx)
++{
++#ifdef CONFIG_X86_64
++	if (!(task_thread_info(tsk)->flags&_TIF_IA32)) {
++		if (task_pt_regs(tsk)->ip >= VSYSCALL_START &&
++				task_pt_regs(tsk)->ip < VSYSCALL_END) {
++			eprintk_ctx(CPT_FID " cannot be checkpointed while in vsyscall, try later\n", CPT_TID(tsk));
++			return -EAGAIN;
++		}
++	}
++#endif
++	return 0;
++}
++
++#ifdef CONFIG_X86
++
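++/*
++ * Translate a hardware segment selector into an architecture-neutral
++ * CPT_SEG_* value, so that the image does not depend on the GDT layout
++ * of the kernel that wrote it.
++ */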
++static u32 encode_segment(u32 segreg)
++{
++	segreg &= 0xFFFF;
++
++	if (segreg == 0)
++		return CPT_SEG_ZERO;
++	if ((segreg & 3) != 3) {
++		wprintk("Invalid RPL of a segment reg %x\n", segreg);
++		return CPT_SEG_ZERO;
++	}
++
++	/* LDT descriptor, it is just an index to LDT array */
++	if (segreg & 4)
++		return CPT_SEG_LDT + (segreg >> 3);
++
++	/* TLS descriptor. */
++	if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
++	    (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
++		return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
++
++	/* One of the standard descriptors */
++#ifdef CONFIG_X86_64
++	if (segreg == __USER32_DS)
++		return CPT_SEG_USER32_DS;
++	if (segreg == __USER32_CS)
++		return CPT_SEG_USER32_CS;
++	if (segreg == __USER_DS)
++		return CPT_SEG_USER64_DS;
++	if (segreg == __USER_CS)
++		return CPT_SEG_USER64_CS;
++#else
++	if (segreg == __USER_DS)
++		return CPT_SEG_USER32_DS;
++	if (segreg == __USER_CS)
++		return CPT_SEG_USER32_CS;
++#endif
++	wprintk("Invalid segment reg %x\n", segreg);
++	return CPT_SEG_ZERO;
++}
++
++#ifdef CONFIG_X86_64
++static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s,
++		struct task_struct *tsk)
++{
++	d->cpt_ebp = s->bp;
++	d->cpt_ebx = s->bx;
++	d->cpt_eax = s->ax;
++	d->cpt_ecx = s->cx;
++	d->cpt_edx = s->dx;
++	d->cpt_esi = s->si;
++	d->cpt_edi = s->di;
++	d->cpt_orig_eax = s->orig_ax;
++	d->cpt_eip = s->ip;
++	d->cpt_xcs = encode_segment(s->cs);
++	d->cpt_eflags = s->flags;
++	d->cpt_esp = s->sp;
++	d->cpt_xss = encode_segment(s->ss);
++	d->cpt_xds = encode_segment(tsk->thread.ds);
++	d->cpt_xes = encode_segment(tsk->thread.es);
++}
++
++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	cpt_open_object(NULL, ctx);
++
++	if (task_thread_info(tsk)->flags & _TIF_IA32) {
++		struct cpt_x86_regs ri;
++		ri.cpt_next = sizeof(ri);
++		ri.cpt_object = CPT_OBJ_X86_REGS;
++		ri.cpt_hdrlen = sizeof(ri);
++		ri.cpt_content = CPT_CONTENT_VOID;
++
++		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++		ri.cpt_debugreg[4] = 0;
++		ri.cpt_debugreg[5] = 0;
++		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++		ri.cpt_fs = encode_segment(tsk->thread.fsindex);
++		ri.cpt_gs = encode_segment(tsk->thread.gsindex);
++
++		xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
++
++		ctx->write(&ri, sizeof(ri), ctx);
++	} else {
++		struct cpt_x86_64_regs ri;
++		ri.cpt_next = sizeof(ri);
++		ri.cpt_object = CPT_OBJ_X86_64_REGS;
++		ri.cpt_hdrlen = sizeof(ri);
++		ri.cpt_content = CPT_CONTENT_VOID;
++
++		ri.cpt_fsbase = tsk->thread.fs;
++		ri.cpt_gsbase = tsk->thread.gs;
++		ri.cpt_fsindex = encode_segment(tsk->thread.fsindex);
++		ri.cpt_gsindex = encode_segment(tsk->thread.gsindex);
++		ri.cpt_ds = encode_segment(tsk->thread.ds);
++		ri.cpt_es = encode_segment(tsk->thread.es);
++		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++		ri.cpt_debugreg[4] = 0;
++		ri.cpt_debugreg[5] = 0;
++		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++
++		memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs));
++
++		ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs);
++		ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss);
++
++		ctx->write(&ri, sizeof(ri), ctx);
++
++	}
++	cpt_close_object(ctx);
++
++	return 0;
++}
++
++#else
++
++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	struct cpt_x86_regs ri;
++	struct pt_regs *pt_regs;
++
++	cpt_open_object(NULL, ctx);
++
++	ri.cpt_next = sizeof(ri);
++	ri.cpt_object = CPT_OBJ_X86_REGS;
++	ri.cpt_hdrlen = sizeof(ri);
++	ri.cpt_content = CPT_CONTENT_VOID;
++
++	ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++	ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++	ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++	ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++	ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++	ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++
++	pt_regs = task_pt_regs(tsk);
++
++	ri.cpt_fs = encode_segment(pt_regs->fs);
++	ri.cpt_gs = encode_segment(tsk->thread.gs);
++
++	ri.cpt_ebx = pt_regs->bx;
++	ri.cpt_ecx = pt_regs->cx;
++	ri.cpt_edx = pt_regs->dx;
++	ri.cpt_esi = pt_regs->si;
++	ri.cpt_edi = pt_regs->di;
++	ri.cpt_ebp = pt_regs->bp;
++	ri.cpt_eax = pt_regs->ax;
++	ri.cpt_xds = pt_regs->ds;
++	ri.cpt_xes = pt_regs->es;
++	ri.cpt_orig_eax = pt_regs->orig_ax;
++	ri.cpt_eip = pt_regs->ip;
++	ri.cpt_xcs = pt_regs->cs;
++	ri.cpt_eflags = pt_regs->flags;
++	ri.cpt_esp = pt_regs->sp;
++	ri.cpt_xss = pt_regs->ss;
++
++	ri.cpt_xcs = encode_segment(pt_regs->cs);
++	ri.cpt_xss = encode_segment(pt_regs->ss);
++	ri.cpt_xds = encode_segment(pt_regs->ds);
++	ri.cpt_xes = encode_segment(pt_regs->es);
++
++	ctx->write(&ri, sizeof(ri), ctx);
++	cpt_close_object(ctx);
++
++	return 0;
++}
++#endif
++#endif
++
++#ifdef CONFIG_IA64
++
++/*
++   PMD?
++ */
++
++#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \
++						 CPT_TID(tsk), err); return -EINVAL; } } while (0) 
++
++static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk,
++			struct cpt_context *ctx)
++{
++	int err;
++	struct unw_frame_info info;
++	struct ia64_fpreg fpval;
++	int i;
++
++	unw_init_from_blocked_task(&info, tsk);
++	_C(unw_unwind_to_user(&info));
++
++	/* NAT_BITS */
++	do {
++		unsigned long scratch_unat;
++
++		scratch_unat = info.sw->caller_unat;
++		if (info.pri_unat_loc)
++			scratch_unat = *info.pri_unat_loc;
++
++		r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat);
++		/* Just to be on the safe side. */
++		r->nat[0] &= 0xFFFFFFFFUL;
++	} while (0);
++
++	/* R4-R7 */
++	for (i = 4; i <= 7; i++) {
++		char nat = 0;
++		_C(unw_access_gr(&info, i, &r->gr[i], &nat, 0));
++		r->nat[0] |= (nat != 0) << i;
++	}
++
++	/* B1-B5 */
++	for (i = 1; i <= 5; i++) {
++		_C(unw_access_br(&info, i, &r->br[i], 0));
++	}
++
++	/* AR_EC, AR_LC */
++	_C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0));
++	_C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0));
++
++	/* F2..F5, F16..F31 */
++	for (i = 2; i <= 5; i++) {
++		_C(unw_get_fr(&info, i, &fpval));
++		memcpy(&r->fr[i*2], &fpval, 16);
++	}
++	for (i = 16; i <= 31; i++) {
++		_C(unw_get_fr(&info, i, &fpval));
++		memcpy(&r->fr[i*2], &fpval, 16);
++	}
++	return 0;
++}
++
++#undef _C
++
++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	int err;
++	unsigned long pg;
++	struct cpt_ia64_regs *r;
++	struct ia64_psr *psr;
++	struct switch_stack *sw;
++	struct pt_regs *pt;
++	void *krbs = (void *)tsk + IA64_RBS_OFFSET;
++	unsigned long reg;
++
++	if (tsk->exit_state)
++		return 0;
++
++	pt = task_pt_regs(tsk);
++
++	sw = (struct switch_stack *) (tsk->thread.ksp + 16);
++
++	if ((pg = __get_free_page(GFP_KERNEL)) == 0)
++		return -ENOMEM;
++
++	r = (void*)pg;
++	/* Poison the buffer so we notice any register we forgot to fill in */
++	memset(r, 0xA5, sizeof(*r));
++
++	r->gr[0] = 0;
++	r->fr[0] = r->fr[1] = 0;
++	r->fr[2] = 0x8000000000000000UL;
++	r->fr[3] = 0xffff;
++
++	r->nat[0] = r->nat[1] = 0;
++
++	err = ass_to_mouth(r, tsk, ctx);
++	if (err) {
++		printk("ass_to_mouth error %d\n", err);
++		goto out;
++	}
++
++	/* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
++	memcpy(&r->gr[1], &pt->r1, 8*(2-1));
++	memcpy(&r->gr[2], &pt->r2, 8*(4-2));
++	memcpy(&r->gr[8], &pt->r8, 8*(12-8));
++	memcpy(&r->gr[12], &pt->r12, 8*(14-12));
++	memcpy(&r->gr[14], &pt->r14, 8*(15-14));
++	memcpy(&r->gr[15], &pt->r15, 8*(16-15));
++	memcpy(&r->gr[16], &pt->r16, 8*(32-16));
++
++	r->br[0] = pt->b0;
++	r->br[6] = pt->b6;
++	r->br[7] = pt->b7;
++
++	r->ar_bspstore = pt->ar_bspstore;
++	r->ar_unat = pt->ar_unat;
++	r->ar_pfs = pt->ar_pfs;
++	r->ar_ccv = pt->ar_ccv;
++	r->ar_fpsr = pt->ar_fpsr;
++	r->ar_csd = pt->ar_csd;
++	r->ar_ssd = pt->ar_ssd;
++	r->ar_rsc = pt->ar_rsc;
++
++	r->cr_iip = pt->cr_iip;
++	r->cr_ipsr = pt->cr_ipsr;
++
++	r->pr = pt->pr;
++
++	r->cfm = pt->cr_ifs;
++	r->ar_rnat = pt->ar_rnat;
++
++	/* fpregs 6..9,10..11 are in pt_regs */
++	memcpy(&r->fr[2*6], &pt->f6, 16*(10-6));
++	memcpy(&r->fr[2*10], &pt->f10, 16*(12-10));
++	/* fpreg 12..15 are on switch stack */
++	memcpy(&r->fr[2*12], &sw->f12, 16*(16-12));
++	/* fpregs 32...127 */
++	psr = ia64_psr(task_pt_regs(tsk));
++	preempt_disable();
++	if (ia64_is_local_fpu_owner(tsk) && psr->mfh) {
++		psr->mfh = 0;
++		tsk->thread.flags |= IA64_THREAD_FPH_VALID;
++		ia64_save_fpu(&tsk->thread.fph[0]);
++	}
++	preempt_enable();
++	memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32));
++
++	if (tsk->thread.flags & IA64_THREAD_DBG_VALID) {
++		memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr));
++		memcpy(r->dbr, tsk->thread.dbr, sizeof(r->dbr));
++	} else {
++		memset(r->ibr, 0, sizeof(r->ibr));
++		memset(r->dbr, 0, sizeof(r->dbr));
++	}
++
++	r->loadrs = pt->loadrs;
++	r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19));
++	if ((long)pt->cr_ifs > 0)
++		r->num_regs += (pt->cr_ifs & 0x7f);
++
++	if (r->num_regs > 96) {
++		eprintk_ctx(CPT_FID " too many RSE regs %lu\n",
++			    CPT_TID(tsk), r->num_regs);
++		err = -EINVAL;
++		goto out;	/* don't leak the page allocated above */
++	}
++
++	for (reg = 0; reg < r->num_regs; reg++) {
++		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
++		unsigned long *rnatp = ia64_rse_rnat_addr(ptr);
++
++		r->gr[32+reg] = *ptr;
++
++		if ((unsigned long)rnatp >= sw->ar_bspstore)
++			rnatp = &sw->ar_rnat;
++		if (*rnatp & (1UL<<ia64_rse_slot_num(ptr))) {
++			if (reg < 32)
++				r->nat[0] |= (1UL<<(reg+32));
++			else
++				r->nat[1] |= (1UL<<(reg-32));
++		}
++	}
++	if (r->nat[0] | r->nat[1])
++		wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk),
++			    r->nat[1], r->nat[0]);
++
++	cpt_open_object(NULL, ctx);
++	r->cpt_next = sizeof(*r);
++	r->cpt_object = CPT_OBJ_IA64_REGS;
++	r->cpt_hdrlen = sizeof(*r);
++	r->cpt_content = CPT_CONTENT_VOID;
++	ctx->write(r, sizeof(*r), ctx);
++	cpt_close_object(ctx);
++	err = 0;
++
++out:
++	free_page(pg);
++	return err;
++}
++#endif
++
++static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	struct cpt_obj_bits hdr;
++	unsigned long size;
++	void *start;
++
++	cpt_open_object(NULL, ctx);
++
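++	/*
++	 * The live part of the kernel stack lies between the saved stack
++	 * pointer and the stack base (thread.sp0 on x86; on ia64 between
++	 * thread.ksp and the saved pt_regs).
++	 */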
++#ifdef CONFIG_X86_64
++	size = tsk->thread.sp0 - tsk->thread.sp;
++	start = (void*)tsk->thread.sp;
++#elif defined(CONFIG_X86_32)
++	size = tsk->thread.sp0 - tsk->thread.sp;
++	start = (void*)tsk->thread.sp;
++#elif defined(CONFIG_IA64)
++	size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp;
++	start = (void*)tsk->thread.ksp;
++#else
++#error Arch is not supported
++#endif
++
++	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++	hdr.cpt_object = CPT_OBJ_BITS;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_content = CPT_CONTENT_STACK;
++	hdr.cpt_size = size;
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	ctx->write(start, size, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	return 0;
++}
++
++#ifdef CONFIG_X86
++/* Formats of i387_fxsave_struct are the same for x86_64
++ * and i386. Plain luck. */
++
++static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	struct cpt_obj_bits hdr;
++	unsigned long size;
++	int type;
++
++	cpt_open_object(NULL, ctx);
++
++	type = CPT_CONTENT_X86_FPUSTATE;
++	size = sizeof(struct i387_fxsave_struct);
++#ifndef CONFIG_X86_64
++	if (!cpu_has_fxsr) {
++		size = sizeof(struct i387_fsave_struct);
++		type = CPT_CONTENT_X86_FPUSTATE_OLD;
++	}
++#endif
++
++	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++	hdr.cpt_object = CPT_OBJ_BITS;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_content = type;
++	hdr.cpt_size = size;
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	ctx->write(&tsk->thread.xstate, size, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	return 0;
++}
++#endif
++
++#ifdef CONFIG_IA64
++
++static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	return 0;
++}
++#endif
++
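++/*
++ * siginfo_t embeds a union whose meaning depends on si_code, so the
++ * fields are packed into the fixed-layout image record case by case.
++ */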
++static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info)
++{
++	si->cpt_signo = info->si_signo;
++	si->cpt_errno = info->si_errno;
++	si->cpt_code = info->si_code;
++
++	switch(si->cpt_code & __SI_MASK) {
++	case __SI_TIMER:
++		si->cpt_pid = info->si_tid;
++		si->cpt_uid = info->si_overrun;
++		si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr);
++		si->cpt_utime = info->si_sys_private;
++		break;
++	case __SI_POLL:
++		si->cpt_pid = info->si_band;
++		si->cpt_uid = info->si_fd;
++		break;
++	case __SI_FAULT:
++		si->cpt_sigval = cpt_ptr_export(info->si_addr);
++#ifdef __ARCH_SI_TRAPNO
++		si->cpt_pid = info->si_trapno;
++#endif
++		break;
++	case __SI_CHLD:
++		si->cpt_pid = info->si_pid;
++		si->cpt_uid = info->si_uid;
++		si->cpt_sigval = info->si_status;
++		si->cpt_stime = info->si_stime;
++		si->cpt_utime = info->si_utime;
++		break;
++	case __SI_KILL:
++	case __SI_RT:
++	case __SI_MESGQ:
++	default:
++		si->cpt_pid = info->si_pid;
++		si->cpt_uid = info->si_uid;
++		si->cpt_sigval = cpt_ptr_export(info->si_ptr);
++		break;
++	}
++	return 0;
++}
++
++static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx)
++{
++	struct sigqueue *q;
++	loff_t saved_obj;
++
++	if (list_empty(&list->list))
++		return 0;
++
++	cpt_push_object(&saved_obj, ctx);
++	list_for_each_entry(q, &list->list, list) {
++		struct cpt_siginfo_image si;
++
++		si.cpt_next = sizeof(si);
++		si.cpt_object = CPT_OBJ_SIGINFO;
++		si.cpt_hdrlen = sizeof(si);
++		si.cpt_content = CPT_CONTENT_VOID;
++
++		si.cpt_qflags = q->flags;
++		si.cpt_user = q->user->uid;
++
++		if (encode_siginfo(&si, &q->info))
++			return -EINVAL;
++
++		ctx->write(&si, sizeof(si), ctx);
++	}
++	cpt_pop_object(&saved_obj, ctx);
++	return 0;
++}
++
++
++
++static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct signal_struct *sig = obj->o_obj;
++	struct cpt_signal_image *v = cpt_get_buf(ctx);
++	struct task_struct *tsk;
++	int i;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_SIGNAL_STRUCT;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	if (sig->__pgrp <= 0) {
++		eprintk_ctx("bad pgid\n");
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
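++	/* Mark the process group as orphaned when no task with this pgid
++	 * exists any more. */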
++	v->cpt_pgrp_type = CPT_PGRP_NORMAL;
++	read_lock(&tasklist_lock);
++	tsk = find_task_by_pid(sig->__pgrp);
++	if (tsk == NULL)
++		v->cpt_pgrp_type = CPT_PGRP_ORPHAN;
++	read_unlock(&tasklist_lock);
++	v->cpt_pgrp = pid_to_vpid(sig->__pgrp);
++
++	v->cpt_old_pgrp = 0;
++/*	if (!sig->tty_old_pgrp) {
++		eprintk_ctx("bad tty_old_pgrp\n");
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}*/
++	if (sig->tty_old_pgrp) {
++		v->cpt_old_pgrp_type = CPT_PGRP_NORMAL;
++		read_lock(&tasklist_lock);
++		tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID);
++		if (tsk == NULL) {
++			v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN;
++			tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID);
++		}
++		read_unlock(&tasklist_lock);
++		if (tsk == NULL) {
++			eprintk_ctx("tty_old_pgrp does not exist anymore\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp);
++		if ((int)v->cpt_old_pgrp < 0) {
++			dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp));
++			v->cpt_old_pgrp = -1;
++			v->cpt_old_pgrp_type = CPT_PGRP_STRAY;
++		}
++	}
++
++	if (sig->__session <= 0) {
++		eprintk_ctx("bad session\n");
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++	v->cpt_session_type = CPT_PGRP_NORMAL;
++	read_lock(&tasklist_lock);
++	tsk = find_task_by_pid(sig->__session);
++	if (tsk == NULL)
++		v->cpt_session_type = CPT_PGRP_ORPHAN;
++	read_unlock(&tasklist_lock);
++	v->cpt_session = pid_to_vpid(sig->__session);
++
++	v->cpt_leader = sig->leader;
++	v->cpt_ctty = CPT_NULL;
++	if (sig->tty) {
++		cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx);
++		if (cobj)
++			v->cpt_ctty = cobj->o_pos;
++		else {
++			eprintk_ctx("controlling tty is not found\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8);
++
++	v->cpt_curr_target = 0;
++	if (sig->curr_target)
++		v->cpt_curr_target = task_pid_vnr(sig->curr_target);
++	v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0);
++	v->cpt_group_exit_code = sig->group_exit_code;
++	v->cpt_group_exit_task = 0;
++	if (sig->group_exit_task)
++		v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task);
++	v->cpt_notify_count = sig->notify_count;
++	v->cpt_group_stop_count = sig->group_stop_count;
++
++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8)
++	v->cpt_utime = sig->utime;
++	v->cpt_stime = sig->stime;
++	v->cpt_cutime = sig->cutime;
++	v->cpt_cstime = sig->cstime;
++	v->cpt_nvcsw = sig->nvcsw;
++	v->cpt_nivcsw = sig->nivcsw;
++	v->cpt_cnvcsw = sig->cnvcsw;
++	v->cpt_cnivcsw = sig->cnivcsw;
++	v->cpt_min_flt = sig->min_flt;
++	v->cpt_maj_flt = sig->maj_flt;
++	v->cpt_cmin_flt = sig->cmin_flt;
++	v->cpt_cmaj_flt = sig->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#endif
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	dump_sigqueue(&sig->shared_pending, ctx);
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
++{
++	if (tsk->splice_pipe) {
++		eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#ifdef CONFIG_KEYS
++	if (tsk->request_key_auth || tsk->thread_keyring) {
++		eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++#ifdef CONFIG_NUMA
++	if (tsk->mempolicy) {
++		eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++#ifdef CONFIG_TUX
++	if (tsk->tux_info) {
++		eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++	return 0;
++}
++
++static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct task_struct *tsk = obj->o_obj;
++	int last_thread;
++	struct cpt_task_image *v = cpt_get_buf(ctx);
++	cpt_object_t *tobj;
++	cpt_object_t *tg_obj;
++	loff_t saved_obj;
++	int i;
++	int err;
++	struct timespec delta;
++	struct mm_struct * tsk_mm;
++	struct files_struct * tsk_files;
++	struct fs_struct * tsk_fs;
++	struct mnt_namespace * tsk_ns;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_signal = CPT_NULL;
++	tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
++	if (!tg_obj) BUG();
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_TASK;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_state = tsk->state;
++	if (tsk->state == EXIT_ZOMBIE) {
++		eprintk_ctx("invalid zombie state on " CPT_FID "\n", CPT_TID(tsk));
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	} else if (tsk->state == EXIT_DEAD) {
++		if (tsk->exit_state != EXIT_DEAD &&
++		    tsk->exit_state != EXIT_ZOMBIE) {
++			eprintk_ctx("invalid exit_state %d on " CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	if (tsk->exit_state) {
++		v->cpt_state = tsk->exit_state;
++		if (tsk->state != EXIT_DEAD) {
++			eprintk_ctx("invalid tsk->state %ld/%d on " CPT_FID "\n",
++				tsk->state, tsk->exit_state, CPT_TID(tsk));
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	if (cpt_check_unsupported(tsk, ctx)) {
++		cpt_release_buf(ctx);
++		return -EBUSY;
++	}
++
++	v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART);
++	v->cpt_ptrace = tsk->ptrace;
++	v->cpt_prio = tsk->prio;
++	v->cpt_exit_code = tsk->exit_code;
++	v->cpt_exit_signal = tsk->exit_signal;
++	v->cpt_pdeath_signal = tsk->pdeath_signal;
++	v->cpt_static_prio = tsk->static_prio;
++	v->cpt_rt_priority = tsk->rt_priority;
++	v->cpt_policy = tsk->policy;
++	if (v->cpt_policy != SCHED_NORMAL) {
++		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++
++	/* Unpleasant moment. When the leader of a thread group exits,
++	 * it remains a zombie until the whole group exits.
++	 * We save non-NULL pointers to the group's mm/files/fs, so
++	 * that we can restore this thread group.
++	 */
++	tsk_mm = tsk->mm;
++	tsk_files = tsk->files;
++	tsk_fs = tsk->fs;
++	tsk_ns = tsk->nsproxy ? tsk->nsproxy->mnt_ns : NULL;
++
++	if (tsk->exit_state && !thread_group_empty(tsk) &&
++	    thread_group_leader(tsk)) {
++		struct task_struct * p = tsk;
++
++		read_lock(&tasklist_lock);
++		do {
++			if (p->mm)
++				tsk_mm = p->mm;
++			if (p->files)
++				tsk_files = p->files;
++			if (p->fs)
++				tsk_fs = p->fs;
++			if (p->nsproxy && p->nsproxy->mnt_ns)
++				tsk_ns = p->nsproxy->mnt_ns;
++			p = next_thread(p);
++		} while (p != tsk);
++		read_unlock(&tasklist_lock);
++	}
++
++	v->cpt_mm = CPT_NULL;
++	if (tsk_mm) {
++		tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx);
++		if (!tobj) BUG();
++		v->cpt_mm = tobj->o_pos;
++	}
++	v->cpt_files = CPT_NULL;
++	if (tsk_files) {
++		tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx);
++		if (!tobj) BUG();
++		v->cpt_files = tobj->o_pos;
++	}
++	v->cpt_fs = CPT_NULL;
++	if (tsk_fs) {
++		tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx);
++		if (!tobj) BUG();
++		v->cpt_fs = tobj->o_pos;
++	}
++	v->cpt_namespace = CPT_NULL;
++	if (tsk_ns) {
++		tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx);
++		if (!tobj) BUG();
++		v->cpt_namespace = tobj->o_pos;
++
++		if (tsk_ns != current->nsproxy->mnt_ns)
++			eprintk_ctx("namespaces are not supported:"
++					"process " CPT_FID "\n", CPT_TID(tsk));
++	}
++	v->cpt_sysvsem_undo = CPT_NULL;
++	if (tsk->sysvsem.undo_list && !tsk->exit_state) {
++		tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx);
++		if (!tobj) BUG();
++		v->cpt_sysvsem_undo = tobj->o_pos;
++	}
++	v->cpt_sighand = CPT_NULL;
++	if (tsk->sighand) {
++		tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx);
++		if (!tobj) BUG();
++		v->cpt_sighand = tobj->o_pos;
++	}
++	v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked);
++	v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked);
++	v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask);
++
++	v->cpt_pid = task_pid_vnr(tsk);
++	v->cpt_tgid = task_tgid_vnr(tsk);
++	v->cpt_ppid = 0;
++	if (tsk->parent) {
++		if (tsk->parent != tsk->real_parent &&
++		    !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) {
++			eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm);
++			cpt_release_buf(ctx);
++			return -EBUSY;
++		}
++		v->cpt_ppid = task_pid_vnr(tsk->parent);
++	}
++	v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0;
++	v->cpt_pgrp = task_pgrp_vnr(tsk);
++	v->cpt_session = task_session_vnr(tsk);
++	v->cpt_old_pgrp = 0;
++	if (tsk->signal->tty_old_pgrp)
++		v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp);
++	v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0;
++	v->cpt_set_tid = (unsigned long)tsk->set_child_tid;
++	v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid;
++	memcpy(v->cpt_comm, tsk->comm, 16);
++	v->cpt_user = tsk->user->uid;
++	v->cpt_uid = tsk->uid;
++	v->cpt_euid = tsk->euid;
++	v->cpt_suid = tsk->suid;
++	v->cpt_fsuid = tsk->fsuid;
++	v->cpt_gid = tsk->gid;
++	v->cpt_egid = tsk->egid;
++	v->cpt_sgid = tsk->sgid;
++	v->cpt_fsgid = tsk->fsgid;
++	v->cpt_ngids = 0;
++	if (tsk->group_info && tsk->group_info->ngroups != 0) {
++		int i = tsk->group_info->ngroups;
++		if (i > 32) {
++			/* Shame... I did a simplified version and _forgot_
++			 * about this. Later, later. */
++			eprintk_ctx("too many groups " CPT_FID "\n", CPT_TID(tsk));
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		v->cpt_ngids = i;
++		for (i--; i>=0; i--)
++			v->cpt_gids[i] = tsk->group_info->small_block[i];
++	}
++	v->cpt_prctl_uac = 0;
++	v->cpt_prctl_fpemu = 0;
++	v->__cpt_pad1 = 0;
++#ifdef CONFIG_IA64
++	v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
++	v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
++#endif
++	memcpy(&v->cpt_ecap, &tsk->cap_effective, 8);
++	memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8);
++	memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8);
++	v->cpt_keepcap = tsk->securebits;
++
++	v->cpt_did_exec = tsk->did_exec;
++	v->cpt_exec_domain = -1;
++	v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
++	v->cpt_64bit = 0;
++#ifdef CONFIG_X86_64
++	/* Clear x86_64 specific flags */
++	v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
++	if (!(task_thread_info(tsk)->flags & _TIF_IA32)) {
++		ctx->tasks64++;
++		v->cpt_64bit = 1;
++	}
++#endif
++#ifdef CONFIG_IA64
++	/* Clear ia64 specific flags */
++	//// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
++	if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
++		ctx->tasks64++;
++		v->cpt_64bit = 1;
++	}
++#endif
++	v->cpt_thrstatus = task_thread_info(tsk)->status;
++	v->cpt_addr_limit = -1;
++
++	v->cpt_personality = tsk->personality;
++
++#ifdef CONFIG_X86
++	for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
++		if (i>=3) {
++			eprintk_ctx("too many tls descs\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
++	}
++#endif
++
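++	/*
++	 * If the task was interrupted inside a restartable syscall, translate
++	 * the known restart_block variants into the image; absolute wakeup
++	 * times are rebased against the monotonic clock so that they survive
++	 * migration to another host.
++	 */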
++	v->cpt_restart.fn = CPT_RBL_0;
++	if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) {
++		struct restart_block *rb = &task_thread_info(tsk)->restart_block;
++		ktime_t e;
++
++		if (rb->fn == hrtimer_nanosleep_restart) {
++			v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
++
++			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
++		if (rb->fn == compat_nanosleep_restart) {
++			v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
++
++			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++#endif
++		if (rb->fn == do_restart_poll) {
++			u64 timeout_jiffies;
++
++			timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2;
++			e.tv64 = timeout_jiffies * TICK_NSEC;
++
++			v->cpt_restart.fn = CPT_RBL_POLL;
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++		if (rb->fn == futex_wait_restart) {
++			v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT;
++
++			e.tv64 = rb->futex.time;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr;
++			v->cpt_restart.arg1 = rb->futex.val;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = rb->futex.flags;
++			goto continue_dump;
++		}
++		eprintk_ctx("unknown restart block %p\n", rb->fn);
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++
++continue_dump:
++	v->cpt_it_real_incr = 0;
++	v->cpt_it_prof_incr = 0;
++	v->cpt_it_virt_incr = 0;
++	v->cpt_it_real_value = 0;
++	v->cpt_it_prof_value = 0;
++	v->cpt_it_virt_value = 0;
++	if (thread_group_leader(tsk) && tsk->exit_state == 0) {
++		ktime_t rem;
++
++		v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
++		v->cpt_it_prof_incr = tsk->signal->it_prof_incr;
++		v->cpt_it_virt_incr = tsk->signal->it_virt_incr;
++
++		rem = hrtimer_get_remaining(&tsk->signal->real_timer);
++
++		if (hrtimer_active(&tsk->signal->real_timer)) {
++			if (rem.tv64 <= 0)
++				rem.tv64 = NSEC_PER_USEC;
++			v->cpt_it_real_value = ktime_to_ns(rem);
++			dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
++		}
++		v->cpt_it_prof_value = tsk->signal->it_prof_expires;
++		v->cpt_it_virt_value = tsk->signal->it_virt_expires;
++	}
++	v->cpt_used_math = (tsk_used_math(tsk) != 0);
++
++	if (tsk->notifier) {
++		eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++
++	v->cpt_utime = tsk->utime;
++	v->cpt_stime = tsk->stime;
++	delta = tsk->start_time;
++	_set_normalized_timespec(&delta,
++			delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
++			delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
++	v->cpt_starttime = cpt_timespec_export(&delta);
++	v->cpt_nvcsw = tsk->nvcsw;
++	v->cpt_nivcsw = tsk->nivcsw;
++	v->cpt_min_flt = tsk->min_flt;
++	v->cpt_maj_flt = tsk->maj_flt;
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
++	v->cpt_cutime = tsk->cutime;
++	v->cpt_cstime = tsk->cstime;
++	v->cpt_cnvcsw = tsk->cnvcsw;
++	v->cpt_cnivcsw = tsk->cnivcsw;
++	v->cpt_cmin_flt = tsk->cmin_flt;
++	v->cpt_cmaj_flt = tsk->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#else
++	v->cpt_cutime = tsk->signal->cutime;
++	v->cpt_cstime = tsk->signal->cstime;
++	v->cpt_cnvcsw = tsk->signal->cnvcsw;
++	v->cpt_cnivcsw = tsk->signal->cnivcsw;
++	v->cpt_cmin_flt = tsk->signal->cmin_flt;
++	v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#endif
++
++#ifdef CONFIG_BEANCOUNTERS
++	if (tsk->mm)
++		v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
++	else
++		v->cpt_mm_ub = CPT_NULL;
++	v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
++	v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
++	v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
++#endif
++
++	v->cpt_ptrace_message = tsk->ptrace_message;
++	v->cpt_pn_state = tsk->pn_state;
++	v->cpt_stopped_state = tsk->stopped_state;
++	v->cpt_sigsuspend_state = 0;
++
++#ifdef CONFIG_X86_32
++	if (tsk->thread.vm86_info) {
++		eprintk_ctx("vm86 task is running\n");
++		cpt_release_buf(ctx);
++		return -EBUSY;
++	}
++#endif
++
++	v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	dump_kstack(tsk, ctx);
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	err = dump_registers(tsk, ctx);
++	cpt_pop_object(&saved_obj, ctx);
++	if (err)
++		return err;
++
++	if (tsk_used_math(tsk)) {
++		cpt_push_object(&saved_obj, ctx);
++		dump_fpustate(tsk, ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++
++	if (tsk->last_siginfo) {
++		struct cpt_siginfo_image si;
++		cpt_push_object(&saved_obj, ctx);
++
++		si.cpt_next = sizeof(si);
++		si.cpt_object = CPT_OBJ_LASTSIGINFO;
++		si.cpt_hdrlen = sizeof(si);
++		si.cpt_content = CPT_CONTENT_VOID;
++
++		if (encode_siginfo(&si, tsk->last_siginfo))
++			return -EINVAL;
++
++		ctx->write(&si, sizeof(si), ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++
++	if (tsk->sas_ss_size) {
++		struct cpt_sigaltstack_image si;
++		cpt_push_object(&saved_obj, ctx);
++
++		si.cpt_next = sizeof(si);
++		si.cpt_object = CPT_OBJ_SIGALTSTACK;
++		si.cpt_hdrlen = sizeof(si);
++		si.cpt_content = CPT_CONTENT_VOID;
++
++		si.cpt_stack = tsk->sas_ss_sp;
++		si.cpt_stacksize = tsk->sas_ss_size;
++
++		ctx->write(&si, sizeof(si), ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++
++	if (tsk->robust_list
++#ifdef CONFIG_COMPAT
++	    || tsk->compat_robust_list
++#endif
++	    ) {
++		struct cpt_task_aux_image ai;
++		cpt_push_object(&saved_obj, ctx);
++
++		ai.cpt_next = sizeof(ai);
++		ai.cpt_object = CPT_OBJ_TASK_AUX;
++		ai.cpt_hdrlen = sizeof(ai);
++		ai.cpt_content = CPT_CONTENT_VOID;
++
++		ai.cpt_robust_list = (unsigned long)tsk->robust_list;
++#ifdef CONFIG_X86_64
++#ifdef CONFIG_COMPAT
++		if (task_thread_info(tsk)->flags & _TIF_IA32)
++			ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list;
++#endif
++#endif
++		ctx->write(&ai, sizeof(ai), ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++
++	dump_sigqueue(&tsk->pending, ctx);
++
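++	/*
++	 * The shared signal_struct is dumped once, after the last thread of
++	 * the group; the cpt_signal field of every task image in the group
++	 * is then patched to point at it.
++	 */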
++	last_thread = 1;
++	read_lock(&tasklist_lock);
++	do {
++		struct task_struct * next = next_thread(tsk);
++		if (next != tsk && !thread_group_leader(next))
++			last_thread = 0;
++	} while (0);
++	read_unlock(&tasklist_lock);
++
++	if (last_thread) {
++		struct task_struct *prev_tsk;
++		int err;
++		loff_t pos = ctx->file->f_pos;
++
++		cpt_push_object(&saved_obj, ctx);
++		err = dump_one_signal_struct(tg_obj, ctx);
++		cpt_pop_object(&saved_obj, ctx);
++		if (err)
++			return err;
++
++		prev_tsk = tsk;
++		for (;;) {
++			if (prev_tsk->tgid == tsk->tgid) {
++				loff_t tg_pos;
++
++				tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal);
++				ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos);
++				if (thread_group_leader(prev_tsk))
++					break;
++			}
++
++			if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) {
++				eprintk_ctx("bug: thread group leader is lost\n");
++				return -EINVAL;
++			}
++
++			obj = list_entry(obj->o_list.prev, cpt_object_t, o_list);
++			prev_tsk = obj->o_obj;
++		}
++	}
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++int cpt_dump_tasks(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_TASKS);
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		int err;
++
++		if ((err = dump_one_process(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++	return 0;
++}
++
++int cpt_collect_signals(cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	/* Collect process fd sets */
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) {
++			eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm);
++			return -EBUSY;
++		}
++		if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL)
++			return -ENOMEM;
++		if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL)
++			return -ENOMEM;
++	}
++	return 0;
++}
++
++
++static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct sighand_struct *sig = obj->o_obj;
++	struct cpt_sighand_image *v = cpt_get_buf(ctx);
++	int i;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_SIGHAND_STRUCT;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	for (i=0; i< _NSIG; i++) {
++		if (sig->action[i].sa.sa_handler != SIG_DFL ||
++		    sig->action[i].sa.sa_flags) {
++			loff_t saved_obj;
++			struct cpt_sighandler_image *o = cpt_get_buf(ctx);
++
++			cpt_push_object(&saved_obj, ctx);
++			cpt_open_object(NULL, ctx);
++
++			o->cpt_next = CPT_NULL;
++			o->cpt_object = CPT_OBJ_SIGHANDLER;
++			o->cpt_hdrlen = sizeof(*o);
++			o->cpt_content = CPT_CONTENT_VOID;
++
++			o->cpt_signo = i;
++			o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler;
++			o->cpt_restorer = 0;
++#ifdef CONFIG_X86
++			o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer;
++#endif
++			o->cpt_flags = sig->action[i].sa.sa_flags;
++			memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8);
++			ctx->write(o, sizeof(*o), ctx);
++			cpt_release_buf(ctx);
++			cpt_close_object(ctx);
++			cpt_pop_object(&saved_obj, ctx);
++		}
++	}
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++int cpt_dump_sighand(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT);
++
++	for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) {
++		int err;
++
++		if ((err = dump_one_sighand_struct(obj, ctx)) != 0)
++			return err;
++	}
++
++	cpt_close_section(ctx);
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_process.h b/kernel/cpt/cpt_process.h
+new file mode 100644
+index 0000000..b9f28af
+--- /dev/null
++++ b/kernel/cpt/cpt_process.h
+@@ -0,0 +1,13 @@
++int cpt_collect_signals(cpt_context_t *);
++int cpt_dump_signal(struct cpt_context *);
++int cpt_dump_sighand(struct cpt_context *);
++int cpt_dump_tasks(struct cpt_context *);
++
++int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx);
++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int rst_restore_process(struct cpt_context *ctx);
++int rst_process_linkage(struct cpt_context *ctx);
++
++int check_task_state(struct task_struct *tsk, struct cpt_context *ctx);
++struct pid *alloc_vpid_safe(pid_t vnr);
+diff --git a/kernel/cpt/cpt_socket.c b/kernel/cpt/cpt_socket.c
+new file mode 100644
+index 0000000..4878df1
+--- /dev/null
++++ b/kernel/cpt/cpt_socket.c
+@@ -0,0 +1,790 @@
++/*
++ *
++ *  kernel/cpt/cpt_socket.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/un.h>
++#include <linux/tcp.h>
++#include <net/sock.h>
++#include <net/scm.h>
++#include <net/af_unix.h>
++#include <net/tcp.h>
++#include <net/netlink_sock.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
++
++
++/* Sockets are quite different from other kinds of files.
++ * There is one simplification: only one struct file can refer to a socket,
++ * so we could store information about a socket directly in section FILES as
++ * a description of a file and append, e.g., the array of not-yet-accepted
++ * connections of a listening socket as auxiliary data.
++ *
++ * Complications are:
++ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
++ *    so we have to create a special section for orphans.
++ * 2. AF_UNIX sockets are distinguished objects: the set of links between
++ *    AF_UNIX sockets is quite arbitrary.
++ *    A. Each socket can refer to many files due to FD passing.
++ *    B. Each socket except for connected ones can have queued skbs
++ *       sent by any of the sockets.
++ *
++ *    2A is relatively easy: after our tasks are frozen we make an additional
++ *    recursive pass through the set of collected files and pick up the files
++ *    referenced via FD passing. After the recursion ends, all the files are
++ *    treated in the same way and stored in section FILES.
++ *
++ *    2B. We have to resolve all those references at some point.
++ *    It is the place where the pipe-like approach to the image fails.
++ *
++ * All this makes socket checkpointing quite cumbersome.
++ * Right now we collect all the sockets and assign a numeric index
++ * to each of them. The socket section is separate and put after section FILES,
++ * so section FILES refers to sockets by index, while section SOCKET refers to
++ * FILES as usual by position in the image. All the refs inside the socket
++ * section are by index. When restoring we read the socket section and create
++ * objects to hold the index <-> pos mappings. On the second pass we open the
++ * sockets (simultaneously with their pairs) and create FILE objects.
++ */
++
++
++/* ====== FD passing ====== */
++
++/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
++ * have to implement this. A problem is that in the general case we receive
++ * skbs from an unknown context, so new files can arrive at the checkpointed
++ * set of processes even after they are stopped. Well, we are just going
++ * to ignore unknown fds while doing real checkpointing. That is fair because
++ * links outside the checkpointed set are going to fail anyway.
++ *
++ * ATTN: the procedure is recursive. We linearize the recursion by adding
++ * newly found files to the end of the file list, so they will be analyzed
++ * in the same loop.
++ */
++
++static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
++{
++	struct inode *inode = file->f_dentry->d_inode;
++	struct socket *sock;
++	struct sock *sk;
++	struct sk_buff *skb;
++
++	if (!S_ISSOCK(inode->i_mode))
++		return -ENOTSOCK;
++
++	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
++
++	if (sock->ops->family != AF_UNIX)
++		return 0;
++
++	sk = sock->sk;
++
++	/* Subtle locking issue. skbs cannot be removed while
++	 * we are scanning, because all the processes are stopped.
++	 * They still can be added to tail of queue. Locking while
++	 * we dereference skb->next is enough to resolve this.
++	 * See above about collision with skbs added after we started
++	 * checkpointing.
++	 */
++
++	skb = skb_peek(&sk->sk_receive_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++		if (UNIXCB(skb).fp && skb->sk &&
++		    (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
++			struct scm_fp_list *fpl = UNIXCB(skb).fp;
++			int i;
++
++			for (i = fpl->count-1; i >= 0; i--) {
++				if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
++					return -ENOMEM;
++			}
++		}
++
++		spin_lock_irq(&sk->sk_receive_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_receive_queue.lock);
++	}
++
++	return 0;
++}
++
++int cpt_collect_passedfds(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++
++		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
++			int err;
++
++			if ((err = collect_one_passedfd(file, ctx)) < 0)
++				return err;
++		}
++	}
++
++	return 0;
++}
++
++/* ====== End of FD passing ====== */
++
++/* Must be called under bh_lock_sock() */
++
++void clear_backlog(struct sock *sk)
++{
++	struct sk_buff *skb = sk->sk_backlog.head;
++
++	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
++	while (skb) {
++		struct sk_buff *next = skb->next;
++
++		skb->next = NULL;
++		kfree_skb(skb);
++		skb = next;
++	}
++}
++
++void release_sock_nobacklog(struct sock *sk)
++{
++	spin_lock_bh(&(sk->sk_lock.slock));
++	clear_backlog(sk);
++	sk->sk_lock.owned = 0;
++        if (waitqueue_active(&(sk->sk_lock.wq)))
++		wake_up(&(sk->sk_lock.wq));
++	spin_unlock_bh(&(sk->sk_lock.slock));
++}
++
++int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
++		 struct cpt_context *ctx)
++{
++	struct cpt_skb_image *v = cpt_get_buf(ctx);
++	loff_t saved_obj;
++	struct timeval tmptv;
++
++	cpt_push_object(&saved_obj, ctx);
++	cpt_open_object(NULL, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_SKB;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_owner = owner;
++	v->cpt_queue = type;
++	skb_get_timestamp(skb, &tmptv);
++	v->cpt_stamp = cpt_timeval_export(&tmptv);
++	v->cpt_hspace = skb->data - skb->head;
++	v->cpt_tspace = skb->end - skb->tail;
++	v->cpt_h = skb_transport_header(skb) - skb->head;
++	v->cpt_nh = skb_network_header(skb) - skb->head;
++	v->cpt_mac = skb_mac_header(skb) - skb->head;
++	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb));
++	memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
++	if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
++		int i;
++		for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
++			if (skb->cb[i]) {
++				wprintk_ctx("dirty skb cb");
++				break;
++			}
++		}
++	}
++	v->cpt_len = skb->len;
++	v->cpt_mac_len = skb->mac_len;
++	v->cpt_csum = skb->csum;
++	v->cpt_local_df = skb->local_df;
++	v->cpt_pkt_type = skb->pkt_type;
++	v->cpt_ip_summed = skb->ip_summed;
++	v->cpt_priority = skb->priority;
++	v->cpt_protocol = skb->protocol;
++	v->cpt_security = 0;
++	v->cpt_gso_segs = skb_shinfo(skb)->gso_segs;
++	v->cpt_gso_size = skb_shinfo(skb)->gso_size;
++	if (skb_shinfo(skb)->gso_type) {
++		eprintk_ctx("skb ufo is not supported\n");
++		return -EINVAL;
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	if (skb->len + (skb->data - skb->head) > 0) {
++		struct cpt_obj_bits ob;
++		loff_t saved_obj2;
++
++		cpt_push_object(&saved_obj2, ctx);
++		cpt_open_object(NULL, ctx);
++		ob.cpt_next = CPT_NULL;
++		ob.cpt_object = CPT_OBJ_BITS;
++		ob.cpt_hdrlen = sizeof(ob);
++		ob.cpt_content = CPT_CONTENT_DATA;
++		ob.cpt_size = skb->len + v->cpt_hspace;
++
++		ctx->write(&ob, sizeof(ob), ctx);
++
++		ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
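++		/* Non-linear skb: copy the paged fragments out through
++		 * skb_copy_bits(), one page-sized chunk at a time. */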
++		if (skb->data_len) {
++			int offset = skb->len - skb->data_len;
++			while (offset < skb->len) {
++				int copy = skb->len - offset;
++				if (copy > PAGE_SIZE)
++					copy = PAGE_SIZE;
++				(void)cpt_get_buf(ctx);
++				if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
++					BUG();
++				ctx->write(ctx->tmpbuf, copy, ctx);
++				__cpt_release_buf(ctx);
++				offset += copy;
++			}
++		}
++
++		ctx->align(ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_obj2, ctx);
++	}
++
++	if (skb->sk && skb->sk->sk_family == AF_UNIX) {
++		struct scm_fp_list *fpl = UNIXCB(skb).fp;
++
++		if (fpl) {
++			int i;
++
++			for (i = 0; i < fpl->count; i++) {
++				struct cpt_fd_image v;
++				cpt_object_t *obj;
++				loff_t saved_obj2;
++
++				obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx);
++
++				if (!obj) {
++					eprintk_ctx("lost passed FD\n");
++					return -EINVAL;
++				}
++
++				cpt_push_object(&saved_obj2, ctx);
++				cpt_open_object(NULL, ctx);
++				v.cpt_next = CPT_NULL;
++				v.cpt_object = CPT_OBJ_FILEDESC;
++				v.cpt_hdrlen = sizeof(v);
++				v.cpt_content = CPT_CONTENT_VOID;
++
++				v.cpt_fd = i;
++				v.cpt_file = obj->o_pos;
++				v.cpt_flags = 0;
++				ctx->write(&v, sizeof(v), ctx);
++				cpt_close_object(ctx);
++				cpt_pop_object(&saved_obj2, ctx);
++			}
++		}
++	}
++
++	cpt_close_object(ctx);
++	cpt_pop_object(&saved_obj, ctx);
++	return 0;
++}
++
++static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++	struct sk_buff *skb;
++	struct sock *sk_cache = NULL;
++
++	skb = skb_peek(&sk->sk_receive_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++		int err;
++
++		if (sk->sk_family == AF_UNIX) {
++			cpt_object_t *obj;
++			if (skb->sk != sk_cache) {
++				idx = -1;
++				sk_cache = NULL;
++				obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx);
++				if (obj) {
++					idx = obj->o_index;
++					sk_cache = skb->sk;
++				} else if (unix_peer(sk) != skb->sk)
++					goto next_skb;
++			}
++		}
++
++		err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx);
++		if (err)
++			return err;
++
++next_skb:
++		spin_lock_irq(&sk->sk_receive_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_receive_queue.lock);
++	}
++	return 0;
++}
++
++static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++	struct sk_buff *skb;
++
++	skb = skb_peek(&sk->sk_write_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) {
++		int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx);
++		if (err)
++			return err;
++
++		spin_lock_irq(&sk->sk_write_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_write_queue.lock);
++	}
++	return 0;
++}
++
++void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx)
++{
++	loff_t saved_obj;
++	if (sk->sk_filter) {
++		struct cpt_obj_bits v;
++
++		cpt_push_object(&saved_obj, ctx);
++		cpt_open_object(NULL, ctx);
++
++		v.cpt_next = CPT_NULL;
++		v.cpt_object = CPT_OBJ_SKFILTER;
++		v.cpt_hdrlen = sizeof(v);
++		v.cpt_content = CPT_CONTENT_DATA;
++		v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter);
++
++		ctx->write(&v, sizeof(v), ctx);
++		ctx->write(sk->sk_filter->insns, v.cpt_size, ctx);
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++		cpt_push_object(&saved_obj, ctx);
++		cpt_dump_mcfilter(sk, ctx);
++		cpt_pop_object(&saved_obj, ctx);
++	}
++}
++
++/* Dump socket content */
++
++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
++{
++	struct cpt_sock_image *v = cpt_get_buf(ctx);
++	struct socket *sock;
++	struct timeval tmptv;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_SOCKET;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_file = CPT_NULL;
++	sock = sk->sk_socket;
++	if (sock && sock->file) {
++		cpt_object_t *tobj;
++		tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
++		if (tobj)
++			v->cpt_file = tobj->o_pos;
++	}
++	v->cpt_index = index;
++	v->cpt_parent = parent;
++
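++	/* Lock INET sockets (once per object; o_lock guards against double
++	 * locking) so their state cannot change while being dumped. */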
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++		if (sock && !obj->o_lock) {
++			lockdep_off();
++			lock_sock(sk);
++			lockdep_on();
++			obj->o_lock = 1;
++		}
++	}
++
++	/* Some bits stored in inode */
++	v->cpt_ssflags = sock ? sock->flags : 0;
++	v->cpt_sstate = sock ? sock->state : 0;
++	v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
++
++	/* Common data */
++	v->cpt_family = sk->sk_family;
++	v->cpt_type = sk->sk_type;
++	v->cpt_state = sk->sk_state;
++	v->cpt_reuse = sk->sk_reuse;
++	v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
++	v->cpt_shutdown = sk->sk_shutdown;
++	v->cpt_userlocks = sk->sk_userlocks;
++	v->cpt_no_check = sk->sk_no_check;
++	v->cpt_zapped = sock_flag(sk, SOCK_DBG);
++	v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
++	v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
++	v->cpt_protocol = sk->sk_protocol;
++	v->cpt_err = sk->sk_err;
++	v->cpt_err_soft = sk->sk_err_soft;
++	v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
++	v->cpt_priority = sk->sk_priority;
++	v->cpt_rcvlowat = sk->sk_rcvlowat;
++	v->cpt_rcvtimeo = CPT_NULL;
++	if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
++		v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo;
++	v->cpt_sndtimeo = CPT_NULL;
++	if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT)
++		v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo;
++	v->cpt_rcvbuf = sk->sk_rcvbuf;
++	v->cpt_sndbuf = sk->sk_sndbuf;
++	v->cpt_bound_dev_if = sk->sk_bound_dev_if;
++	v->cpt_flags = sk->sk_flags;
++	v->cpt_lingertime = CPT_NULL;
++	if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT)
++		v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime;
++	v->cpt_peer_pid = sk->sk_peercred.pid;
++	v->cpt_peer_uid = sk->sk_peercred.uid;
++	v->cpt_peer_gid = sk->sk_peercred.gid;
++	tmptv = ktime_to_timeval(sk->sk_stamp);
++	v->cpt_stamp = cpt_timeval_export(&tmptv);
++
++	v->cpt_peer = -1;
++	v->cpt_socketpair = 0;
++	v->cpt_deleted = 0;
++
++	v->cpt_laddrlen = 0;
++	if (sock) {
++		int alen = sizeof(v->cpt_laddr);
++		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++		v->cpt_laddrlen = alen;
++	}
++	v->cpt_raddrlen = 0;
++	if (sock) {
++		int alen = sizeof(v->cpt_raddr);
++		int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2);
++		if (!err)
++			v->cpt_raddrlen = alen;
++	}
++
++	if (sk->sk_family == AF_UNIX) {
++		if (unix_sk(sk)->dentry) {
++			struct dentry *d = unix_sk(sk)->dentry;
++			v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d);
++			if (!v->cpt_deleted) {
++				int err = 0;
++				char *path;
++				struct path p;
++				unsigned long pg = __get_free_page(GFP_KERNEL);
++
++				if (!pg) {
++					cpt_release_buf(ctx);
++					return -ENOMEM;
++				}
++
++				p.dentry = d;
++				p.mnt = unix_sk(sk)->mnt;
++				path = d_path(&p, (char *)pg, PAGE_SIZE);
++
++				if (!IS_ERR(path)) {
++					int len = strlen(path);
++					if (len < 126) {
++						strcpy(((char*)v->cpt_laddr)+2, path); 
++						v->cpt_laddrlen = len + 2;
++					} else {
++						wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2);
++					}
++					err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx);
++				} else {
++					eprintk_ctx("cannot get path of an af_unix socket\n");
++					err = PTR_ERR(path);
++				}
++				free_page(pg);
++				if (err) {
++					cpt_release_buf(ctx);
++					return err;
++				}
++			}
++		}
++
++		/* If the socket is connected, find its peer. If the peer is
++		 * not in our table, the socket is connected to an external
++		 * process and we consider it disconnected.
++		 */
++		if (unix_peer(sk)) {
++			cpt_object_t *pobj;
++			pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx);
++			if (pobj)
++				v->cpt_peer = pobj->o_index;
++			else
++				v->cpt_shutdown = SHUTDOWN_MASK;
++
++			if (unix_peer(unix_peer(sk)) == sk)
++				v->cpt_socketpair = 1;
++		}
++
++		/* If the socket shares its address with another socket, it is
++		 * a child of some listening socket. Find and record it. */
++		if (unix_sk(sk)->addr &&
++		    atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
++		    sk->sk_state != TCP_LISTEN) {
++			cpt_object_t *pobj;
++			for_each_object(pobj, CPT_OBJ_SOCKET) {
++				struct sock *psk = pobj->o_obj;
++				if (psk->sk_family == AF_UNIX &&
++				    psk->sk_state == TCP_LISTEN &&
++				    unix_sk(psk)->addr == unix_sk(sk)->addr) {
++					v->cpt_parent = pobj->o_index;
++					break;
++				}
++			}
++		}
++	}
++
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++		cpt_dump_socket_in(v, sk, ctx);
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_dump_sock_attr(sk, ctx);
++
++	dump_rqueue(index, sk, ctx);
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++		dump_wqueue(index, sk, ctx);
++		cpt_dump_ofo_queue(index, sk, ctx);
++	}
++
++	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++	    && sk->sk_state == TCP_LISTEN)
++		cpt_dump_synwait_queue(sk, index, ctx);
++
++	cpt_close_object(ctx);
++
++	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++	    && sk->sk_state == TCP_LISTEN)
++		cpt_dump_accept_queue(sk, index, ctx);
++
++	return 0;
++}
++
++int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
++{
++	int i;
++
++	cpt_open_section(ctx, CPT_SECT_ORPHANS);
++
++	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
++		struct sock *sk;
++		struct hlist_node *node;
++		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
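++		/* cpt_dump_socket() sleeps while writing to the dump file,
++		 * so the chain lock cannot be held across it: take a
++		 * reference, drop the lock, dump, then rescan the chain.
++		 */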
++retry:
++		read_lock_bh(lock);
++		sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
++
++			if (sk->owner_env != get_exec_env())
++				continue;
++			if (sk->sk_socket)
++				continue;
++			if (!sock_flag(sk, SOCK_DEAD))
++				continue;
++			if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
++				continue;
++			sock_hold(sk);
++			read_unlock_bh(lock);
++
++			local_bh_disable();
++			bh_lock_sock(sk);
++			if (sock_owned_by_user(sk))
++				eprintk_ctx("BUG: sk locked by whom?\n");
++			sk->sk_lock.owned = 1;
++			bh_unlock_sock(sk);
++			local_bh_enable();
++
++			cpt_dump_socket(NULL, sk, -1, -1, ctx);
++
++			local_bh_disable();
++			bh_lock_sock(sk);
++			sk->sk_lock.owned = 0;
++			clear_backlog(sk);
++			tcp_done(sk);
++			bh_unlock_sock(sk);
++			local_bh_enable();
++			sock_put(sk);
++
++			goto retry;
++		}
++		read_unlock_bh(lock);
++	}
++	cpt_close_section(ctx);
++	return 0;
++}
++
++static int can_dump(struct sock *sk, cpt_context_t *ctx)
++{
++	switch (sk->sk_family) {
++	case AF_NETLINK:
++		if (((struct netlink_sock *)sk)->cb) {
++			eprintk_ctx("netlink socket has active callback\n");
++			return 0;
++		}
++		break;
++	}
++	return 1;
++}
++
++/* We are not going to block suspend when we have external AF_UNIX
++ * connections. But we cannot stop the feed of new packets/connections
++ * to our environment from outside. Taking into account that it is
++ * intrinsically unreliable, we collect some amount of data, but when
++ * checkpointing/restoring we drop everything that does not make sense:
++ * skbs sent by outside processes, connections from outside etc.
++ */
++
++/* The first pass. When we see socket referenced by a file, we just
++ * add it to socket table */
++int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++	struct socket *sock;
++	struct sock *sk;
++
++	if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
++		return -ENOTSOCK;
++	sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
++	sk = sock->sk;
++	if (!can_dump(sk, ctx))
++		return -EAGAIN;
++	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
++		return -ENOMEM;
++	obj->o_parent = file;
++
++	return 0;
++}
++
++/*
++ * We should end with table containing:
++ *  * all sockets opened by our processes in the table.
++ *  * all the sockets queued in listening queues on _our_ listening sockets,
++ *    which are connected to our opened sockets.
++ */
++
++static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
++{
++	struct sock *sk = obj->o_obj;
++	cpt_object_t *cobj;
++	struct sk_buff *skb;
++
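++	/* The receive queue of a listening AF_UNIX socket holds embryonic
++	 * connections: each queued skb carries the not-yet-accepted child
++	 * socket in skb->sk. Collect the children whose peers are already
++	 * in the table.
++	 */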
++	skb = skb_peek(&sk->sk_receive_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++		struct sock *lsk = skb->sk;
++		if (unix_peer(lsk) &&
++		    lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
++			if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
++				return -ENOMEM;
++			cobj->o_parent = obj->o_parent;
++		}
++		spin_lock_irq(&sk->sk_receive_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_receive_queue.lock);
++	}
++
++	return 0;
++}
++
++int cpt_index_sockets(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++	unsigned long index = 0;
++
++	/* Collect not-yet-accepted children of listening sockets. */
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++
++		if (sk->sk_state != TCP_LISTEN)
++			continue;
++
++		if (sk->sk_family == AF_UNIX)
++			collect_one_unix_listening_sock(obj, ctx);
++	}
++
++	/* Assign indices to all the sockets. */
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++		cpt_obj_setindex(obj, index++, ctx);
++
++		if (sk->sk_socket && sk->sk_socket->file) {
++			cpt_object_t *tobj;
++			tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx);
++			if (tobj)
++				cpt_obj_setindex(tobj, obj->o_index, ctx);
++		}
++	}
++
++	return 0;
++}
++
++void cpt_unlock_sockets(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	lockdep_off();
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++		if (sk && obj->o_lock) {
++			if (sk->sk_socket)
++				release_sock(sk);
++		}
++	}
++	lockdep_on();
++}
++
++void cpt_kill_sockets(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++		if (sk && obj->o_lock) {
++			struct ve_struct *old_env;
++			old_env = set_exec_env(sk->owner_env);
++			cpt_kill_socket(sk, ctx);
++			if (sk->sk_socket)
++				release_sock_nobacklog(sk);
++			set_exec_env(old_env);
++		}
++	}
++}
++
++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx)
++{
++	struct fasync_struct *fa;
++	struct inode *inode = file->f_dentry->d_inode;
++	struct socket *sock;
++
++	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
++
++	for (fa = sock->fasync_list; fa; fa = fa->fa_next) {
++		if (fa->fa_file == file)
++			return fa->fa_fd;
++	}
++	return -1;
++}
+diff --git a/kernel/cpt/cpt_socket.h b/kernel/cpt/cpt_socket.h
+new file mode 100644
+index 0000000..6489184
+--- /dev/null
++++ b/kernel/cpt/cpt_socket.h
+@@ -0,0 +1,33 @@
++struct sock;
++
++int cpt_collect_passedfds(cpt_context_t *);
++int cpt_index_sockets(cpt_context_t *);
++int cpt_collect_socket(struct file *, cpt_context_t *);
++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx);
++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx);
++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx);
++int rst_sockets(struct cpt_context *ctx);
++int rst_sockets_complete(struct cpt_context *ctx);
++int cpt_dump_orphaned_sockets(struct cpt_context *ctx);
++
++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx);
++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx);
++
++void cpt_unlock_sockets(cpt_context_t *);
++void cpt_kill_sockets(cpt_context_t *);
++
++
++int cpt_kill_socket(struct sock *, cpt_context_t *);
++int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*);
++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx);
++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx);
++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *);
++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx);
++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx);
++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx);
++int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx);
++
++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
++		       loff_t pos, cpt_context_t *ctx);
++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
++			loff_t pos, cpt_context_t *ctx);
+diff --git a/kernel/cpt/cpt_socket_in.c b/kernel/cpt/cpt_socket_in.c
+new file mode 100644
+index 0000000..c02d459
+--- /dev/null
++++ b/kernel/cpt/cpt_socket_in.c
+@@ -0,0 +1,450 @@
++/*
++ *
++ *  kernel/cpt/cpt_socket_in.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <net/if_inet6.h>
++#include <linux/igmp.h>
++#include <linux/ipv6.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
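++/*
++ * Timer values are exported as deltas relative to "now", since absolute
++ * jiffies are meaningless on the restoring kernel. For example, a timer
++ * due at jiffies + 3*HZ is stored as 3*HZ and re-armed at the restoring
++ * kernel's jiffies + 3*HZ.
++ */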
++static inline __u32 jiffies_export(unsigned long tmo)
++{
++	__s32 delta = (long)(tmo - jiffies);
++	return delta;
++}
++
++static inline __u32 tcp_jiffies_export(__u32 tmo)
++{
++	__s32 delta = tmo - tcp_time_stamp;
++	return delta;
++}
++
++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++	struct sk_buff *skb;
++	struct tcp_sock *tp;
++
++	if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP)
++		return 0;
++
++	tp = tcp_sk(sk);
++
++	skb = skb_peek(&tp->out_of_order_queue);
++	while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) {
++		int err;
++
++		err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx);
++		if (err)
++			return err;
++
++		spin_lock_irq(&tp->out_of_order_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&tp->out_of_order_queue.lock);
++	}
++	return 0;
++}
++
++static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk,
++			       struct cpt_context *ctx)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++
++	si->cpt_pred_flags = tp->pred_flags;
++	si->cpt_rcv_nxt = tp->rcv_nxt;
++	si->cpt_snd_nxt = tp->snd_nxt;
++	si->cpt_snd_una = tp->snd_una;
++	si->cpt_snd_sml = tp->snd_sml;
++	si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp);
++	si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime);
++	si->cpt_tcp_header_len = tp->tcp_header_len;
++	si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending;
++	si->cpt_quick = inet_csk(sk)->icsk_ack.quick;
++	si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong;
++	si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked;
++	si->cpt_ato = inet_csk(sk)->icsk_ack.ato;
++	si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout);
++	si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime);
++	si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size;
++	si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss;
++	si->cpt_snd_wl1 = tp->snd_wl1;
++	si->cpt_snd_wnd = tp->snd_wnd;
++	si->cpt_max_window = tp->max_window;
++	si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie;
++	si->cpt_mss_cache = tp->mss_cache;
++	si->cpt_mss_cache_std = tp->mss_cache; /* FIXME: was tp->mss_cache_std */
++	si->cpt_mss_clamp = tp->rx_opt.mss_clamp;
++	si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len;
++	si->cpt_ext2_header_len = 0;
++	si->cpt_ca_state = inet_csk(sk)->icsk_ca_state;
++	si->cpt_retransmits = inet_csk(sk)->icsk_retransmits;
++	si->cpt_reordering = tp->reordering;
++	si->cpt_frto_counter = tp->frto_counter;
++	si->cpt_frto_highmark = tp->frto_highmark;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++	// // si->cpt_adv_cong = tp->adv_cong;
++#endif
++	si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept;
++	si->cpt_backoff = inet_csk(sk)->icsk_backoff;
++	si->cpt_srtt = tp->srtt;
++	si->cpt_mdev = tp->mdev;
++	si->cpt_mdev_max = tp->mdev_max;
++	si->cpt_rttvar = tp->rttvar;
++	si->cpt_rtt_seq = tp->rtt_seq;
++	si->cpt_rto = inet_csk(sk)->icsk_rto;
++	si->cpt_packets_out = tp->packets_out;
++	si->cpt_left_out = tp->sacked_out + tp->lost_out;
++	si->cpt_retrans_out = tp->retrans_out;
++	si->cpt_lost_out = tp->lost_out;
++	si->cpt_sacked_out = tp->sacked_out;
++	si->cpt_fackets_out = tp->fackets_out;
++	si->cpt_snd_ssthresh = tp->snd_ssthresh;
++	si->cpt_snd_cwnd = tp->snd_cwnd;
++	si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt;
++	si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp;
++	si->cpt_snd_cwnd_used = tp->snd_cwnd_used;
++	si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp);
++	si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout);
++	si->cpt_ka_timeout = 0;
++	si->cpt_rcv_wnd = tp->rcv_wnd;
++	si->cpt_rcv_wup = tp->rcv_wup;
++	si->cpt_write_seq = tp->write_seq;
++	si->cpt_pushed_seq = tp->pushed_seq;
++	si->cpt_copied_seq = tp->copied_seq;
++	si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok;
++	si->cpt_wscale_ok = tp->rx_opt.wscale_ok;
++	si->cpt_sack_ok = tp->rx_opt.sack_ok;
++	si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp;
++	si->cpt_snd_wscale = tp->rx_opt.snd_wscale;
++	si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale;
++	si->cpt_nonagle = tp->nonagle;
++	si->cpt_keepalive_probes = tp->keepalive_probes;
++	si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval;
++	si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr;
++	si->cpt_ts_recent = tp->rx_opt.ts_recent;
++	si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
++	si->cpt_user_mss = tp->rx_opt.user_mss;
++	si->cpt_dsack = tp->rx_opt.dsack;
++	si->cpt_eff_sacks = tp->rx_opt.eff_sacks;
++	si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq;
++	si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq;
++	si->cpt_sack_array[2] = tp->selective_acks[0].start_seq;
++	si->cpt_sack_array[3] = tp->selective_acks[0].end_seq;
++	si->cpt_sack_array[4] = tp->selective_acks[1].start_seq;
++	si->cpt_sack_array[5] = tp->selective_acks[1].end_seq;
++	si->cpt_sack_array[6] = tp->selective_acks[2].start_seq;
++	si->cpt_sack_array[7] = tp->selective_acks[2].end_seq;
++	si->cpt_sack_array[8] = tp->selective_acks[3].start_seq;
++	si->cpt_sack_array[9] = tp->selective_acks[3].end_seq;
++	si->cpt_window_clamp = tp->window_clamp;
++	si->cpt_rcv_ssthresh = tp->rcv_ssthresh;
++	si->cpt_probes_out = inet_csk(sk)->icsk_probes_out;
++	si->cpt_num_sacks = tp->rx_opt.num_sacks;
++	si->cpt_advmss = tp->advmss;
++	si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries;
++	si->cpt_ecn_flags = tp->ecn_flags;
++	si->cpt_prior_ssthresh = tp->prior_ssthresh;
++	si->cpt_high_seq = tp->high_seq;
++	si->cpt_retrans_stamp = tp->retrans_stamp;
++	si->cpt_undo_marker = tp->undo_marker;
++	si->cpt_undo_retrans = tp->undo_retrans;
++	si->cpt_urg_seq = tp->urg_seq;
++	si->cpt_urg_data = tp->urg_data;
++	si->cpt_pending = inet_csk(sk)->icsk_pending;
++	si->cpt_urg_mode = tp->urg_mode;
++	si->cpt_snd_up = tp->snd_up;
++	si->cpt_keepalive_time = tp->keepalive_time;
++	si->cpt_keepalive_intvl = tp->keepalive_intvl;
++	si->cpt_linger2 = tp->linger2;
++
++	if (sk->sk_state != TCP_LISTEN &&
++	    sk->sk_state != TCP_CLOSE &&
++	    sock_flag(sk, SOCK_KEEPOPEN)) {
++		si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires);
++	}
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++	{
++		extern struct inet_connection_sock_af_ops ipv6_mapped;
++		if (sk->sk_family == AF_INET6 &&
++			inet_csk(sk)->icsk_af_ops == &ipv6_mapped)
++			si->cpt_mapped = 1;
++	}
++#endif
++
++	return 0;
++}
++
++
++int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk,
++		       struct cpt_context *ctx)
++{
++	struct inet_sock *inet = inet_sk(sk);
++	struct ipv6_pinfo *np = inet6_sk(sk);
++
++	if (sk->sk_family == AF_INET) {
++		struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr);
++		sin->sin_family = AF_INET;
++		sin->sin_port = inet->sport;
++		sin->sin_addr.s_addr = inet->rcv_saddr;
++		si->cpt_laddrlen = sizeof(*sin);
++	} else if (sk->sk_family == AF_INET6) {
++		struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr);
++		sin6->sin6_family = AF_INET6;
++		sin6->sin6_port = inet->sport;
++		memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16);
++		si->cpt_laddrlen = sizeof(*sin6);
++	}
++	if (!inet->num)
++		si->cpt_laddrlen = 0;
++
++	si->cpt_daddr = inet->daddr;
++	si->cpt_dport = inet->dport;
++	si->cpt_saddr = inet->saddr;
++	si->cpt_rcv_saddr = inet->rcv_saddr;
++	si->cpt_sport = inet->sport;
++	si->cpt_uc_ttl = inet->uc_ttl;
++	si->cpt_tos = inet->tos;
++	si->cpt_cmsg_flags = inet->cmsg_flags;
++	si->cpt_mc_index = inet->mc_index;
++	si->cpt_mc_addr = inet->mc_addr;
++	si->cpt_hdrincl = inet->hdrincl;
++	si->cpt_mc_ttl = inet->mc_ttl;
++	si->cpt_mc_loop = inet->mc_loop;
++	si->cpt_pmtudisc = inet->pmtudisc;
++	si->cpt_recverr = inet->recverr;
++	si->cpt_freebind = inet->freebind;
++	si->cpt_idcounter = inet->id;
++
++	si->cpt_cork_flags = inet->cork.flags;
++	si->cpt_cork_fragsize = 0;
++	si->cpt_cork_length = inet->cork.length;
++	si->cpt_cork_addr = inet->cork.addr;
++	si->cpt_cork_saddr = inet->cork.fl.fl4_src;
++	si->cpt_cork_daddr = inet->cork.fl.fl4_dst;
++	si->cpt_cork_oif = inet->cork.fl.oif;
++	if (inet->cork.dst) {
++		struct rtable *rt = (struct rtable *)inet->cork.dst;
++		si->cpt_cork_fragsize = inet->cork.fragsize;
++		si->cpt_cork_saddr = rt->fl.fl4_src;
++		si->cpt_cork_daddr = rt->fl.fl4_dst;
++		si->cpt_cork_oif = rt->fl.oif;
++	}
++
++	if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
++		struct udp_sock *up = udp_sk(sk);
++		si->cpt_udp_pending  = up->pending;
++		si->cpt_udp_corkflag  = up->corkflag;
++		si->cpt_udp_encap  = up->encap_type;
++		si->cpt_udp_len  = up->len;
++	}
++
++	if (sk->sk_family == AF_INET6) {
++		memcpy(si->cpt_saddr6, &np->saddr, 16);
++		memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16);
++		memcpy(si->cpt_daddr6, &np->daddr, 16);
++		si->cpt_flow_label6 = np->flow_label;
++		si->cpt_frag_size6 = np->frag_size;
++		si->cpt_hop_limit6 = np->hop_limit;
++		si->cpt_mcast_hops6 = np->mcast_hops;
++		si->cpt_mcast_oif6 = np->mcast_oif;
++		si->cpt_rxopt6 = np->rxopt.all;
++		si->cpt_mc_loop6 = np->mc_loop;
++		si->cpt_recverr6 = np->recverr;
++		si->cpt_sndflow6 = np->sndflow;
++		si->cpt_pmtudisc6 = np->pmtudisc;
++		si->cpt_ipv6only6 = np->ipv6only;
++		si->cpt_mapped = 0;
++	}
++
++	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++		cpt_dump_socket_tcp(si, sk, ctx);
++
++	return 0;
++}
++
++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx)
++{
++	struct request_sock *req;
++
++	for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next)
++		cpt_dump_socket(NULL, req->sk, -1, index, ctx);
++	return 0;
++}
++
++
++static int dump_openreq(struct request_sock *req, struct sock *sk, int index,
++			struct cpt_context *ctx)
++{
++	struct cpt_openreq_image *v = cpt_get_buf(ctx);
++
++	cpt_open_object(NULL, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_OPENREQ;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn;
++	v->cpt_snt_isn = tcp_rsk(req)->snt_isn;
++	v->cpt_rmt_port = inet_rsk(req)->rmt_port;
++	v->cpt_mss = req->mss;
++	// // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6);
++	v->cpt_family = req->rsk_ops->family;
++	v->cpt_retrans = req->retrans;
++	v->cpt_snd_wscale = inet_rsk(req)->snd_wscale;
++	v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale;
++	v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok;
++	v->cpt_sack_ok = inet_rsk(req)->sack_ok;
++	v->cpt_wscale_ok = inet_rsk(req)->wscale_ok;
++	v->cpt_ecn_ok = inet_rsk(req)->ecn_ok;
++	v->cpt_acked = inet_rsk(req)->acked;
++	v->cpt_window_clamp = req->window_clamp;
++	v->cpt_rcv_wnd = req->rcv_wnd;
++	v->cpt_ts_recent = req->ts_recent;
++	v->cpt_expires = jiffies_export(req->expires);
++
++	if (v->cpt_family == AF_INET) {
++		memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4);
++		memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4);
++	} else {
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++		memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16);
++		memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16);
++		v->cpt_iif = inet6_rsk(req)->iif;
++#endif
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx)
++{
++	struct inet_connection_sock *icsk;
++	struct listen_sock *lopt;
++	struct request_sock *req;
++	int nr_entries;
++	int i;
++
++	icsk = inet_csk(sk);
++	lopt = icsk->icsk_accept_queue.listen_opt;
++	nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries;
++
++	for (i=0; i < nr_entries; i++) {
++		for (req=lopt->syn_table[i]; req; req=req->dl_next) {
++			loff_t saved_obj;
++			cpt_push_object(&saved_obj, ctx);
++			dump_openreq(req, sk, index, ctx);
++			cpt_pop_object(&saved_obj, ctx);
++		}
++	}
++	return 0;
++}
++
++
++int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx)
++{
++	if (sk->sk_state != TCP_CLOSE &&
++	    (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
++	    sk->sk_protocol == IPPROTO_TCP) {
++		if (sk->sk_state != TCP_LISTEN)
++			tcp_set_state(sk, TCP_CLOSE);
++		else
++			sk->sk_prot->disconnect(sk, 0);
++	}
++	return 0;
++}
++
++int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx)
++{
++	struct inet_sock *inet = inet_sk(sk);
++	struct ip_mc_socklist *iml;
++
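++	/* Each membership is written as a cpt_sockmc_image header followed
++	 * by the source filter list; every source address is padded to
++	 * 16 bytes, so IPv4 and IPv6 records share one layout.
++	 */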
++	for (iml = inet->mc_list; iml; iml = iml->next) {
++		struct cpt_sockmc_image smi;
++		int scnt = 0;
++		int i;
++
++		if (iml->sflist)
++			scnt = iml->sflist->sl_count*16;
++
++		smi.cpt_next = sizeof(smi) + scnt;
++		smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
++		smi.cpt_hdrlen = sizeof(smi);
++		smi.cpt_content = CPT_CONTENT_DATA;
++
++		smi.cpt_family = AF_INET;
++		smi.cpt_mode = iml->sfmode;
++		smi.cpt_ifindex = iml->multi.imr_ifindex;
++		memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
++		smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
++
++		ctx->write(&smi, sizeof(smi), ctx);
++
++		/* scnt is in bytes; each exported source takes 16 bytes */
++		for (i = 0; i < scnt / 16; i++) {
++			u32 addr[4];
++			memset(&addr, 0, sizeof(addr));
++			addr[0] = iml->sflist->sl_addr[i];
++			ctx->write(&addr, sizeof(addr), ctx);
++		}
++	}
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	if (sk->sk_family == AF_INET6) {
++		struct ipv6_mc_socklist *mcl;
++		struct ipv6_pinfo *np = inet6_sk(sk);
++
++		for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
++			struct cpt_sockmc_image smi;
++			int scnt = 0;
++			int i;
++
++			if (mcl->sflist)
++				scnt = mcl->sflist->sl_count*16;
++
++			smi.cpt_next = sizeof(smi) + scnt;
++			smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
++			smi.cpt_hdrlen = sizeof(smi);
++			smi.cpt_content = CPT_CONTENT_DATA;
++
++			smi.cpt_family = AF_INET6;
++			smi.cpt_mode = mcl->sfmode;
++			smi.cpt_ifindex = mcl->ifindex;
++			memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
++
++			ctx->write(&smi, sizeof(smi), ctx);
++			/* scnt is in bytes; each source address is 16 bytes */
++			for (i = 0; i < scnt / 16; i++)
++				ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
++		}
++	}
++#endif
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_syscalls.h b/kernel/cpt/cpt_syscalls.h
+new file mode 100644
+index 0000000..ba69cb5
+--- /dev/null
++++ b/kernel/cpt/cpt_syscalls.h
+@@ -0,0 +1,101 @@
++#include <linux/unistd.h>
++#include <linux/syscalls.h>
++#include <linux/fs.h>
++#include <asm/uaccess.h>
++
++#define WRAP(c, args) return sys_##c args
++#define WRAP2(c, args) int err; mm_segment_t oldfs; \
++	               oldfs = get_fs(); set_fs(KERNEL_DS); \
++                       err = sys_##c args ;\
++                       set_fs(oldfs); \
++                       return err
++
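++/*
++ * Usage sketch: WRAP() forwards to the syscall directly, while WRAP2()
++ * brackets the call with set_fs(KERNEL_DS) so that pathnames residing
++ * in kernel memory pass the user-access checks, e.g. (path made up for
++ * illustration):
++ *
++ *	err = sc_mkdir("/tmp/rst", 0700);
++ *	if (err && err != -EEXIST)
++ *		return err;
++ */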
++static inline int sc_close(int fd)
++{
++	WRAP(close, (fd));
++}
++
++static inline int sc_dup2(int fd1, int fd2)
++{
++	WRAP(dup2, (fd1, fd2));
++}
++
++static inline int sc_unlink(char *name)
++{
++	WRAP2(unlink, (name));
++}
++
++static inline int sc_pipe(int *pfd)
++{
++	return do_pipe(pfd);
++}
++
++static inline int sc_mknod(char *name, int mode, int dev)
++{
++	WRAP2(mknod, (name, mode, dev));
++}
++
++static inline int sc_chmod(char *name, int mode)
++{
++	WRAP2(chmod, (name, mode));
++}
++
++static inline int sc_chown(char *name, int uid, int gid)
++{
++	WRAP2(chown, (name, uid, gid));
++}
++
++static inline int sc_mkdir(char *name, int mode)
++{
++	WRAP2(mkdir, (name, mode));
++}
++
++static inline int sc_rmdir(char *name)
++{
++	WRAP2(rmdir, (name));
++}
++
++static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
++{
++	WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
++}
++
++static inline int sc_mprotect(unsigned long start, size_t len,
++			      unsigned long prot)
++{
++	WRAP(mprotect, (start, len, prot));
++}
++
++static inline int sc_mlock(unsigned long start, size_t len)
++{
++	WRAP(mlock, (start, len));
++}
++
++static inline int sc_munlock(unsigned long start, size_t len)
++{
++	WRAP(munlock, (start, len));
++}
++
++static inline int sc_remap_file_pages(unsigned long start, size_t len,
++				      unsigned long prot, unsigned long pgoff,
++				      unsigned long flags)
++{
++	WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
++}
++
++static inline int sc_waitx(int pid, int opt, int *stat_addr)
++{
++	WRAP(wait4, (pid, stat_addr, opt, NULL));
++}
++
++static inline int sc_flock(int fd, int flags)
++{
++	WRAP(flock, (fd, flags));
++}
++
++static inline int sc_open(char* path, int flags, int mode)
++{
++	WRAP(open, (path, flags, mode));
++}
++
++extern int sc_execve(char *cms, char **argv, char **env);
+diff --git a/kernel/cpt/cpt_sysvipc.c b/kernel/cpt/cpt_sysvipc.c
+new file mode 100644
+index 0000000..8117307
+--- /dev/null
++++ b/kernel/cpt/cpt_sysvipc.c
+@@ -0,0 +1,403 @@
++/*
++ *
++ *  kernel/cpt/cpt_sysvipc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/shm.h>
++#include <linux/sem.h>
++#include <linux/msg.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++
++struct _warg {
++		struct file			*file;
++		struct cpt_sysvshm_image	*v;
++};
++
++static int dump_one_shm(struct shmid_kernel *shp, void *arg)
++{
++	struct _warg *warg = arg;
++	struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
++
++	if (shp->shm_file != warg->file)
++		return 0;
++
++	v->cpt_key = shp->shm_perm.key;
++	v->cpt_uid = shp->shm_perm.uid;
++	v->cpt_gid = shp->shm_perm.gid;
++	v->cpt_cuid = shp->shm_perm.cuid;
++	v->cpt_cgid = shp->shm_perm.cgid;
++	v->cpt_mode = shp->shm_perm.mode;
++	v->cpt_seq = shp->shm_perm.seq;
++
++	v->cpt_id = shp->shm_perm.id;
++	v->cpt_segsz = shp->shm_segsz;
++	v->cpt_atime = shp->shm_atim;
++	v->cpt_ctime = shp->shm_ctim;
++	v->cpt_dtime = shp->shm_dtim;
++	v->cpt_creator = shp->shm_cprid;
++	v->cpt_last = shp->shm_lprid;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++	v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
++#else
++	v->cpt_mlockuser = -1;
++#endif
++	return 1;
++}
++
++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
++{
++	struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
++	struct _warg warg;
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSV_SHM;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	warg.file = file;
++	warg.v = v;
++	if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
++		cpt_release_buf(ctx);
++		return -ESRCH;
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++
++int match_sem(int id, struct sem_array *sema, void *arg)
++{
++	if (id != (unsigned long)arg)
++		return 0;
++	return sema->sem_nsems + 1;
++}
++
++static int get_sem_nsem(int id, cpt_context_t *ctx)
++{
++	int res;
++	res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
++	if (res > 0)
++		return res - 1;
++	eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
++	return -ESRCH;
++}
++
++static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
++{
++	struct cpt_sysvsem_undo_image v;
++	loff_t saved_obj;
++
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_SEMUNDO;
++	v.cpt_id = su->semid;
++	v.cpt_nsem = get_sem_nsem(su->semid, ctx);
++	if ((int)v.cpt_nsem < 0)
++		return -ESRCH;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++struct sem_warg {
++	int				last_id;
++	struct cpt_sysvsem_image	*v;
++};
++
++static int dump_one_sem(int id, struct sem_array *sma, void *arg)
++{
++	struct sem_warg * warg = (struct sem_warg *)arg;
++	struct cpt_sysvsem_image *v = warg->v;
++	int i;
++
++	if (warg->last_id != -1) {
++		if ((id % IPCMNI) <= warg->last_id)
++			return 0;
++	}
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSV_SEM;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_SEMARRAY;
++
++	v->cpt_key = sma->sem_perm.key;
++	v->cpt_uid = sma->sem_perm.uid;
++	v->cpt_gid = sma->sem_perm.gid;
++	v->cpt_cuid = sma->sem_perm.cuid;
++	v->cpt_cgid = sma->sem_perm.cgid;
++	v->cpt_mode = sma->sem_perm.mode;
++	v->cpt_seq = sma->sem_perm.seq;
++
++	v->cpt_id = id;
++	v->cpt_ctime = sma->sem_ctime;
++	v->cpt_otime = sma->sem_otime;
++
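++	/* The {semval, sempid} pairs are appended right after the header,
++	 * and cpt_next grows to cover the whole variable-length record:
++	 *
++	 *	[cpt_sysvsem_image][semval sempid][semval sempid]...
++	 */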
++	for (i=0; i<sma->sem_nsems; i++) {
++		struct {
++			__u32 semval;
++			__u32 sempid;
++		} *s = (void*)v + v->cpt_next;
++		if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
++			return -EINVAL;
++		s->semval = sma->sem_base[i].semval;
++		s->sempid = sma->sem_base[i].sempid;
++		v->cpt_next += sizeof(*s);
++	}
++
++	warg->last_id = id % IPCMNI;
++	return 1;
++}
++
++
++int cpt_dump_sysvsem(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	struct sem_warg warg;
++
++	/* Dumping semaphores is quite tricky because we cannot write to
++	 * the dump file under the lock taken inside sysvipc_walk_sem().
++	 */
++	cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
++	warg.last_id = -1;
++	warg.v = cpt_get_buf(ctx);
++	for (;;) {
++		if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
++			break;
++		ctx->write(warg.v, warg.v->cpt_next, ctx);
++	}
++	cpt_release_buf(ctx);
++	cpt_close_section(ctx);
++
++	cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
++	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
++		struct sem_undo_list *semu = obj->o_obj;
++		struct sem_undo *su;
++		struct cpt_object_hdr v;
++		loff_t saved_obj;
++
++		cpt_open_object(obj, ctx);
++
++		v.cpt_next = CPT_NULL;
++		v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
++		v.cpt_hdrlen = sizeof(v);
++		v.cpt_content = CPT_CONTENT_ARRAY;
++
++		ctx->write(&v, sizeof(v), ctx);
++
++		cpt_push_object(&saved_obj, ctx);
++		for (su = semu->proc_list; su; su = su->proc_next) {
++			if (su->semid != -1) {
++				int err;
++				err = dump_one_semundo(su, ctx);
++				if (err < 0)
++					return err;
++			}
++		}
++		cpt_pop_object(&saved_obj, ctx);
++
++		cpt_close_object(ctx);
++	}
++	cpt_close_section(ctx);
++	return 0;
++}
++
++struct msg_warg {
++	int				last_id;
++	struct msg_queue		*msq;
++	struct cpt_sysvmsg_image	*v;
++};
++
++static int dump_one_msg(int id, struct msg_queue *msq, void *arg)
++{
++	struct msg_warg * warg = (struct msg_warg *)arg;
++	struct cpt_sysvmsg_image *v = warg->v;
++
++	if (warg->last_id != -1) {
++		if ((id % IPCMNI) <= warg->last_id)
++			return 0;
++	}
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSVMSG;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_key = msq->q_perm.key;
++	v->cpt_uid = msq->q_perm.uid;
++	v->cpt_gid = msq->q_perm.gid;
++	v->cpt_cuid = msq->q_perm.cuid;
++	v->cpt_cgid = msq->q_perm.cgid;
++	v->cpt_mode = msq->q_perm.mode;
++	v->cpt_seq = msq->q_perm.seq;
++
++	v->cpt_id = id;
++	v->cpt_stime = msq->q_stime;
++	v->cpt_rtime = msq->q_rtime;
++	v->cpt_ctime = msq->q_ctime;
++	v->cpt_last_sender = msq->q_lspid;
++	v->cpt_last_receiver = msq->q_lrpid;
++	v->cpt_qbytes = msq->q_qbytes;
++
++	warg->msq = msq;
++	warg->last_id = id % IPCMNI;
++	return 1;
++}
++
++static int do_store(void * src, int len, int offset, void * data)
++{
++	cpt_context_t * ctx = data;
++	ctx->write(src, len, ctx);
++	return 0;
++}
++
++static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx)
++{
++	loff_t saved_obj;
++	struct cpt_sysvmsg_msg_image mv;
++			
++	cpt_open_object(NULL, ctx);
++	mv.cpt_next = CPT_NULL;
++	mv.cpt_object = CPT_OBJ_SYSVMSG_MSG;
++	mv.cpt_hdrlen = sizeof(mv);
++	mv.cpt_content = CPT_CONTENT_DATA;
++
++	mv.cpt_type = m->m_type;
++	mv.cpt_size = m->m_ts;
++
++	ctx->write(&mv, sizeof(mv), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	sysv_msg_store(m, do_store, m->m_ts, ctx);
++	cpt_pop_object(&saved_obj, ctx);
++	cpt_close_object(ctx);
++}
++
++int cpt_dump_sysvmsg(struct cpt_context *ctx)
++{
++	struct msg_warg warg;
++
++	/* Dumping msg queues is tricky because we cannot write to the
++	 * dump file under the lock taken inside sysvipc_walk_msg().
++	 *
++	 * Even worse, we have to access the msg list in an unserialized
++	 * context. It is fragile. But the VE is still frozen, remember?
++	 */
++	cpt_open_section(ctx, CPT_SECT_SYSV_MSG);
++	warg.last_id = -1;
++	warg.v = cpt_get_buf(ctx);
++	for (;;) {
++		loff_t saved_obj;
++		struct msg_msg * m;
++
++		if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0)
++			break;
++
++		cpt_open_object(NULL, ctx);
++
++		ctx->write(warg.v, warg.v->cpt_next, ctx);
++
++		cpt_push_object(&saved_obj, ctx);
++		list_for_each_entry(m, &warg.msq->q_messages, m_list) {
++			cpt_dump_one_sysvmsg(m, ctx);
++		}
++		cpt_pop_object(&saved_obj, ctx);
++
++		cpt_close_object(ctx);
++	}
++	cpt_release_buf(ctx);
++	cpt_close_section(ctx);
++	return 0;
++}
++
++static int cpt_collect_sysvsem_undo(cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		if (tsk->exit_state) {
++			/* ipc/sem.c forgets to clear tsk->sysvsem.undo_list
++			 * on exit. Grrr... */
++			continue;
++		}
++		if (tsk->sysvsem.undo_list &&
++		    cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL)
++			return -ENOMEM;
++	}
++
++	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
++		struct sem_undo_list *semu = obj->o_obj;
++
++		if (atomic_read(&semu->refcnt) != obj->o_count) {
++			eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt));
++			return -EBUSY;
++		}
++	}
++	return 0;
++}
++
++static int collect_one_shm(struct shmid_kernel *shp, void *arg)
++{
++	cpt_context_t *ctx = arg;
++
++	if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL)
++		return -ENOMEM;
++	return 0;
++}
++
++int cpt_collect_sysvshm(cpt_context_t * ctx)
++{
++	int err;
++
++	err = sysvipc_walk_shm(collect_one_shm, ctx);
++
++	return err < 0 ? err : 0;
++}
++
++int cpt_collect_sysv(cpt_context_t * ctx)
++{
++	int err;
++
++	err = cpt_collect_sysvsem_undo(ctx);
++	if (err)
++		return err;
++	err = cpt_collect_sysvshm(ctx);
++	if (err)
++		return err;
++
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_tty.c b/kernel/cpt/cpt_tty.c
+new file mode 100644
+index 0000000..8ac9417
+--- /dev/null
++++ b/kernel/cpt/cpt_tty.c
+@@ -0,0 +1,215 @@
++/*
++ *
++ *  kernel/cpt/cpt_tty.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/tty.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++/* We must support at least N_TTY. */
++
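++/*
++ * The content of a tty file is dumped as a single CPT_OBJ_REF pointing
++ * at the tty object, which cpt_dump_tty() writes elsewhere; this keeps
++ * one image per tty no matter how many files reference it.
++ */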
++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx)
++{
++	struct tty_struct *tty = file->private_data;
++	cpt_object_t *obj;
++	struct cpt_obj_ref o;
++	loff_t saved_pos;
++
++	obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx);
++	if (!obj)
++		return -EINVAL;
++
++	cpt_push_object(&saved_pos, ctx);
++
++	o.cpt_next = sizeof(o);
++	o.cpt_object = CPT_OBJ_REF;
++	o.cpt_hdrlen = sizeof(o);
++	o.cpt_content = CPT_CONTENT_VOID;
++	o.cpt_pos = obj->o_pos;
++	ctx->write(&o, sizeof(o), ctx);
++
++	cpt_pop_object(&saved_pos, ctx);
++
++	return 0;
++}
++
++int cpt_collect_tty(struct file *file, cpt_context_t * ctx)
++{
++	struct tty_struct *tty = file->private_data;
++
++	if (tty) {
++		if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL)
++			return -ENOMEM;
++		if (tty->link) {
++			cpt_object_t *obj;
++
++			obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx);
++			if (obj == NULL)
++				return -ENOMEM;
++			/* Undo o_count, tty->link is not a reference */
++			obj->o_count--;
++		}
++	}
++	return 0;
++}
++
++int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct tty_struct *tty = obj->o_obj;
++	struct cpt_tty_image *v;
++
++	if (tty->link) {
++		if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) {
++			eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE);
++			return -EINVAL;
++		}
++		if (tty->link->link != tty) {
++			eprintk_ctx("bad pty pair\n");
++			return -EINVAL;
++		}
++		if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
++		    tty->driver->subtype == PTY_TYPE_SLAVE &&
++		    tty->link->count)
++			obj->o_count++;
++	}
++	if (obj->o_count != tty->count) {
++		eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count);
++		return -EBUSY;
++	}
++
++	cpt_open_object(obj, ctx);
++
++	v = cpt_get_buf(ctx);
++	v->cpt_next = -1;
++	v->cpt_object = CPT_OBJ_TTY;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_index = tty->index;
++	v->cpt_link = -1;
++	if (tty->link)
++		v->cpt_link = tty->link->index;
++	v->cpt_drv_type = tty->driver->type;
++	v->cpt_drv_subtype = tty->driver->subtype;
++	v->cpt_drv_flags = tty->driver->flags;
++	v->cpt_packet = tty->packet;
++	v->cpt_stopped = tty->stopped;
++	v->cpt_hw_stopped = tty->hw_stopped;
++	v->cpt_flow_stopped = tty->flow_stopped;
++	v->cpt_flags = tty->flags;
++	v->cpt_ctrl_status = tty->ctrl_status;
++	v->cpt_canon_data = tty->canon_data;
++	v->cpt_canon_head = tty->canon_head - tty->read_tail;
++	v->cpt_canon_column = tty->canon_column;
++	v->cpt_column = tty->column;
++	v->cpt_erasing = tty->erasing;
++	v->cpt_lnext = tty->lnext;
++	v->cpt_icanon = tty->icanon;
++	v->cpt_raw = tty->raw;
++	v->cpt_real_raw = tty->real_raw;
++	v->cpt_closing = tty->closing;
++	v->cpt_minimum_to_wake = tty->minimum_to_wake;
++	v->cpt_pgrp = 0;
++	if (tty->pgrp) {
++		v->cpt_pgrp = pid_vnr(tty->pgrp);
++		if ((int)v->cpt_pgrp < 0) {
++			dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp);
++			v->cpt_pgrp = -1;
++		}
++	}
++	v->cpt_session = 0;
++	if (tty->session) {
++		v->cpt_session = pid_vnr(tty->session);
++		if ((int)v->cpt_session < 0) {
++			eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session);
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	memcpy(v->cpt_name, tty->name, 64);
++	v->cpt_ws_row = tty->winsize.ws_row;
++	v->cpt_ws_col = tty->winsize.ws_col;
++	v->cpt_ws_prow = tty->winsize.ws_ypixel;
++	v->cpt_ws_pcol = tty->winsize.ws_xpixel;
++	if (tty->termios == NULL) {
++		eprintk_ctx("NULL termios");
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++	v->cpt_c_line = tty->termios->c_line;
++	v->cpt_c_iflag = tty->termios->c_iflag;
++	v->cpt_c_oflag = tty->termios->c_oflag;
++	v->cpt_c_cflag = tty->termios->c_cflag;
++	v->cpt_c_lflag = tty->termios->c_lflag;
++	memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS);
++	if (NCCS < 32)
++		memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS);
++	memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags));
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	if (tty->read_buf && tty->read_cnt) {
++		struct cpt_obj_bits *v = cpt_get_buf(ctx);
++		loff_t saved_pos;
++
++		cpt_push_object(&saved_pos, ctx);
++		cpt_open_object(NULL, ctx);
++		v->cpt_next = CPT_NULL;
++		v->cpt_object = CPT_OBJ_BITS;
++		v->cpt_hdrlen = sizeof(*v);
++		v->cpt_content = CPT_CONTENT_DATA;
++		v->cpt_size = tty->read_cnt;
++		ctx->write(v, sizeof(*v), ctx);
++		cpt_release_buf(ctx);
++
++		if (tty->read_cnt) {
++			int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail);
++			ctx->write(tty->read_buf + tty->read_tail, n, ctx);
++			if (tty->read_cnt > n)
++				ctx->write(tty->read_buf, tty->read_cnt-n, ctx);
++			ctx->align(ctx);
++		}
++
++		cpt_close_object(ctx);
++		cpt_pop_object(&saved_pos, ctx);
++	}
++
++	cpt_close_object(ctx);
++
++	return 0;
++}
++
++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx)
++{
++	struct tty_struct * tty;
++	struct fasync_struct *fa;
++
++	tty = (struct tty_struct *)file->private_data;
++
++	for (fa = tty->fasync; fa; fa = fa->fa_next) {
++		if (fa->fa_file == file)
++			return fa->fa_fd;
++	}
++	return -1;
++}
+diff --git a/kernel/cpt/cpt_ubc.c b/kernel/cpt/cpt_ubc.c
+new file mode 100644
+index 0000000..fc27e74
+--- /dev/null
++++ b/kernel/cpt/cpt_ubc.c
+@@ -0,0 +1,132 @@
++/*
++ *
++ *  kernel/cpt/cpt_ubc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/types.h>
++#include <bc/beancounter.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx);
++	if (obj != NULL) {
++		if (obj->o_count == 1)
++			get_beancounter(bc);
++		if (bc->parent != NULL && obj->o_parent == NULL)
++			obj->o_parent = cpt_add_ubc(bc->parent, ctx);
++	}
++	return obj;
++}
++
++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx);
++	if (obj == NULL) {
++		char buf[48];
++		print_ub_uid(bc, buf, sizeof(buf));
++		eprintk("CPT: unknown ub %s (%p)\n", buf, bc);
++		dump_stack();
++		return CPT_NULL;
++	}
++	return obj->o_pos;
++}
++
++static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
++		int held)
++{
++	dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL);
++	dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL);
++	dmp->held = (held ? prm->held : CPT_NULL);
++	dmp->maxheld = prm->maxheld;
++	dmp->minheld = prm->minheld;
++	dmp->failcnt = prm->failcnt;
++}
++
++static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct user_beancounter *bc;
++	struct cpt_beancounter_image *v;
++	int i;
++
++	bc = obj->o_obj;
++	v = cpt_get_buf(ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_UBC;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	if (obj->o_parent != NULL)
++		v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos;
++	else
++		v->cpt_parent = CPT_NULL;
++	v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0;
++	for (i = 0; i < UB_RESOURCES; i++) {
++		dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
++		dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1);
++	}
++	memset(v->cpt_parms + UB_RESOURCES * 2, 0,
++			sizeof(v->cpt_parms)
++				- UB_RESOURCES * 2 * sizeof(v->cpt_parms[0]));
++
++	cpt_open_object(obj, ctx);
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_close_object(ctx);
++
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++int cpt_dump_ubc(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	int skipped;
++	int top;
++
++	cpt_open_section(ctx, CPT_SECT_UBC);
++
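++	/* Beancounters form a tree and cpt_parent records the parent's
++	 * position in the dump file, so a child may only be written after
++	 * its parent. Passes repeat until nothing had to be skipped, which
++	 * yields a topological order.
++	 */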
++	do {
++		skipped = 0;
++		top = 0;
++		for_each_object(obj, CPT_OBJ_UBC) {
++			if (obj->o_parent == NULL)
++				top++;
++			if (obj->o_pos != CPT_NULL)
++				continue;
++			if (obj->o_parent != NULL &&
++			    ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
++				skipped++;
++			else
++				dump_one_bc(obj, ctx);
++		}
++	} while (skipped && (top < 2));
++
++	cpt_close_section(ctx);
++	if (top > 1) {
++		eprintk_ctx("More than one top level ub exist");
++		return -EINVAL;
++	}
++		
++	return 0;
++}
++
++void cpt_finish_ubc(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_UBC)
++		put_beancounter(obj->o_obj);
++}
+diff --git a/kernel/cpt/cpt_ubc.h b/kernel/cpt/cpt_ubc.h
+new file mode 100644
+index 0000000..645ba79
+--- /dev/null
++++ b/kernel/cpt/cpt_ubc.h
+@@ -0,0 +1,23 @@
++#ifdef CONFIG_BEANCOUNTERS
++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
++int cpt_dump_ubc(struct cpt_context *ctx);
++
++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
++int rst_undump_ubc(struct cpt_context *ctx);
++
++void cpt_finish_ubc(struct cpt_context *ctx);
++void rst_finish_ubc(struct cpt_context *ctx);
++void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id);
++void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id);
++#else
++static inline int cpt_dump_ubc(struct cpt_context *ctx)
++{ return 0; }
++static inline int rst_undump_ubc(struct cpt_context *ctx)
++{ return 0; }
++static inline void cpt_finish_ubc(struct cpt_context *ctx)
++{ return; }
++static inline void rst_finish_ubc(struct cpt_context *ctx)
++{ return; }
++#endif
++
+diff --git a/kernel/cpt/cpt_x8664.S b/kernel/cpt/cpt_x8664.S
+new file mode 100644
+index 0000000..0d5e361
+--- /dev/null
++++ b/kernel/cpt/cpt_x8664.S
+@@ -0,0 +1,67 @@
++#define ASSEMBLY 1
++
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/cache.h>
++#include <asm/errno.h>
++#include <asm/dwarf2.h>
++#include <asm/calling.h>
++#include <asm/msr.h>
++#include <asm/unistd.h>
++#include <asm/thread_info.h>
++#include <asm/hw_irq.h>
++#include <asm/errno.h>
++
++	.code64
++
++	.macro FAKE_STACK_FRAME child_rip
++	/* push in order ss, rsp, eflags, cs, rip */
++	xorq %rax, %rax
++	pushq %rax /* ss */
++	pushq %rax /* rsp */
++	pushq $(1<<9) /* eflags - interrupts on */
++	pushq $__KERNEL_CS /* cs */
++	pushq \child_rip /* rip */
++	pushq	%rax /* orig rax */
++	.endm
++
++	.macro UNFAKE_STACK_FRAME
++	addq $8*6, %rsp
++	.endm
++
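++/*
++ * asm_kernel_thread: like kernel_thread(), but routed through
++ * do_fork_pid(), which takes an extra pid argument (pushed from %rcx)
++ * so restore can recreate a kernel thread under a predefined pid.
++ * FAKE_STACK_FRAME builds the pt_regs the child starts from; child_rip
++ * below calls the requested function and exits via do_exit().
++ */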
++ENTRY(asm_kernel_thread)
++	CFI_STARTPROC
++	FAKE_STACK_FRAME $child_rip
++	SAVE_ALL
++
++	# rdi: flags, rsi: usp, rdx: will be &pt_regs
++	movq %rdx,%rdi
++	orq  $0x00800000,%rdi
++	movq $-1, %rsi
++	movq %rsp, %rdx
++
++	xorl %r8d,%r8d
++	xorl %r9d,%r9d
++	pushq %rcx
++	call do_fork_pid
++	addq $8, %rsp
++	/* call do_fork */
++	movq %rax,RAX(%rsp)
++	xorl %edi,%edi
++	RESTORE_ALL
++	UNFAKE_STACK_FRAME
++	ret
++	CFI_ENDPROC
++ENDPROC(asm_kernel_thread)
++
++child_rip:
++	pushq $0		# fake return address
++	CFI_STARTPROC
++	movq %rdi, %rax
++	movq %rsi, %rdi
++	call *%rax
++	movq %rax, %rdi
++	call do_exit
++	CFI_ENDPROC
++ENDPROC(child_rip)
++
+diff --git a/kernel/cpt/rst_conntrack.c b/kernel/cpt/rst_conntrack.c
+new file mode 100644
+index 0000000..4c31f32
+--- /dev/null
++++ b/kernel/cpt/rst_conntrack.c
+@@ -0,0 +1,283 @@
++/*
++ *
++ *  kernel/cpt/rst_conntrack.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/unistd.h>
++#include <linux/ve.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++#include <linux/icmp.h>
++#include <linux/ip.h>
++
++#if defined(CONFIG_VE_IPTABLES) && \
++    (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++
++#include <linux/netfilter.h>
++#include <linux/netfilter_ipv4/ip_conntrack.h>
++#include <linux/netfilter_ipv4/ip_nat.h>
++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
++#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++#include <linux/netfilter_ipv4/ip_conntrack_core.h>
++#include <linux/netfilter_ipv4/ip_nat_helper.h>
++#include <linux/netfilter_ipv4/ip_nat_core.h>
++
++#define ASSERT_READ_LOCK(x) do { } while (0)
++#define ASSERT_WRITE_LOCK(x) do { } while (0)
++
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
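++/*
++ * Restored conntracks are kept on this temporary list together with
++ * their image indices, so expectations can resolve their
++ * cpt_sibling_conntrack references and the timeout timers are armed
++ * only after the whole section has been read back.
++ */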
++struct ct_holder
++{
++	struct ct_holder *next;
++	struct ip_conntrack *ct;
++	int index;
++};
++
++static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir)
++{
++	tuple->dst.ip = v->cpt_dst;
++	tuple->dst.u.all = v->cpt_dstport;
++	tuple->dst.protonum = v->cpt_protonum;
++	tuple->dst.dir = v->cpt_dir;
++	if (dir != tuple->dst.dir)
++		wprintk("dir != tuple->dst.dir\n");
++
++	tuple->src.ip = v->cpt_src;
++	tuple->src.u.all = v->cpt_srcport;
++}
++
++
++static int undump_expect_list(struct ip_conntrack *ct,
++			      struct cpt_ip_conntrack_image *ci,
++			      loff_t pos, struct ct_holder *ct_list,
++			      cpt_context_t *ctx)
++{
++	loff_t end;
++	int err;
++
++	end = pos + ci->cpt_next;
++	pos += ci->cpt_hdrlen;
++	while (pos < end) {
++		struct cpt_ip_connexpect_image v;
++		struct ip_conntrack_expect *exp;
++		struct ip_conntrack *sibling;
++
++		err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx);
++		if (err)
++			return err;
++
++		sibling = NULL;
++		if (v.cpt_sibling_conntrack) {
++			struct ct_holder *c;
++
++			for (c = ct_list; c; c = c->next) {
++				if (c->index == v.cpt_sibling_conntrack) {
++					sibling = c->ct;
++					break;
++				}
++			}
++			if (!sibling) {
++				eprintk_ctx("lost sibling of expectation\n");
++				return -EINVAL;
++			}
++		}
++
++		write_lock_bh(&ip_conntrack_lock);
++
++		/* This is possible: the helper module could have just been
++		 * unregistered; if the expectation were on the list, it
++		 * would have been destroyed. */
++		if (ct->helper == NULL) {
++			write_unlock_bh(&ip_conntrack_lock);
++			dprintk_ctx("conntrack: no helper and non-trivial expectation\n");
++			continue;
++		}
++
++		exp = ip_conntrack_expect_alloc(NULL);
++		if (exp == NULL) {
++			write_unlock_bh(&ip_conntrack_lock);
++			return -ENOMEM;
++		}
++
++		if (ct->helper->timeout && !del_timer(&exp->timeout)) {
++			/* Dying already. We can do nothing. */
++			write_unlock_bh(&ip_conntrack_lock);
++			dprintk_ctx("conntrack expectation is dying\n");
++			continue;
++		}
++
++		decode_tuple(&v.cpt_tuple, &exp->tuple, 0);
++		decode_tuple(&v.cpt_mask, &exp->mask, 0);
++
++		exp->master = ct;
++		nf_conntrack_get(&ct->ct_general);
++		ip_conntrack_expect_insert(exp);
++#if 0
++		if (sibling) {
++			exp->sibling = sibling;
++			sibling->master = exp;
++			LIST_DELETE(&ve_ip_conntrack_expect_list, exp);
++			ct->expecting--;
++			nf_conntrack_get(&master_ct(sibling)->infos[0]);
++		} else
++#endif
++		if (ct->helper->timeout) {
++			exp->timeout.expires = jiffies + v.cpt_timeout;
++			add_timer(&exp->timeout);
++		}
++		write_unlock_bh(&ip_conntrack_lock);
++
++		pos += v.cpt_next;
++	}
++	return 0;
++}
++
++static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos,
++			 struct ct_holder **ct_list, cpt_context_t *ctx)
++{
++	int err = 0;
++	struct ip_conntrack *conntrack;
++	struct ct_holder *c;
++	struct ip_conntrack_tuple orig, repl;
++
++	c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
++	if (c == NULL)
++		return -ENOMEM;
++
++	decode_tuple(&ci->cpt_tuple[0], &orig, 0);
++	decode_tuple(&ci->cpt_tuple[1], &repl, 1);
++
++	conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub);
++	if (!conntrack || IS_ERR(conntrack)) {
++		kfree(c);
++		return -ENOMEM;
++	}
++
++	c->ct = conntrack;
++	c->next = *ct_list;
++	*ct_list = c;
++	c->index = ci->cpt_index;
++
++	decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0);
++	decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1);
++
++	conntrack->status = ci->cpt_status;
++
++	memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto));
++	memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help));
++
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
++	defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
++	conntrack->nat.masq_index = ci->cpt_masq_index;
++#endif
++	if (ci->cpt_initialized) {
++		conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos;
++		conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before;
++		conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after;
++		conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos;
++		conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before;
++		conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after;
++	}
++	if (conntrack->status & IPS_NAT_DONE_MASK)
++		ip_nat_hash_conntrack(conntrack);
++#endif
++
++	if (ci->cpt_ct_helper) {
++		conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple);
++		if (conntrack->helper == NULL) {
++			eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n");
++			err = -EINVAL;
++		}
++	}
++
++	ip_conntrack_hash_insert(conntrack);
++	conntrack->timeout.expires = jiffies + ci->cpt_timeout;
++
++	if (err == 0 && ci->cpt_next > ci->cpt_hdrlen)
++		err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx);
++
++	return err;
++}
++
++int rst_restore_ip_conntrack(struct cpt_context * ctx)
++{
++	int err = 0;
++	loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_ip_conntrack_image ci;
++	struct ct_holder *c;
++	struct ct_holder *ct_list = NULL;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) {
++		eprintk_ctx("conntrack module ct->proto version mismatch\n");
++		return -EINVAL;
++	}
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx);
++		if (err)
++			break;
++		err = undump_one_ct(&ci, sec, &ct_list, ctx);
++		if (err)
++			break;
++		sec += ci.cpt_next;
++	}
++
++	while ((c = ct_list) != NULL) {
++		ct_list = c->next;
++		if (c->ct)
++			add_timer(&c->ct->timeout);
++		kfree(c);
++	}
++
++	return err;
++}
++
++#else
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++int rst_restore_ip_conntrack(struct cpt_context * ctx)
++{
++	if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL)
++		return -EINVAL;
++	return 0;
++}
++
++#endif
+diff --git a/kernel/cpt/rst_context.c b/kernel/cpt/rst_context.c
+new file mode 100644
+index 0000000..47e4f35
+--- /dev/null
++++ b/kernel/cpt/rst_context.c
+@@ -0,0 +1,323 @@
++/*
++ *
++ *  kernel/cpt/rst_context.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx)
++{
++	mm_segment_t oldfs;
++	ssize_t err = -EBADF;
++	struct file *file = ctx->file;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if (file)
++		err = file->f_op->read(file, addr, count, &file->f_pos);
++	set_fs(oldfs);
++	if (err != count)
++		return err >= 0 ? -EIO : err;
++	return 0;
++}
++
++static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
++{
++	mm_segment_t oldfs;
++	ssize_t err = -EBADF;
++	struct file *file = ctx->file;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if (file)
++		err = file->f_op->read(file, addr, count, &pos);
++	set_fs(oldfs);
++	if (err != count)
++		return err >= 0 ? -EIO : err;
++	return 0;
++}
++
++static void file_align(struct cpt_context *ctx)
++{
++	struct file *file = ctx->file;
++
++	if (file)
++		file->f_pos = CPT_ALIGN(file->f_pos);
++}
++
++int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end)
++{
++	struct cpt_section_hdr hdr;
++	int err;
++	loff_t pos;
++
++	pos = ctx->sections[type];
++	*start = *end = pos;
++
++	if (pos != CPT_NULL) {
++		if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0)
++			return err;
++		if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr))
++			return -EINVAL;
++		*start = pos + hdr.cpt_hdrlen;
++		*end = pos + hdr.cpt_next;
++	}
++	return 0;
++}
++EXPORT_SYMBOL(rst_get_section);
++
++void rst_context_init(struct cpt_context *ctx)
++{
++	int i;
++
++	memset(ctx, 0, sizeof(*ctx));
++
++	init_MUTEX(&ctx->main_sem);
++	ctx->refcount = 1;
++
++	ctx->current_section = -1;
++	ctx->current_object = -1;
++	ctx->pagesize = PAGE_SIZE;
++	ctx->read = file_read;
++	ctx->pread = file_pread;
++	ctx->align = file_align;
++	for (i=0; i < CPT_SECT_MAX; i++)
++		ctx->sections[i] = CPT_NULL;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	init_completion(&ctx->pgin_notify);
++#endif
++	cpt_object_init(ctx);
++}
++
++static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx)
++{
++	struct cpt_section_hdr h;
++
++	while (start < end) {
++		int err;
++
++		err = ctx->pread(&h, sizeof(h), ctx, start);
++		if (err)
++			return err;
++		if (h.cpt_hdrlen < sizeof(h) ||
++		    h.cpt_next < h.cpt_hdrlen ||
++		    start + h.cpt_next > end)
++			return -EINVAL;
++		if (h.cpt_section >= CPT_SECT_MAX)
++			return -EINVAL;
++		ctx->sections[h.cpt_section] = start;
++		start += h.cpt_next;
++	}
++	return 0;
++}
++
++int rst_open_dumpfile(struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_major_tail *v;
++	struct cpt_major_hdr  h;
++	unsigned long size;
++
++	err = -EBADF;
++	if (!ctx->file)
++		goto err_out;
++
++	err = -ENOMEM;
++	ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
++	if (ctx->tmpbuf == NULL)
++		goto err_out;
++	__cpt_release_buf(ctx);
++
++	size = ctx->file->f_dentry->d_inode->i_size;
++
++	if (size & 7) {
++		err = -EINVAL;
++		goto err_out;
++	}
++	if (size < sizeof(struct cpt_major_hdr) +
++	    sizeof(struct cpt_major_tail)) {
++		err = -EINVAL;
++		goto err_out;
++	}
++	err = ctx->pread(&h, sizeof(h), ctx, 0);
++	if (err) {
++		eprintk_ctx("image too short (1): %d\n", err);
++		goto err_out;
++	}
++	if (h.cpt_signature[0] != CPT_SIGNATURE0 ||
++	    h.cpt_signature[1] != CPT_SIGNATURE1 ||
++	    h.cpt_signature[2] != CPT_SIGNATURE2 ||
++	    h.cpt_signature[3] != CPT_SIGNATURE3) {
++		err = -EINVAL;
++		goto err_out;
++	}
++	if (h.cpt_hz != HZ) {
++		err = -EINVAL;
++		eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ);
++		goto err_out;
++	}
++	ctx->virt_jiffies64 = h.cpt_start_jiffies64;
++	ctx->start_time.tv_sec = h.cpt_start_sec;
++	ctx->start_time.tv_nsec = h.cpt_start_nsec;
++	ctx->kernel_config_flags = h.cpt_kernel_config[0];
++	ctx->iptables_mask = h.cpt_iptables_mask;
++	if (h.cpt_image_version > CPT_VERSION_20 ||
++			CPT_VERSION_MINOR(h.cpt_image_version) > 1) {
++		eprintk_ctx("Unknown image version: %x. Can't restore.\n",
++				h.cpt_image_version);
++		err = -EINVAL;
++		goto err_out;
++	}
++	ctx->image_version = h.cpt_image_version;
++	ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features);
++	ctx->image_arch = h.cpt_os_arch;
++
++	v = cpt_get_buf(ctx);
++	err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v));
++	if (err) {
++		eprintk_ctx("image too short (2): %d\n", err);
++		cpt_release_buf(ctx);
++		goto err_out;
++	}
++	if (v->cpt_signature[0] != CPT_SIGNATURE0 ||
++	    v->cpt_signature[1] != CPT_SIGNATURE1 ||
++	    v->cpt_signature[2] != CPT_SIGNATURE2 ||
++	    v->cpt_signature[3] != CPT_SIGNATURE3 ||
++	    v->cpt_nsect != CPT_SECT_MAX_INDEX) {
++		err = -EINVAL;
++		cpt_release_buf(ctx);
++		goto err_out;
++	}
++	if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) {
++		cpt_release_buf(ctx);
++		goto err_out;
++	}
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	ctx->lazypages = v->cpt_lazypages;
++#endif
++	ctx->tasks64 = v->cpt_64bit;
++	cpt_release_buf(ctx);
++	return 0;
++
++err_out:
++	if (ctx->tmpbuf) {
++		free_page((unsigned long)ctx->tmpbuf);
++		ctx->tmpbuf = NULL;
++	}
++	return err;
++}
++
++void rst_close_dumpfile(struct cpt_context *ctx)
++{
++	if (ctx->file) {
++		fput(ctx->file);
++		ctx->file = NULL;
++	}
++	if (ctx->tmpbuf) {
++		free_page((unsigned long)ctx->tmpbuf);
++		ctx->tmpbuf = NULL;
++	}
++}
++
++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_object_hdr *hdr = tmp;
++	err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos);
++	if (err)
++		return err;
++	if (type > 0 && type != hdr->cpt_object)
++		return -EINVAL;
++	if (hdr->cpt_hdrlen > hdr->cpt_next)
++		return -EINVAL;
++	if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr))
++		return -EINVAL;
++	if (size < sizeof(*hdr))
++		return -EINVAL;
++	if (size > hdr->cpt_hdrlen)
++		size = hdr->cpt_hdrlen;
++	if (size > sizeof(*hdr))
++		err = ctx->pread(hdr+1, size - sizeof(*hdr),
++				 ctx, pos + sizeof(*hdr));
++	return err;
++}
++EXPORT_SYMBOL(_rst_get_object);
++
++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	void *tmp;
++	struct cpt_object_hdr hdr;
++	err = ctx->pread(&hdr, sizeof(hdr), ctx, pos);
++	if (err)
++		return NULL;
++	if (type > 0 && type != hdr.cpt_object)
++		return NULL;
++	if (hdr.cpt_hdrlen > hdr.cpt_next)
++		return NULL;
++	if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr))
++		return NULL;
++	tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL);
++	if (!tmp)
++		return NULL;
++	err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos);
++	if (!err)
++		return tmp;
++	kfree(tmp);
++	return NULL;
++}
++EXPORT_SYMBOL(__rst_get_object);
++
++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_object_hdr hdr;
++	__u8 *name;
++
++	err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx);
++	if (err)
++		return NULL;
++	if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE)
++		return NULL;
++	name = (void*)__get_free_page(GFP_KERNEL);
++	if (!name)
++		return NULL;
++	err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen,
++		   ctx, *pos_p + hdr.cpt_hdrlen);
++	if (err) {
++		free_page((unsigned long)name);
++		return NULL;
++	}
++	*pos_p += hdr.cpt_next;
++	return name;
++}
++
++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx)
++{
++	return __rst_get_name(&pos, ctx);
++}
++
++void rst_put_name(__u8 *name, struct cpt_context *ctx)
++{
++	unsigned long addr = (unsigned long)name;
++
++	if (addr)
++		free_page(addr&~(PAGE_SIZE-1));
++}
+diff --git a/kernel/cpt/rst_epoll.c b/kernel/cpt/rst_epoll.c
+new file mode 100644
+index 0000000..0ac4cae
+--- /dev/null
++++ b/kernel/cpt/rst_epoll.c
+@@ -0,0 +1,169 @@
++/*
++ *
++ *  kernel/cpt/rst_epoll.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mnt_namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/eventpoll.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++/* These functions are static in fs/eventpoll.c */
++extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++		     struct file *tfile, int fd);
++extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
++extern void ep_release_epitem(struct epitem *epi);
++
++
++struct file *cpt_open_epolldev(struct cpt_file_image *fi,
++			       unsigned flags,
++			       struct cpt_context *ctx)
++{
++	struct file *file;
++	int efd;
++
++	/* Argument "size" is ignored, use just 1 */
++	efd = sys_epoll_create(1);
++	if (efd < 0)
++		return ERR_PTR(efd);
++
++	file = fget(efd);
++	sys_close(efd);
++	return file;
++}
++
++static int restore_one_epoll(cpt_object_t *obj,
++			     loff_t pos,
++			     struct cpt_epoll_image *ebuf,
++			     cpt_context_t *ctx)
++{
++	int err = 0;
++	loff_t endpos;
++	struct file *file = obj->o_obj;
++	struct eventpoll *ep;
++
++	if (file->f_op != &eventpoll_fops) {
++		eprintk_ctx("bad epoll file\n");
++		return -EINVAL;
++	}
++
++	ep = file->private_data;
++
++	if (unlikely(ep == NULL)) {
++		eprintk_ctx("bad epoll device\n");
++		return -EINVAL;
++	}
++
++	endpos = pos + ebuf->cpt_next;
++	pos += ebuf->cpt_hdrlen;
++	while (pos < endpos) {
++		struct cpt_epoll_file_image efi;
++		struct epoll_event epds;
++
++		cpt_object_t *tobj;
++
++		err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
++		if (err)
++			return err;
++		tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
++		if (!tobj) {
++			eprintk_ctx("epoll file not found\n");
++			return -EINVAL;
++		}
++		epds.events = efi.cpt_events;
++		epds.data = efi.cpt_data;
++		mutex_lock(&ep->mtx);
++		err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd);
++		if (!err) {
++			struct epitem *epi;
++			epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
++			if (epi) {
++				if (efi.cpt_ready) {
++					unsigned long flags;
++					spin_lock_irqsave(&ep->lock, flags);
++					if (list_empty(&epi->rdllink))
++						list_add_tail(&epi->rdllink, &ep->rdllist);
++					spin_unlock_irqrestore(&ep->lock, flags);
++				}
++			}
++		}
++		mutex_unlock(&ep->mtx);
++		if (err)
++			break;
++		pos += efi.cpt_next;
++	}
++	return err;
++}
++
++int rst_eventpoll(cpt_context_t *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_EPOLL];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		cpt_object_t *obj;
++		struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
++		if (obj == NULL) {
++			eprintk_ctx("cannot find epoll file object\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		err = restore_one_epoll(obj, sec, ebuf, ctx);
++		cpt_release_buf(ctx);
++		if (err)
++			return err;
++		sec += ebuf->cpt_next;
++	}
++
++	return 0;
++}
+diff --git a/kernel/cpt/rst_files.c b/kernel/cpt/rst_files.c
+new file mode 100644
+index 0000000..534ea3a
+--- /dev/null
++++ b/kernel/cpt/rst_files.c
+@@ -0,0 +1,1661 @@
++/*
++ *
++ *  kernel/cpt/rst_files.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mount.h>
++#include <linux/tty.h>
++#include <linux/namei.h>
++#include <linux/vmalloc.h>
++#include <linux/smp_lock.h>
++#include <linux/vmalloc.h>
++#include <linux/pagemap.h>
++#include <asm/uaccess.h>
++#include <bc/kmem.h>
++#include <linux/cpt_image.h>
++#include <linux/mnt_namespace.h>
++#include <linux/fdtable.h>
++#include <linux/shm.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++
++#include "cpt_syscalls.h"
++
++
++struct filejob {
++	struct filejob *next;
++	int	pid;
++	loff_t	fdi;
++};
++
++static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx)
++{
++	struct filejob *j;
++
++	j = kmalloc(sizeof(*j), GFP_KERNEL);
++	if (j == NULL)
++		return -ENOMEM;
++	j->pid = current->pid;
++	j->fdi = pos;
++	j->next = ctx->filejob_queue;
++	ctx->filejob_queue = j;
++	return 0;
++}
++
++static void _anon_pipe_buf_release(struct pipe_inode_info *pipe,
++				  struct pipe_buffer *buf)
++{
++	struct page *page = buf->page;
++
++	/*
++	 * If nobody else uses this page, and we don't already have a
++	 * temporary page, let's keep track of it as a one-deep
++	 * allocation cache. (Otherwise just release our reference to it)
++	 */
++	if (page_count(page) == 1 && !pipe->tmp_page)
++		pipe->tmp_page = page;
++	else
++		page_cache_release(page);
++
++	module_put(THIS_MODULE);
++}
++
++static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe,
++			   struct pipe_buffer *buf, int atomic)
++{
++	if (atomic) {
++		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
++		return kmap_atomic(buf->page, KM_USER0);
++	}
++
++	return kmap(buf->page);
++}
++
++static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
++			    struct pipe_buffer *buf, void *map_data)
++{
++	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
++		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
++		kunmap_atomic(map_data, KM_USER0);
++	} else
++		kunmap(buf->page);
++}
++
++static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe,
++			   struct pipe_buffer *buf)
++{
++	struct page *page = buf->page;
++
++	if (page_count(page) == 1) {
++		lock_page(page);
++		return 0;
++	}
++
++	return 1;
++}
++
++static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
++{
++	page_cache_get(buf->page);
++}
++
++static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf)
++{
++	return 0;
++}
++
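++/* Private copy of the anonymous pipe buffer operations: buffers created
++ * here hold a reference on this module, taken with try_module_get() in
++ * fixup_pipe_data() and dropped in _anon_pipe_buf_release(). */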
++static struct pipe_buf_operations _anon_pipe_buf_ops = {
++	.can_merge = 1,
++	.map = _anon_pipe_buf_map,
++	.unmap = _anon_pipe_buf_unmap,
++	.release = _anon_pipe_buf_release,
++	.confirm = _anon_pipe_buf_confirm,
++	.get = _anon_pipe_buf_get,
++	.steal = _anon_pipe_buf_steal,
++};
++
++/* Sorta ugly... Multiple readers/writers of a named pipe could rewrite
++ * the buffer many times. We need to mark it in the CPT_OBJ_INODE table
++ * in some way.
++ */
++static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi,
++			   struct cpt_context *ctx)
++{
++	struct inode *ino = file->f_dentry->d_inode;
++	struct cpt_inode_image ii;
++	struct cpt_obj_bits b;
++	struct pipe_inode_info *info;
++	int err;
++	int count;
++
++	if (!S_ISFIFO(ino->i_mode)) {
++		eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode);
++		return -EINVAL;
++	}
++	if (fi->cpt_inode == CPT_NULL)
++		return 0;
++
++	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
++	if (err)
++		return err;
++
++	if (ii.cpt_next <= ii.cpt_hdrlen)
++		return 0;
++
++	err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx);
++	if (err)
++		return err;
++
++	if (b.cpt_size == 0)
++		return 0;
++
++	mutex_lock(&ino->i_mutex);
++	info = ino->i_pipe;
++	if (info->nrbufs) {
++		mutex_unlock(&ino->i_mutex);
++		eprintk_ctx("pipe buffer is already restored\n");
++		return -EINVAL;
++	}
++	info->curbuf = 0;
++	count = 0;
++	while (count < b.cpt_size) {
++		struct pipe_buffer *buf = info->bufs + info->nrbufs;
++		void * addr;
++		int chars;
++
++		chars = b.cpt_size - count;
++		if (chars > PAGE_SIZE)
++			chars = PAGE_SIZE;
++		if (!try_module_get(THIS_MODULE)) {
++			err = -EBUSY;
++			break;
++		}
++
++		buf->page = alloc_page(GFP_HIGHUSER);
++		if (buf->page == NULL) {
++			err = -ENOMEM;
++			break;
++		}
++		buf->ops = &_anon_pipe_buf_ops;
++		buf->offset = 0;
++		buf->len = chars;
++		info->nrbufs++;
++		addr = kmap(buf->page);
++		err = ctx->pread(addr, chars, ctx,
++				 fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count);
++		kunmap(buf->page);
++		if (err)
++			break;
++		count += chars;
++	}
++	mutex_unlock(&ino->i_mutex);
++
++	return err;
++}
++
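++/* Rebuild open(2) flags from the checkpointed f_mode/f_flags. O_CREAT,
++ * O_TRUNC and O_EXCL are masked out so that reopening never modifies the
++ * file; O_NONBLOCK|O_NOCTTY prevent blocking on FIFOs and acquiring a
++ * controlling tty while the image is restored. */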
++static int make_flags(struct cpt_file_image *fi)
++{
++	int flags = O_NOFOLLOW;
++	switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) {
++	case FMODE_READ|FMODE_WRITE:
++		flags |= O_RDWR; break;
++	case FMODE_WRITE:
++		flags |= O_WRONLY; break;
++	case FMODE_READ:
++		flags |= O_RDONLY; break;
++	default: break;
++	}
++	flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
++	flags |= O_NONBLOCK|O_NOCTTY;
++	return flags;
++}
++
++static struct file *open_pipe(char *name,
++			      struct cpt_file_image *fi,
++			      unsigned flags,
++			      struct cpt_context *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++	struct cpt_inode_image ii;
++	struct file *rf, *wf;
++
++	err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
++	if (err)
++		return ERR_PTR(err);
++
++	if (ii.cpt_sb == FSMAGIC_PIPEFS) {
++		int pfd[2];
++
++		if ((err = sc_pipe(pfd)) < 0)
++			return ERR_PTR(err);
++
++		rf = fcheck(pfd[0]);
++		wf = fcheck(pfd[1]);
++		get_file(rf);
++		get_file(wf);
++		sc_close(pfd[0]);
++		sc_close(pfd[1]);
++
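++		/* sc_pipe() gave us rf = read side, wf = write side.
++		 * The caller receives wf, so swap the two if the image
++		 * says this file was opened for reading. */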
++		if (fi->cpt_mode&FMODE_READ) {
++			struct file *tf;
++			tf = wf; wf = rf; rf = tf;
++		}
++	} else {
++		if (fi->cpt_mode&FMODE_READ) {
++			rf = filp_open(name, flags, 0);
++			if (IS_ERR(rf)) {
++				dprintk_ctx("filp_open\n");
++				return rf;
++			}
++			dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current),
++				    (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode);
++			return rf;
++		}
++
++		dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode);
++
++		rf = filp_open(name, O_RDWR|O_NONBLOCK, 0);
++		if (IS_ERR(rf))
++			return rf;
++		wf = dentry_open(dget(rf->f_dentry),
++				 mntget(rf->f_vfsmnt), flags);
++		if (IS_ERR(wf)) {
++			fput(rf);
++			return wf;
++		}
++	}
++
++	/* Add pipe inode to obj table. */
++	obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
++	if (obj == NULL) {
++		fput(rf); fput(wf);
++		return ERR_PTR(-ENOMEM);
++	}
++	cpt_obj_setpos(obj, fi->cpt_inode, ctx);
++	obj->o_parent = rf;
++
++	/* Add the other side of the pipe to the obj table; it will not be
++	 * used (o_pos = CPT_NULL), other processes opening the pipe will
++	 * find the inode and open it with dentry_open(). */
++	obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
++	if (obj == NULL) {
++		fput(wf);
++		return ERR_PTR(-ENOMEM);
++	}
++	return wf;
++}
++
++static struct file *open_special(struct cpt_file_image *fi,
++				 unsigned flags,
++				 int deleted,
++				 struct cpt_context *ctx)
++{
++	struct cpt_inode_image *ii;
++	struct file *file;
++
++	/* Directories and named pipes are not actually special */
++	if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
++		return NULL;
++
++	/* No support for block devices at the moment. */
++	if (S_ISBLK(fi->cpt_i_mode))
++		return ERR_PTR(-EINVAL);
++
++	if (S_ISSOCK(fi->cpt_i_mode)) {
++		eprintk_ctx("bug: socket is not open\n");
++		return ERR_PTR(-EINVAL);
++	}
++
++	/* Support only (some) character devices at the moment. */
++	if (!S_ISCHR(fi->cpt_i_mode))
++		return ERR_PTR(-EINVAL);
++
++	ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
++	if (ii == NULL)
++		return ERR_PTR(-ENOMEM);
++
++	/* Do not worry about this right now: /dev/null, /dev/zero and
++	 * /dev/*random live here. Should we prohibit at least /dev/mem?
++	 */
++	if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
++		kfree(ii);
++		return NULL;
++	}
++
++	/* /dev/net/tun will be opened by caller */
++	if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) {
++		kfree(ii);
++		return NULL;
++	}
++
++	file = rst_open_tty(fi, ii, flags, ctx);
++	kfree(ii);
++	return file;
++}
++
++static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx)
++{
++	struct file_lock lock;
++	cpt_object_t *obj;
++
++	memset(&lock, 0, sizeof(lock));
++	lock.fl_type = fli->cpt_type;
++	lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
++	lock.fl_start = fli->cpt_start;
++	lock.fl_end = fli->cpt_end;
++	obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
++	if (!obj) {
++		eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
++		return -EINVAL;
++	}
++	lock.fl_owner = obj->o_obj;
++	lock.fl_pid = vpid_to_pid(fli->cpt_pid);
++	if (lock.fl_pid < 0) {
++		eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
++		return -EINVAL;
++	}
++	lock.fl_file = file;
++
++	if (lock.fl_owner == NULL)
++		eprintk_ctx("no lock owner\n");
++	return posix_lock_file(file, &lock, NULL);
++}
++
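++/* BSD locks are taken via the flock() syscall, which operates on file
++ * descriptors, so temporarily install the file into an unused fd. */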
++static int restore_flock(struct file *file, struct cpt_flock_image *fli,
++			 cpt_context_t *ctx)
++{
++	int cmd, err, fd;
++	fd = get_unused_fd();
++	if (fd < 0) {
++		eprintk_ctx("BSD flock cannot be restored\n");
++		return fd;
++	}
++	get_file(file);
++	fd_install(fd, file);
++	if (fli->cpt_type == F_RDLCK) {
++		cmd = LOCK_SH;
++	} else if (fli->cpt_type == F_WRLCK) {
++		cmd = LOCK_EX;
++	} else {
++		eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
++		sc_close(fd);
++		return -EINVAL;
++	}
++
++	err = sc_flock(fd, LOCK_NB | cmd);
++	sc_close(fd);
++	return err;
++}
++
++
++static int fixup_posix_locks(struct file *file,
++			     struct cpt_file_image *fi,
++			     loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	loff_t end;
++	struct cpt_flock_image fli;
++
++	end = pos + fi->cpt_next;
++	pos += fi->cpt_hdrlen;
++	while (pos < end) {
++		err = rst_get_object(-1, pos, &fli, ctx);
++		if (err)
++			return err;
++		if (fli.cpt_object == CPT_OBJ_FLOCK &&
++		    (fli.cpt_flags&FL_POSIX)) {
++			err = restore_posix_lock(file, &fli, ctx);
++			if (err)
++				return err;
++			dprintk_ctx("posix lock restored\n");
++		}
++		pos += fli.cpt_next;
++	}
++	return 0;
++}
++
++int rst_posix_locks(struct cpt_context *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++		struct cpt_file_image fi;
++
++		if (obj->o_pos == CPT_NULL)
++			continue;
++
++		err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx);
++		if (err < 0)
++			return err;
++		if (fi.cpt_next > fi.cpt_hdrlen)
++			fixup_posix_locks(file, &fi, obj->o_pos, ctx);
++	}
++	return 0;
++}
++
++static int fixup_flocks(struct file *file,
++			struct cpt_file_image *fi,
++			loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	loff_t end;
++	struct cpt_flock_image fli;
++
++	end = pos + fi->cpt_next;
++	pos += fi->cpt_hdrlen;
++	while (pos < end) {
++		err = rst_get_object(-1, pos, &fli, ctx);
++		if (err)
++			return err;
++		if (fli.cpt_object == CPT_OBJ_FLOCK &&
++		    (fli.cpt_flags&FL_FLOCK)) {
++			err = restore_flock(file, &fli, ctx);
++			if (err)
++				return err;
++			dprintk_ctx("bsd lock restored\n");
++		}
++		pos += fli.cpt_next;
++	}
++	return 0;
++}
++
++
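++/* Rewrite the contents of a regular (typically deleted) file from the
++ * CPT_OBJ_PAGES records in the image, one page-sized chunk at a time. */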
++static int fixup_reg_data(struct file *file, loff_t pos, loff_t end,
++			  struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_page_block pgb;
++	ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
++
++	do_write = file->f_op->write;
++	if (do_write == NULL) {
++		eprintk_ctx("no write method. Cannot restore contents of the file.\n");
++		return -EINVAL;
++	}
++
++	atomic_inc(&file->f_count);
++
++	while (pos < end) {
++		loff_t opos;
++		loff_t ipos;
++		int count;
++
++		err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
++		if (err)
++			goto out;
++		dprintk_ctx("restoring file data block: %08x-%08x\n",
++		       (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
++		ipos = pos + pgb.cpt_hdrlen;
++		opos = pgb.cpt_start;
++		count = pgb.cpt_end-pgb.cpt_start;
++		while (count > 0) {
++			mm_segment_t oldfs;
++			int copy = count;
++
++			if (copy > PAGE_SIZE)
++				copy = PAGE_SIZE;
++			(void)cpt_get_buf(ctx);
++			oldfs = get_fs(); set_fs(KERNEL_DS);
++			err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
++			set_fs(oldfs);
++			if (err) {
++				__cpt_release_buf(ctx);
++				goto out;
++			}
++			if (!(file->f_mode & FMODE_WRITE) ||
++			    (file->f_flags&O_DIRECT)) {
++				fput(file);
++				file = dentry_open(dget(file->f_dentry),
++						   mntget(file->f_vfsmnt), O_WRONLY);
++				if (IS_ERR(file)) {
++					__cpt_release_buf(ctx);
++					return PTR_ERR(file);
++				}
++			}
++			oldfs = get_fs(); set_fs(KERNEL_DS);
++			ipos += copy;
++			err = do_write(file, ctx->tmpbuf, copy, &opos);
++			set_fs(oldfs);
++			__cpt_release_buf(ctx);
++			if (err != copy) {
++				if (err >= 0)
++					err = -EIO;
++				goto out;
++			}
++			count -= copy;
++		}
++		pos += pgb.cpt_next;
++	}
++	err = 0;
++
++out:
++	fput(file);
++	return err;
++}
++
++
++static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi,
++			      struct cpt_inode_image *ii,
++			      struct cpt_context *ctx)
++{
++	int err;
++	struct file *file = *file_p;
++	struct iattr newattrs;
++
++	if (!S_ISREG(fi->cpt_i_mode))
++		return 0;
++
++	if (file == NULL) {
++		file = shmem_file_setup("dev/zero", ii->cpt_size, 0);
++		if (IS_ERR(file))
++			return PTR_ERR(file);
++		*file_p = file;
++	}
++
++	if (ii->cpt_next > ii->cpt_hdrlen) {
++		struct cpt_object_hdr hdr;
++		err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen);
++		if (err)
++			return err;
++		if (hdr.cpt_object == CPT_OBJ_PAGES) {
++			err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen,
++					fi->cpt_inode+ii->cpt_next, ctx);
++			if (err)
++				return err;
++		}
++	}
++
++	mutex_lock(&file->f_dentry->d_inode->i_mutex);
++	/* stage 1 - update size like do_truncate does */
++	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
++	newattrs.ia_size = ii->cpt_size;
++	cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime);
++	err = notify_change(file->f_dentry, &newattrs);
++	if (err)
++		goto out;
++
++	/* stage 2 - update times, owner and mode */
++	newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME |
++		ATTR_ATIME_SET | ATTR_MTIME_SET |
++		ATTR_MODE | ATTR_UID | ATTR_GID;
++	newattrs.ia_uid = ii->cpt_uid;
++	newattrs.ia_gid = ii->cpt_gid;
++	newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT;
++	newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT);
++	cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime);
++	cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime);
++	err = notify_change(file->f_dentry, &newattrs);
++
++out:
++	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
++	return err;
++}
++
++static int fixup_file_flags(struct file *file, struct cpt_file_image *fi,
++			    int was_dentry_open, loff_t pos,
++			    cpt_context_t *ctx)
++{
++	if (fi->cpt_pos != file->f_pos) {
++		int err = -ESPIPE;
++		if (file->f_op->llseek)
++			err = file->f_op->llseek(file, fi->cpt_pos, 0);
++		if (err < 0) {
++			dprintk_ctx("file %Ld lseek %Ld - %Ld\n",
++				    (long long)pos,
++				    (long long)file->f_pos,
++				    (long long)fi->cpt_pos);
++			file->f_pos = fi->cpt_pos;
++		}
++	}
++	file->f_uid = fi->cpt_uid;
++	file->f_gid = fi->cpt_gid;
++	file->f_owner.pid = 0;
++	if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) {
++		file->f_owner.pid = find_get_pid(fi->cpt_fown_pid);
++		if (file->f_owner.pid == NULL) {
++			wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n",
++					fi->cpt_fown_pid);
++			return -EINVAL;
++		}
++	}
++	file->f_owner.uid = fi->cpt_fown_uid;
++	file->f_owner.euid = fi->cpt_fown_euid;
++	file->f_owner.signum = fi->cpt_fown_signo;
++
++	if (file->f_mode != fi->cpt_mode) {
++		if (was_dentry_open &&
++		    ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) {
++			file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK);
++			file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK);
++		}
++		if (file->f_mode != fi->cpt_mode)
++			wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode);
++	}
++	if (file->f_flags != fi->cpt_flags) {
++		if (!(fi->cpt_flags&O_NOFOLLOW))
++			file->f_flags &= ~O_NOFOLLOW;
++		if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) {
++			file->f_flags &= ~O_NONBLOCK;
++			file->f_flags |= fi->cpt_flags&O_NONBLOCK;
++		}
++		if (fi->cpt_flags&FASYNC) {
++			if (fi->cpt_fown_fd == -1) {
++				wprintk_ctx("No fd for FASYNC\n");
++				return -EINVAL;
++			} else if (file->f_op && file->f_op->fasync) {
++				if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) {
++					wprintk_ctx("FASYNC problem\n");
++					return -EINVAL;
++				} else {
++					file->f_flags |= FASYNC;
++				}
++			}
++		}
++		if (file->f_flags != fi->cpt_flags) {
++			eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags);
++			return -EINVAL;
++		}
++	}
++	return 0;
++}
++
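++/* Recreate a deleted file so that it can be opened: make a node of the
++ * proper type under the saved name (falling back to /tmp/rst<pid> when
++ * the original directory is unusable), open it, then unlink it again. */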
++static struct file *
++open_deleted(char *name, unsigned flags, struct cpt_file_image *fi,
++	     struct cpt_inode_image *ii, cpt_context_t *ctx)
++{
++	struct file * file;
++	char *suffix = NULL;
++	int attempt = 0;
++	int tmp_pass = 0;
++	mode_t mode = fi->cpt_i_mode;
++
++	/* Strip (deleted) part... */
++	if (strlen(name) > strlen(" (deleted)")) {
++		if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) {
++			suffix = &name[strlen(name) - strlen(" (deleted)")];
++			*suffix = 0;
++		} else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) {
++			memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1);
++			suffix = name + strlen(name);
++		}
++	}
++
++try_again:
++	for (;;) {
++		if (attempt) {
++			if (attempt > 1000) {
++				eprintk_ctx("open_deleted: failed after %d attempts\n", attempt);
++				return ERR_PTR(-EEXIST);
++			}
++			if (suffix == NULL) {
++				eprintk_ctx("open_deleted: no suffix\n");
++				return ERR_PTR(-EEXIST);
++			}
++			sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt));
++		}
++		attempt++;
++
++		if (S_ISFIFO(mode)) {
++			int err;
++			err = sc_mknod(name, S_IFIFO|(mode&017777), 0);
++			if (err == -EEXIST)
++				continue;
++			if (err < 0 && !tmp_pass)
++				goto change_dir;
++			if (err < 0)
++				return ERR_PTR(err);
++			file = open_pipe(name, fi, flags, ctx);
++			sc_unlink(name);
++		} else if (S_ISCHR(mode)) {
++			int err;
++			err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev));
++			if (err == -EEXIST)
++				continue;
++			if (err < 0 && !tmp_pass)
++				goto change_dir;
++			if (err < 0)
++				return ERR_PTR(err);
++			file = filp_open(name, flags, mode&017777);
++			sc_unlink(name);
++		} else if (S_ISDIR(mode)) {
++			int err;
++			err = sc_mkdir(name, mode&017777);
++			if (err == -EEXIST)
++				continue;
++			if (err < 0 && !tmp_pass)
++				goto change_dir;
++			if (err < 0)
++				return ERR_PTR(err);
++			file = filp_open(name, flags, mode&017777);
++			sc_rmdir(name);
++		} else {
++			file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777);
++			if (IS_ERR(file)) {
++				if (PTR_ERR(file) == -EEXIST)
++					continue;
++				if (!tmp_pass)
++					goto change_dir;
++			} else {
++				sc_unlink(name);
++			}
++		}
++		break;
++	}
++
++	if (IS_ERR(file)) {
++		eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file));
++		return file;
++	} else {
++		dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode);
++	}
++	return file;
++
++change_dir:
++	sprintf(name, "/tmp/rst%u", current->pid);
++	suffix = name + strlen(name);
++	attempt = 1;
++	tmp_pass = 1;
++	goto try_again;
++}
++
++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx)
++{
++	int err;
++	int was_dentry_open = 0;
++	cpt_object_t *obj;
++	cpt_object_t *iobj;
++	struct cpt_file_image fi;
++	__u8 *name = NULL;
++	struct file *file;
++	int flags;
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx);
++	if (obj) {
++		file = obj->o_obj;
++		if (obj->o_index >= 0) {
++			dprintk_ctx("file is attached to a socket\n");
++			err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
++			if (err < 0)
++				goto err_out;
++			fixup_file_flags(file, &fi, 0, pos, ctx);
++		}
++		get_file(file);
++		return file;
++	}
++
++	err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
++	if (err < 0)
++		goto err_out;
++
++	flags = make_flags(&fi);
++
++	/* Easy way, inode has been already open. */
++	if (fi.cpt_inode != CPT_NULL &&
++	    !(fi.cpt_lflags & CPT_DENTRY_CLONING) &&
++	    (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL &&
++	    iobj->o_parent) {
++		struct file *filp = iobj->o_parent;
++		file = dentry_open(dget(filp->f_dentry),
++				   mntget(filp->f_vfsmnt), flags);
++		dprintk_ctx("rst_file: file obtained by dentry_open\n");
++		was_dentry_open = 1;
++		goto map_file;
++	}
++
++	if (fi.cpt_next > fi.cpt_hdrlen)
++		name = rst_get_name(pos + sizeof(fi), ctx);
++
++	if (!name) {
++		eprintk_ctx("no name for file?\n");
++		err = -EINVAL;
++		goto err_out;
++	}
++
++	if (fi.cpt_lflags & CPT_DENTRY_DELETED) {
++		struct cpt_inode_image ii;
++		if (fi.cpt_inode == CPT_NULL) {
++			eprintk_ctx("deleted file and no inode.\n");
++			err = -EINVAL;
++			goto err_out;
++		}
++
++		err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx);
++		if (err)
++			goto err_out;
++
++		if (ii.cpt_next > ii.cpt_hdrlen) {
++			struct cpt_object_hdr hdr;
++			err = ctx->pread(&hdr, sizeof(hdr), ctx,
++					fi.cpt_inode + ii.cpt_hdrlen);
++			if (err)
++				goto err_out;
++			if (hdr.cpt_object == CPT_OBJ_NAME) {
++				rst_put_name(name, ctx);
++				name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen,
++						ctx);
++				if (!name) {
++					eprintk_ctx("no name for link?\n");
++					err = -EINVAL;
++					goto err_out;
++				}
++				goto open_file;
++			}
++		}
++
++		/* One very special case... */
++		if (S_ISREG(fi.cpt_i_mode) &&
++		    (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) {
++			/* MAP_ANON|MAP_SHARED mapping.
++			 * The kernel handles this in a damn ugly way: the
++			 * file passed to mmap by the user does not match
++			 * the file finally attached to the VMA. Ok, rst_mm
++			 * has to take care of this. Otherwise, it will fail.
++			 */
++			file = NULL;
++		} else if (S_ISREG(fi.cpt_i_mode) ||
++			   S_ISCHR(fi.cpt_i_mode) ||
++			   S_ISFIFO(fi.cpt_i_mode) ||
++			   S_ISDIR(fi.cpt_i_mode)) {
++			if (S_ISCHR(fi.cpt_i_mode)) {
++				file = open_special(&fi, flags, 1, ctx);
++				if (file != NULL)
++					goto map_file;
++			}
++			file = open_deleted(name, flags, &fi, &ii, ctx);
++			if (IS_ERR(file))
++				goto out;
++		} else {
++			eprintk_ctx("not a regular deleted file.\n");
++			err = -EINVAL;
++			goto err_out;
++		}
++
++		err = fixup_file_content(&file, &fi, &ii, ctx);
++		if (err)
++			goto err_put;
++		goto map_file;
++	} else {
++open_file:
++		if (!name[0]) {
++			eprintk_ctx("empty name for file?\n");
++			err = -EINVAL;
++			goto err_out;
++		}
++		if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) &&
++		    (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL)
++			goto map_file;
++#ifdef CONFIG_INOTIFY_USER
++		if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) &&
++		    (file = rst_open_inotify(&fi, flags, ctx)) != NULL)
++			goto map_file;
++#else
++		if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) {
++			err = -EINVAL;
++			goto err_out;
++		}
++#endif
++		if (S_ISFIFO(fi.cpt_i_mode) &&
++		    (file = open_pipe(name, &fi, flags, ctx)) != NULL)
++			goto map_file;
++		if (!S_ISREG(fi.cpt_i_mode) &&
++		    (file = open_special(&fi, flags, 0, ctx)) != NULL)
++			goto map_file;
++	}
++
++	file = filp_open(name, flags, 0);
++
++map_file:
++	if (!IS_ERR(file)) {
++		fixup_file_flags(file, &fi, was_dentry_open, pos, ctx);
++
++		if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) {
++			err = fixup_pipe_data(file, &fi, ctx);
++			if (err)
++				goto err_put;
++		}
++
++		/* This is a very special hack. Logically, cwd/root are
++		 * nothing but open directories. Nevertheless, this causes
++		 * restore failures when the number of open files in a VE
++		 * is close to the limit. So, if this is an rst_file() of
++		 * cwd/root (fd = -2) and the directory is not deleted, we
++		 * skip adding the file to the object table. As long as the
++		 * directory is not unlinked, this cannot cause any problems.
++		 */
++		if (fd != -2 ||
++		    !S_ISDIR(file->f_dentry->d_inode->i_mode) ||
++		    (fi.cpt_lflags & CPT_DENTRY_DELETED)) {
++			obj = cpt_object_get(CPT_OBJ_FILE, file, ctx);
++			if (!obj) {
++				obj = cpt_object_add(CPT_OBJ_FILE, file, ctx);
++				if (obj)
++					get_file(file);
++			}
++			if (obj)
++				cpt_obj_setpos(obj, pos, ctx);
++
++			obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++			if (obj) {
++				cpt_obj_setpos(obj, fi.cpt_inode, ctx);
++				if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED))
++					obj->o_parent = file;
++			}
++		}
++
++		if (fi.cpt_next > fi.cpt_hdrlen) {
++			err = fixup_flocks(file, &fi, pos, ctx);
++			if (err)
++				goto err_put;
++		}
++	} else {
++		if (fi.cpt_lflags & CPT_DENTRY_PROC) {
++			dprintk_ctx("rst_file /proc delayed\n");
++			file = NULL;
++		} else if (name)
++			eprintk_ctx("can't open file %s\n", name);
++	}
++
++out:
++	if (name)
++		rst_put_name(name, ctx);
++	return file;
++
++err_put:
++	if (file)
++		fput(file);
++err_out:
++	if (name)
++		rst_put_name(name, ctx);
++	return ERR_PTR(err);
++}
++
++
++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	__u32 flag = 0;
++
++	if (ti->cpt_files == CPT_NULL ||
++	    lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx))
++		flag |= CLONE_FILES;
++	if (ti->cpt_fs == CPT_NULL ||
++	    lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx))
++		flag |= CLONE_FS;
++	return flag;
++}
++
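++/* Close every descriptor in the given files_struct so that the fd table
++ * can be repopulated from the image. */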
++static void local_close_files(struct files_struct * files)
++{
++	int i, j;
++
++	j = 0;
++	for (;;) {
++		unsigned long set;
++		i = j * __NFDBITS;
++		if (i >= files->fdt->max_fds)
++			break;
++		set = files->fdt->open_fds->fds_bits[j];
++		while (set) {
++			if (set & 1) {
++				struct file * file = xchg(&files->fdt->fd[i], NULL);
++				if (file)
++					filp_close(file, files);
++			}
++			i++;
++			set >>= 1;
++		}
++		files->fdt->open_fds->fds_bits[j] = 0;
++		files->fdt->close_on_exec->fds_bits[j] = 0;
++		j++;
++	}
++}
++
++extern int expand_fdtable(struct files_struct *files, int nr);
++
++
++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	struct cpt_files_struct_image fi;
++	struct files_struct *f = current->files;
++	cpt_object_t *obj;
++	loff_t pos, endpos;
++	int err;
++
++	if (ti->cpt_files == CPT_NULL) {
++		current->files = NULL;
++		if (f)
++			put_files_struct(f);
++		return 0;
++	}
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx);
++	if (obj) {
++		if (obj->o_obj != f) {
++			put_files_struct(f);
++			f = obj->o_obj;
++			atomic_inc(&f->count);
++			current->files = f;
++		}
++		return 0;
++	}
++
++	err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx);
++	if (err)
++		return err;
++
++	local_close_files(f);
++
++	if (fi.cpt_max_fds > f->fdt->max_fds) {
++		spin_lock(&f->file_lock);
++		err = expand_fdtable(f, fi.cpt_max_fds-1);
++		spin_unlock(&f->file_lock);
++		if (err < 0)
++			return err;
++	}
++
++	pos = ti->cpt_files + fi.cpt_hdrlen;
++	endpos = ti->cpt_files + fi.cpt_next;
++	while (pos < endpos) {
++		struct cpt_fd_image fdi;
++		struct file *filp;
++
++		err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx);
++		if (err)
++			return err;
++		filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
++		if (IS_ERR(filp)) {
++			eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp),
++				    (long long)fdi.cpt_file);
++			return PTR_ERR(filp);
++		}
++		if (filp == NULL) {
++			err = rst_filejob_queue(pos, ctx);
++			if (err)
++				return err;
++		} else {
++			BUG_ON(fdi.cpt_fd >= f->fdt->max_fds);
++			f->fdt->fd[fdi.cpt_fd] = filp;
++			FD_SET(fdi.cpt_fd, f->fdt->open_fds);
++			if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
++				FD_SET(fdi.cpt_fd, f->fdt->close_on_exec);
++		}
++		pos += fdi.cpt_next;
++	}
++	f->next_fd = fi.cpt_next_fd;
++
++	obj = cpt_object_add(CPT_OBJ_FILES, f, ctx);
++	if (obj) {
++		cpt_obj_setpos(obj, ti->cpt_files, ctx);
++		cpt_obj_setindex(obj, fi.cpt_index, ctx);
++	}
++	return 0;
++}
++
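++/* Open the files whose restore had to be delayed (rst_file() returned
++ * NULL, e.g. /proc entries) now that all tasks exist, and install them
++ * into the owning tasks' fd tables. */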
++int rst_do_filejobs(cpt_context_t *ctx)
++{
++	struct filejob *j;
++
++	while ((j = ctx->filejob_queue) != NULL) {
++		int err;
++		struct task_struct *tsk;
++		struct cpt_fd_image fdi;
++		struct file *filp;
++
++		read_lock(&tasklist_lock);
++		tsk = find_task_by_vpid(j->pid);
++		if (tsk)
++			get_task_struct(tsk);
++		read_unlock(&tasklist_lock);
++		if (!tsk)
++			return -EINVAL;
++
++		err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx);
++		if (err) {
++			put_task_struct(tsk);
++			return err;
++		}
++
++		BUG_ON(fdi.cpt_fd >= tsk->files->fdt->max_fds);
++		if (tsk->files->fdt->fd[fdi.cpt_fd] ||
++		    FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) {
++			eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi);
++			put_task_struct(tsk);
++			return -EBUSY;
++		}
++
++		filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
++		if (IS_ERR(filp)) {
++			eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file);
++			put_task_struct(tsk);
++			return PTR_ERR(filp);
++		}
++		BUG_ON(fdi.cpt_fd >= tsk->files->fdt->max_fds);
++		tsk->files->fdt->fd[fdi.cpt_fd] = filp;
++		FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds);
++		if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
++			FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec);
++
++		dprintk_ctx("filejob %Ld done\n", j->fdi);
++
++		put_task_struct(tsk);
++		ctx->filejob_queue = j->next;
++		kfree(j);
++	}
++	return 0;
++}
++
++void rst_flush_filejobs(cpt_context_t *ctx)
++{
++	struct filejob *j;
++
++	while ((j = ctx->filejob_queue) != NULL) {
++		ctx->filejob_queue = j->next;
++		kfree(j);
++	}
++}
++
++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	struct fs_struct *f = current->fs;
++	cpt_object_t *obj;
++
++	if (ti->cpt_fs == CPT_NULL) {
++		exit_fs(current);
++		return 0;
++	}
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx);
++	if (obj) {
++		if (obj->o_obj != f) {
++			exit_fs(current);
++			f = obj->o_obj;
++			atomic_inc(&f->count);
++			current->fs = f;
++		}
++		return 0;
++	}
++
++	/* Do _not_ restore root. The image contains absolute pathnames,
++	 * so we fix it up in the context of the rst process.
++	 */
++
++	obj = cpt_object_add(CPT_OBJ_FS, f, ctx);
++	if (obj)
++		cpt_obj_setpos(obj, ti->cpt_fs, ctx);
++
++	return 0;
++}
++
++int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp,
++		   loff_t *pos, struct cpt_context *ctx)
++{
++	struct cpt_file_image fi;
++	struct file * file;
++	int err;
++
++	err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx);
++	if (err)
++		return err;
++
++	file = rst_file(*pos, -2, ctx);
++	if (IS_ERR(file))
++		return PTR_ERR(file);
++
++	*dp = dget(file->f_dentry);
++	*mp = mntget(file->f_vfsmnt);
++	*pos += fi.cpt_next;
++	fput(file);
++	return 0;
++}
++
++static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
++			  struct dentry *dentry)
++{
++	struct dentry *old_root;
++	struct vfsmount *old_rootmnt;
++	write_lock(&fs->lock);
++	old_root = fs->root.dentry;
++	old_rootmnt = fs->root.mnt;
++	fs->root.mnt = mnt;
++	fs->root.dentry = dentry;
++	write_unlock(&fs->lock);
++	if (old_root) {
++		dput(old_root);
++		mntput(old_rootmnt);
++	}
++}
++
++static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
++			 struct dentry *dentry)
++{
++	struct dentry *old_pwd;
++	struct vfsmount *old_pwdmnt;
++
++	write_lock(&fs->lock);
++	old_pwd = fs->pwd.dentry;
++	old_pwdmnt = fs->pwd.mnt;
++	fs->pwd.mnt = mnt;
++	fs->pwd.dentry = dentry;
++	write_unlock(&fs->lock);
++
++	if (old_pwd) {
++		dput(old_pwd);
++		mntput(old_pwdmnt);
++	}
++}
++
++
++int rst_restore_fs(struct cpt_context *ctx)
++{
++	loff_t pos;
++	cpt_object_t *obj;
++	int err = 0;
++
++	for_each_object(obj, CPT_OBJ_FS) {
++		struct cpt_fs_struct_image fi;
++		struct fs_struct *fs = obj->o_obj;
++		int i;
++		struct dentry *d[3];
++		struct vfsmount *m[3];
++
++		err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx);
++		if (err)
++			return err;
++
++		fs->umask = fi.cpt_umask;
++
++		pos = obj->o_pos + fi.cpt_hdrlen;
++		d[0] = d[1] = d[2] = NULL;
++		m[0] = m[1] = m[2] = NULL;
++		i = 0;
++		while (pos < obj->o_pos + fi.cpt_next && i<3) {
++			err = cpt_get_dentry(d+i, m+i, &pos, ctx);
++			if (err) {
++				eprintk_ctx("cannot get_dir: %d\n", err);
++				for (--i; i >= 0; i--) {
++					if (d[i])
++						dput(d[i]);
++					if (m[i])
++						mntput(m[i]);
++				}
++				return err;
++			}
++			i++;
++		}
++		if (d[0])
++			__set_fs_root(fs, m[0], d[0]);
++		if (d[1])
++			__set_fs_pwd(fs, m[1], d[1]);
++		if (d[2]) {
++			struct dentry *olddentry;
++			struct vfsmount *oldmnt;
++			write_lock(&fs->lock);
++			oldmnt = fs->altroot.mnt;
++			olddentry = fs->altroot.dentry;
++			fs->altroot.mnt = m[2];
++			fs->altroot.dentry = d[2];
++			write_unlock(&fs->lock);
++
++			if (olddentry) {
++				dput(olddentry);
++				mntput(oldmnt);
++			}
++		}
++	}
++	return err;
++}
++
++int do_one_mount(char *mntpnt, char *mnttype, char *mntbind,
++		 unsigned long flags, unsigned long mnt_flags,
++		 struct cpt_context *ctx)
++{
++	int err;
++
++	if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0))
++		mntbind = NULL;
++
++	if (mntbind)
++		flags |= MS_BIND;
++	/* Join per-mountpoint flags with global flags */
++	if (mnt_flags & MNT_NOSUID)
++		flags |= MS_NOSUID;
++	if (mnt_flags & MNT_NODEV)
++		flags |= MS_NODEV;
++	if (mnt_flags & MNT_NOEXEC)
++		flags |= MS_NOEXEC;
++
++	err = sc_mount(mntbind, mntpnt, mnttype, flags);
++	if (err < 0) {
++		eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags);
++		return err;
++	}
++	return 0;
++}
++
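++/* Kernel thread body: wire the pipe onto stdin and /dev/null onto
++ * stdout/stderr, close all other descriptors and exec /bin/tar to
++ * unpack the saved tmpfs contents. */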
++static int undumptmpfs(void *arg)
++{
++	int i;
++	int *pfd = arg;
++	int fd1, fd2, err;
++	char *argv[] = { "tar", "x", "-C", "/", "-S", NULL };
++
++	if (pfd[0] != 0)
++		sc_dup2(pfd[0], 0);
++
++	set_fs(KERNEL_DS);
++	fd1 = sc_open("/dev/null", O_WRONLY, 0);
++	fd2 = sc_open("/dev/null", O_WRONLY, 0);
++try:
++	if (fd1 < 0 || fd2 < 0) {
++		if (fd1 == -ENOENT && fd2 == -ENOENT) {
++			err = sc_mknod("/dev/null", S_IFCHR|0666,
++					new_encode_dev((MEM_MAJOR<<MINORBITS)|3));
++			if (err < 0) {
++				eprintk("can't create /dev/null: %d\n", err);
++				module_put(THIS_MODULE);
++				return 255 << 8;
++			}
++			fd1 = sc_open("/dev/null", O_WRONLY, 0666);
++			fd2 = sc_open("/dev/null", O_WRONLY, 0666);
++			sc_unlink("/dev/null");
++			goto try;
++		}
++		eprintk("cannot open /dev/null for tar: %d %d\n", fd1, fd2);
++		module_put(THIS_MODULE);
++		return 255 << 8;
++	}
++	if (fd1 != 1)
++		sc_dup2(fd1, 1);
++	if (fd2 != 2)
++		sc_dup2(fd2, 2);
++
++	for (i = 3; i < current->files->fdt->max_fds; i++)
++		sc_close(i);
++
++	module_put(THIS_MODULE);
++
++	i = sc_execve("/bin/tar", argv, NULL);
++	eprintk("failed to exec /bin/tar: %d\n", i);
++	return 255 << 8;
++}
++
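++/* Stream the tar archive stored in the image into a pipe read by the
++ * undumptmpfs() thread, then reap the thread and check its exit code. */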
++static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx)
++{
++	int err;
++	int pfd[2];
++	struct file *f;
++	struct cpt_object_hdr v;
++	int n;
++	loff_t end;
++	int pid;
++	int status;
++	mm_segment_t oldfs;
++	sigset_t ignore, blocked;
++
++	err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
++	if (err < 0)
++		return err;
++
++	err = sc_pipe(pfd);
++	if (err < 0)
++		return err;
++	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
++	sigprocmask(SIG_BLOCK, &ignore, &blocked);
++	pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0);
++	if (err < 0) {
++		eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
++		goto out;
++	}
++	f = fget(pfd[1]);
++	sc_close(pfd[1]);
++	sc_close(pfd[0]);
++
++	ctx->file->f_pos = *pos + v.cpt_hdrlen;
++	end = *pos + v.cpt_next;
++	*pos += v.cpt_next;
++	do {
++		char buf[16];
++
++		n = end - ctx->file->f_pos;
++		if (n > sizeof(buf))
++			n = sizeof(buf);
++
++		if (ctx->read(buf, n, ctx))
++			break;
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		f->f_op->write(f, buf, n, &f->f_pos);
++		set_fs(oldfs);
++	} while (ctx->file->f_pos < end);
++
++	fput(f);
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if ((err = sc_waitx(pid, 0, &status)) < 0)
++		eprintk_ctx("wait4: %d\n", err);
++	else if ((status & 0x7f) == 0) {
++		err = (status & 0xff00) >> 8;
++		if (err != 0) {
++			eprintk_ctx("tar exited with %d\n", err);
++			err = -EINVAL;
++		}
++	} else {
++		eprintk_ctx("tar terminated\n");
++		err = -EINVAL;
++	}
++	set_fs(oldfs);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++
++	return err;
++
++out:
++	if (pfd[1] >= 0)
++		sc_close(pfd[1]);
++	if (pfd[0] >= 0)
++		sc_close(pfd[0]);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++	return err;
++}
++
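++/* Check that an external (CPT_MNT_EXT) mount point already exists in
++ * the current mount namespace with the expected filesystem type. */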
++int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx)
++{
++	struct mnt_namespace *n;
++	struct list_head *p;
++	struct vfsmount *t;
++	char *path, *path_buf;
++	int ret;
++
++	n = current->nsproxy->mnt_ns;
++	ret = -ENOENT;
++	path_buf = cpt_get_buf(ctx);
++	down_read(&namespace_sem);
++	list_for_each(p, &n->list) {
++		struct path pt;
++		t = list_entry(p, struct vfsmount, mnt_list);
++		pt.dentry = t->mnt_root;
++		pt.mnt = t;
++		path = d_path(&pt, path_buf, PAGE_SIZE);
++		if (IS_ERR(path))
++			continue;
++		if (!strcmp(path, mntpnt) &&
++		    !strcmp(t->mnt_sb->s_type->name, mnttype)) {
++			ret = 0;
++			break;
++		}
++	}
++	up_read(&namespace_sem);
++	__cpt_release_buf(ctx);
++	return ret;
++}
++
++int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	loff_t endpos;
++
++	endpos = pos + mi->cpt_next;
++	pos += mi->cpt_hdrlen;
++
++	while (pos < endpos) {
++		char *mntdev;
++		char *mntpnt;
++		char *mnttype;
++		char *mntbind;
++
++		mntdev = __rst_get_name(&pos, ctx);
++		mntpnt = __rst_get_name(&pos, ctx);
++		mnttype = __rst_get_name(&pos, ctx);
++		mntbind = NULL;
++		if (mi->cpt_mntflags & CPT_MNT_BIND)
++			mntbind = __rst_get_name(&pos, ctx);
++		err = -EINVAL;
++		if (mnttype && mntpnt) {
++			err = 0;
++			if (!(mi->cpt_mntflags & CPT_MNT_EXT) &&
++			    strcmp(mntpnt, "/")) {
++				err = do_one_mount(mntpnt, mnttype, mntbind,
++						   mi->cpt_flags,
++						   mi->cpt_mntflags, ctx);
++				if (!err &&
++				    strcmp(mnttype, "tmpfs") == 0 &&
++				    !(mi->cpt_mntflags & (CPT_MNT_BIND)))
++					    err = rst_restore_tmpfs(&pos, ctx);
++			} else if (mi->cpt_mntflags & CPT_MNT_EXT) {
++				err = check_ext_mount(mntpnt, mnttype, ctx);
++				if (err)
++					eprintk_ctx("mount point is missing: %s\n", mntpnt);
++			}
++		}
++		if (mntdev)
++			rst_put_name(mntdev, ctx);
++		if (mntpnt)
++			rst_put_name(mntpnt, ctx);
++		if (mnttype)
++			rst_put_name(mnttype, ctx);
++		if (mntbind)
++			rst_put_name(mntbind, ctx);
++		if (err)
++			return err;
++	}
++	return 0;
++}
++
++int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_vfsmount_image mi;
++
++	while (pos < endpos) {
++		err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx);
++		if (err)
++			return err;
++		err = restore_one_vfsmount(&mi, pos, ctx);
++		if (err)
++			return err;
++		pos += mi.cpt_next;
++	}
++	return 0;
++}
++
++int rst_root_namespace(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_NAMESPACE];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_object_hdr sbuf;
++	int done = 0;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx);
++		if (err)
++			return err;
++		if (done) {
++			eprintk_ctx("multiple namespaces are not supported\n");
++			break;
++		}
++		done++;
++		err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx);
++		if (err)
++			return err;
++		sec += sbuf.cpt_next;
++	}
++
++	return 0;
++}
++
++int rst_stray_files(struct cpt_context *ctx)
++{
++	int err = 0;
++	loff_t sec = ctx->sections[CPT_SECT_FILES];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		struct cpt_object_hdr sbuf;
++		cpt_object_t *obj;
++
++		err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx);
++		if (err)
++			break;
++
++		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx);
++		if (!obj) {
++			struct file *file;
++
++			dprintk_ctx("stray file %Ld\n", sec);
++
++			file = rst_sysv_shm_itself(sec, ctx);
++
++			if (IS_ERR(file)) {
++				eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file));
++				return PTR_ERR(file);
++			} else {
++				fput(file);
++			}
++		}
++		sec += sbuf.cpt_next;
++	}
++
++	return err;
++}
+diff --git a/kernel/cpt/rst_inotify.c b/kernel/cpt/rst_inotify.c
+new file mode 100644
+index 0000000..0dcaf47
+--- /dev/null
++++ b/kernel/cpt/rst_inotify.c
+@@ -0,0 +1,196 @@
++/*
++ *
++ *  kernel/cpt/rst_inotify.c
++ *
++ *  Copyright (C) 2000-2007  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mnt_namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/inotify.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++extern struct file_operations inotify_fops;
++
++struct file *rst_open_inotify(struct cpt_file_image *fi,
++			      unsigned flags,
++			      struct cpt_context *ctx)
++{
++	struct file *file;
++	int fd;
++
++	fd = sys_inotify_init();
++	if (fd < 0)
++		return ERR_PTR(fd);
++
++	file = fget(fd);
++	sys_close(fd);
++	return file;
++}
++
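++/* Recreate watches with their original descriptors by temporarily
++ * forcing ih->last_wd, then replay the queued events through the
++ * handle_event() callback. */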
++static int restore_one_inotify(cpt_object_t *obj,
++			       loff_t pos,
++			       struct cpt_inotify_image *ibuf,
++			       cpt_context_t *ctx)
++{
++	int err = 0;
++	loff_t endpos;
++	struct file *file = obj->o_obj;
++	struct inotify_device *dev;
++
++	if (file->f_op != &inotify_fops) {
++		eprintk_ctx("bad inotify file\n");
++		return -EINVAL;
++	}
++
++	dev = file->private_data;
++
++	if (unlikely(dev == NULL)) {
++		eprintk_ctx("bad inotify device\n");
++		return -EINVAL;
++	}
++
++	endpos = pos + ibuf->cpt_next;
++	pos += ibuf->cpt_hdrlen;
++	while (pos < endpos) {
++		union {
++			struct cpt_inotify_wd_image wi;
++			struct cpt_inotify_ev_image ei;
++		} u;
++
++		err = rst_get_object(-1, pos, &u, ctx);
++		if (err) {
++			eprintk_ctx("rst_get_object: %d\n", err);
++			return err;
++		}
++		if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) {
++			struct path p;
++			loff_t fpos = pos + u.wi.cpt_hdrlen;
++
++			err = cpt_get_dentry(&p.dentry, &p.mnt, &fpos, ctx);
++			if (err) {
++				eprintk_ctx("cpt_get_dentry: %d\n", err);
++				return err;
++			}
++
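++			/* Watches must come back with their original
++			 * descriptors. inotify hands out last_wd+1, so
++			 * rewind last_wd temporarily and verify that the
++			 * newly created watch got the saved number. */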
++			mutex_lock(&dev->up_mutex);
++			dev->ih->last_wd = u.wi.cpt_wd - 1;
++			err = inotify_create_watch(dev, &p, u.wi.cpt_mask);
++			dev->ih->last_wd = ibuf->cpt_last_wd;
++			if (err != u.wi.cpt_wd) {
++				eprintk_ctx("wrong inotify descriptor %u %u\n", err, u.wi.cpt_wd);
++				if (err >= 0)
++					err = -EINVAL;
++			} else
++				err = 0;
++			mutex_unlock(&dev->up_mutex);
++			path_put(&p);
++			if (err)
++				break;
++		} else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) {
++			struct inotify_user_watch dummy_watch;
++			struct inotify_watch *w;
++			char *name = NULL;
++
++			if (u.ei.cpt_namelen) {
++				name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL);
++				if (name == NULL) {
++					err = -ENOMEM;
++					break;
++				}
++				name[u.ei.cpt_namelen] = 0;
++				err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen);
++				if (err) {
++					kfree(name);
++					break;
++				}
++			}
++
++			w = &dummy_watch.wdata;
++			dummy_watch.dev = dev;
++			atomic_set(&w->count, 2);
++
++			/* Trick to avoid destruction due to exit event */
++			if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT))
++				atomic_inc(&w->count);
++			dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask,
++						      u.ei.cpt_cookie, name, NULL);
++			if (name)
++				kfree(name);
++		} else {
++			eprintk_ctx("bad object: %u\n", u.wi.cpt_object);
++			err = -EINVAL;
++			break;
++		}
++		pos += u.wi.cpt_next;
++	}
++	return err;
++}
++
++int rst_inotify(cpt_context_t *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_INOTIFY];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		cpt_object_t *obj;
++		struct cpt_inotify_image ibuf;
++
++		err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx);
++		if (err)
++			return err;
++		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx);
++		if (obj == NULL) {
++			eprintk_ctx("cannot find inotify file object\n");
++			return -EINVAL;
++		}
++		err = restore_one_inotify(obj, sec, &ibuf, ctx);
++		if (err)
++			return err;
++		sec += ibuf.cpt_next;
++	}
++
++	return 0;
++}
+diff --git a/kernel/cpt/rst_mm.c b/kernel/cpt/rst_mm.c
+new file mode 100644
+index 0000000..380b382
+--- /dev/null
++++ b/kernel/cpt/rst_mm.c
+@@ -0,0 +1,1145 @@
++/*
++ *
++ *  kernel/cpt/rst_mm.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/virtinfo.h>
++#include <linux/virtinfoscp.h>
++#include <linux/hugetlb.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/mman.h>
++#include <linux/vmalloc.h>
++#include <linux/rmap.h>
++#include <linux/hash.h>
++#include <asm/pgalloc.h>
++#include <asm/tlb.h>
++#include <asm/tlbflush.h>
++#include <asm/pgtable.h>
++#include <asm/mmu.h>
++#ifdef CONFIG_X86
++#include <asm/ldt.h>
++#include <asm/desc.h>
++#endif
++#include <asm/mmu_context.h>
++#include <asm/vsyscall.h>
++#include <linux/swapops.h>
++#include <linux/cpt_image.h>
++
++#ifdef CONFIG_VE
++#include <bc/beancounter.h>
++#include <bc/vmpages.h>
++#endif
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_ubc.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++#include "cpt_pagein.h"
++#endif
++
++#include "cpt_syscalls.h"
++
++#define __PAGE_NX (1ULL<<63)
++
++static unsigned long make_prot(struct cpt_vma_image *vmai)
++{
++	unsigned long prot = 0;
++
++	if (vmai->cpt_flags&VM_READ)
++		prot |= PROT_READ;
++	if (vmai->cpt_flags&VM_WRITE)
++		prot |= PROT_WRITE;
++	if (vmai->cpt_flags&VM_EXEC)
++		prot |= PROT_EXEC;
++	if (vmai->cpt_flags&VM_GROWSDOWN)
++		prot |= PROT_GROWSDOWN;
++	if (vmai->cpt_flags&VM_GROWSUP)
++		prot |= PROT_GROWSUP;
++	return prot;
++}
++
++static unsigned long make_flags(struct cpt_vma_image *vmai)
++{
++	unsigned long flags = MAP_FIXED;
++
++	if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE))
++		flags |= MAP_SHARED;
++	else
++		flags |= MAP_PRIVATE;
++
++	if (vmai->cpt_file == CPT_NULL)
++		flags |= MAP_ANONYMOUS;
++	if (vmai->cpt_flags&VM_GROWSDOWN)
++		flags |= MAP_GROWSDOWN;
++#ifdef MAP_GROWSUP
++	if (vmai->cpt_flags&VM_GROWSUP)
++		flags |= MAP_GROWSUP;
++#endif
++	if (vmai->cpt_flags&VM_DENYWRITE)
++		flags |= MAP_DENYWRITE;
++	if (vmai->cpt_flags&VM_EXECUTABLE)
++		flags |= MAP_EXECUTABLE;
++	if (!(vmai->cpt_flags&VM_ACCOUNT))
++		flags |= MAP_NORESERVE;
++	return flags;
++}
++
++#ifdef CONFIG_X86
++#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \
++				&& !defined(CONFIG_XEN)
++static int __alloc_ldt(mm_context_t *pc, int mincount)
++{
++	int oldsize, newsize, nr;
++
++	if (mincount <= pc->size)
++		return 0;
++	/*
++	 * LDT got larger - reallocate if necessary.
++	 */
++	oldsize = pc->size;
++	mincount = (mincount+511)&(~511);
++	newsize = mincount*LDT_ENTRY_SIZE;
++	for (nr = 0; nr * PAGE_SIZE < newsize; nr++) {
++		BUG_ON(nr * PAGE_SIZE >= 64*1024);
++		if (!pc->ldt_pages[nr]) {
++			pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC);
++			if (!pc->ldt_pages[nr])
++				goto nomem;
++			clear_highpage(pc->ldt_pages[nr]);
++		}
++	}
++	pc->size = mincount;
++	return 0;
++
++nomem:
++	while (--nr >= 0)
++		__free_page(pc->ldt_pages[nr]);
++	pc->size = 0;
++	return -ENOMEM;
++}
++
++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
++{
++	struct mm_struct *mm = current->mm;
++	int i;
++	int err;
++	int size;
++
++	err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE);
++	if (err)
++		return err;
++
++	size = mm->context.size*LDT_ENTRY_SIZE;
++
++	for (i = 0; i < size; i += PAGE_SIZE) {
++		int nr = i / PAGE_SIZE, bytes;
++		char *kaddr = kmap(mm->context.ldt_pages[nr]);
++
++		bytes = size - i;
++		if (bytes > PAGE_SIZE)
++			bytes = PAGE_SIZE;
++		err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i);
++		kunmap(mm->context.ldt_pages[nr]);
++		if (err)
++			return err;
++	}
++
++	load_LDT(&mm->context);
++	return 0;
++}
++
++#else
++
++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
++{
++	struct mm_struct *mm = current->mm;
++	int oldsize = mm->context.size;
++	void *oldldt;
++	void *newldt;
++	int err;
++
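++	/* Read the saved LDT into a fresh buffer (vmalloc for more than
++	 * a page, kmalloc otherwise, mirroring the kernel's own LDT
++	 * allocation), then switch the mm context over to it. */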
++	if (li->cpt_size > PAGE_SIZE)
++		newldt = vmalloc(li->cpt_size);
++	else
++		newldt = kmalloc(li->cpt_size, GFP_KERNEL);
++
++	if (!newldt)
++		return -ENOMEM;
++
++	err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen);
++	if (err) {
++		/* Do not leak the buffer allocated above. */
++		if (li->cpt_size > PAGE_SIZE)
++			vfree(newldt);
++		else
++			kfree(newldt);
++		return err;
++	}
++
++	oldldt = mm->context.ldt;
++	mm->context.ldt = newldt;
++	mm->context.size = li->cpt_size/LDT_ENTRY_SIZE;
++
++	load_LDT(&mm->context);
++
++	if (oldsize) {
++		if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
++			vfree(oldldt);
++		else
++			kfree(oldldt);
++	}
++	return 0;
++}
++#endif
++#endif
++
++static int
++restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg)
++{
++	struct aio_ring_info *info = &aio_ctx->ring_info;
++	unsigned nr_events = aio_ctx->max_reqs;
++	unsigned long size;
++	int nr_pages;
++
++	/* We recalculate the ring parameters exactly the way fs/aio.c
++	 * does and then compare the calculated values with the ones
++	 * stored in the dump. They must be the same. */
++
++	nr_events += 2;
++
++	size = sizeof(struct aio_ring);
++	size += sizeof(struct io_event) * nr_events;
++	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++	if (nr_pages != aimg->cpt_ring_pages)
++		return -EINVAL;
++
++	info->nr_pages = nr_pages;
++
++	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
++
++	if (nr_events != aimg->cpt_nr)
++		return -EINVAL;
++
++	info->nr = 0;
++	info->ring_pages = info->internal_pages;
++	if (nr_pages > AIO_RING_PAGES) {
++		info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
++		if (!info->ring_pages)
++			return -ENOMEM;
++		memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
++	}
++
++	info->mmap_size = nr_pages * PAGE_SIZE;
++
++	/* This oddity is not entirely our fault. Kernel aio.c does
++	 * something strange: it mmap()s some pages and then pins them,
++	 * apparently a leftover of a failed attempt to expose the ring
++	 * to user space. The result is odd: immediately after the AIO
++	 * context is created, the kernel shares those pages with the
++	 * user, who can read and even write them. But after the first
++	 * fork the pages are marked COW, with the evident consequences.
++	 * (The first version of the mmapped packet socket made the same
++	 * mistake; luckily it never reached mainstream.)
++	 *
++	 * So, what are we going to do? We could simulate this odd
++	 * behaviour exactly, but it is not worth the complexity. For now
++	 * just take the pages from user space. Alternatively, we could
++	 * keep a kernel copy in the AIO context image, which would be
++	 * more correct.
++	 *
++	 * What is still wrong: if the pages have been COWed, the ring
++	 * is transferred incorrectly.
++	 */
++	down_read(&current->mm->mmap_sem);
++	info->mmap_base = aimg->cpt_mmap_base;
++	info->nr_pages = get_user_pages(current, current->mm,
++					info->mmap_base, nr_pages, 
++					1, 0, info->ring_pages, NULL);
++	up_read(&current->mm->mmap_sem);
++
++	if (unlikely(info->nr_pages != nr_pages)) {
++		int i;
++
++		for (i=0; i<info->nr_pages; i++)
++			put_page(info->ring_pages[i]);
++		if (info->ring_pages && info->ring_pages != info->internal_pages)
++			kfree(info->ring_pages);
++		return -EFAULT;
++	}
++
++	aio_ctx->user_id = info->mmap_base;
++
++	info->nr = nr_events;
++	info->tail = aimg->cpt_tail;
++
++	return 0;
++}
++
++static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
++{
++	int err;
++	struct kioctx *aio_ctx;
++	extern spinlock_t aio_nr_lock;
++
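++	/* The kioctx is rebuilt by hand rather than via sys_io_setup(),
++	 * because the ring must reuse the pages already mapped at the
++	 * address saved in the image. */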
++	aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
++	if (!aio_ctx)
++		return -ENOMEM;
++
++	memset(aio_ctx, 0, sizeof(*aio_ctx));
++	aio_ctx->max_reqs = aimg->cpt_max_reqs;
++
++	if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
++		kmem_cache_free(kioctx_cachep, aio_ctx);
++		eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
++		return err;
++	}
++
++	aio_ctx->mm = current->mm;
++	atomic_inc(&aio_ctx->mm->mm_count);
++	atomic_set(&aio_ctx->users, 1);
++	spin_lock_init(&aio_ctx->ctx_lock);
++	spin_lock_init(&aio_ctx->ring_info.ring_lock);
++	init_waitqueue_head(&aio_ctx->wait);
++	INIT_LIST_HEAD(&aio_ctx->active_reqs);
++	INIT_LIST_HEAD(&aio_ctx->run_list);
++	INIT_WORK(&aio_ctx->wq.work, aio_kick_handler);
++
++	spin_lock(&aio_nr_lock);
++	aio_nr += aio_ctx->max_reqs;
++	spin_unlock(&aio_nr_lock);
++
++	write_lock(&aio_ctx->mm->ioctx_list_lock);
++	aio_ctx->next = aio_ctx->mm->ioctx_list;
++	aio_ctx->mm->ioctx_list = aio_ctx;
++	write_unlock(&aio_ctx->mm->ioctx_list_lock);
++
++	return 0;
++}
++
++struct anonvma_map
++{
++	struct hlist_node	list;
++	struct anon_vma		*avma;
++	__u64			id;
++};
++
++static int verify_create_anonvma(struct mm_struct *mm,
++				 struct cpt_vma_image *vmai,
++				 cpt_context_t *ctx)
++{
++	struct anon_vma *avma = NULL;
++	struct anon_vma *new_avma;
++	struct vm_area_struct *vma;
++	int h;
++
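++	/* ctx->anonvmas is a lazily allocated hash table mapping the
++	 * anon_vma id recorded in the image to the anon_vma recreated
++	 * here, so that VMAs which shared an anon_vma before the dump
++	 * share one again after restore. */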
++	if (!ctx->anonvmas) {
++		if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
++			return -EINVAL;
++		if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
++			return -ENOMEM;
++		for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
++			INIT_HLIST_HEAD(&ctx->anonvmas[h]);
++	} else {
++		struct anonvma_map *map;
++		struct hlist_node *elem;
++
++		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
++		hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
++			if (map->id == vmai->cpt_anonvmaid) {
++				avma = map->avma;
++				break;
++			}
++		}
++	}
++
++	down_read(&mm->mmap_sem);
++	if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
++		up_read(&mm->mmap_sem);
++		return -ESRCH;
++	}
++	if (vma->vm_start != vmai->cpt_start) {
++		up_read(&mm->mmap_sem);
++		eprintk_ctx("vma start mismatch\n");
++		return -EINVAL;
++	}
++	if (vma->vm_pgoff != vmai->cpt_pgoff) { 
++		dprintk_ctx("vma pgoff mismatch, fixing\n");
++		if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
++			eprintk_ctx("cannot fixup vma pgoff\n");
++			up_read(&mm->mmap_sem);	
++			return -EINVAL;
++		}
++		vma->vm_pgoff = vmai->cpt_pgoff;
++	}
++
++	if (!vma->anon_vma) {
++		if (avma) {
++			vma->anon_vma = avma;
++			anon_vma_link(vma);
++		} else {
++			int err;
++
++			err = anon_vma_prepare(vma);
++
++			if (err) {
++				up_read(&mm->mmap_sem);
++				return err;
++			}
++		}
++	} else {
++		/* Note, we _can_ arrive at a situation where two different
++		 * anonvmaid's point to one anon_vma; this happens e.g. when
++		 * mmap() merged a new area into the previous one, so they
++		 * share one anon_vma even though they did not on the
++		 * original host.
++		 *
++		 * THIS IS OK. As far as we understand, all the anon_vmas
++		 * may be merged, and rmap will simply scan a huge list of
++		 * vmas searching for a page. It is just "suboptimal".
++		 *
++		 * A real disaster would happen if the vma had already got
++		 * an anon_vma with a different id. It is a very rare case:
++		 * the kernel tries hard to merge anon_vmas even when some
++		 * attributes differ. In that case we fall back to copying
++		 * memory. */
++		if (avma && vma->anon_vma != avma) {
++			up_read(&mm->mmap_sem);
++			wprintk_ctx("anon_vma mismatch\n");
++			return 0;
++		}
++	}
++
++	new_avma = vma->anon_vma;
++	up_read(&mm->mmap_sem);
++
++	if (!avma) {
++		struct anonvma_map *map;
++
++		if (!new_avma)
++			return -EINVAL;
++
++		if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL)
++			return -ENOMEM;
++
++		map->id = vmai->cpt_anonvmaid;
++		map->avma = new_avma;
++		h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
++		hlist_add_head(&map->list, &ctx->anonvmas[h]);
++	}
++	return 0;
++}
++
++static int copy_mm_pages(struct mm_struct *src, unsigned long start,
++			 unsigned long end)
++{
++	int err;
++
++	for (; start < end; start += PAGE_SIZE) {
++		struct page *page;
++		struct page *spage;
++		void *maddr, *srcaddr;
++
++		err = get_user_pages(current, current->mm,
++				     start, 1, 1, 1, &page, NULL);
++		if (err == 0)
++			err = -EFAULT;
++		if (err < 0)
++			return err;
++
++		err = get_user_pages(current, src,
++				     start, 1, 0, 1, &spage, NULL);
++
++		if (err == 0)
++			err = -EFAULT;
++		if (err < 0) {
++			page_cache_release(page);
++			return err;
++		}
++
++		srcaddr = kmap(spage);
++		maddr = kmap(page);
++		memcpy(maddr, srcaddr, PAGE_SIZE);
++		set_page_dirty_lock(page);
++		kunmap(page);
++		kunmap(spage);
++		page_cache_release(page);
++		page_cache_release(spage);
++	}
++	return 0;
++}
++
++static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx)
++{
++	int err = 0;
++	unsigned long addr;
++	struct mm_struct *mm = current->mm;
++	struct vm_area_struct *vma;
++	struct file *file = NULL;
++	unsigned long prot;
++	int checked = 0;
++
++	if (vmai->cpt_type == CPT_VMA_VDSO) {
++		if (ctx->vdso == NULL) {
++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
++			err = arch_setup_additional_pages(NULL, 0,
++					vmai->cpt_start);
++#endif
++			goto out;
++		}
++	}
++
++	prot = make_prot(vmai);
++
++	if (vmai->cpt_file != CPT_NULL) {
++		if (vmai->cpt_type == CPT_VMA_TYPE_0) {
++			file = rst_file(vmai->cpt_file, -1, ctx);
++			if (IS_ERR(file)) {
++				eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file);
++				return PTR_ERR(file);
++			}
++		} else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) {
++			file = rst_sysv_shm_vma(vmai, ctx);
++			if (IS_ERR(file))
++				return PTR_ERR(file);
++		}
++	}
++
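++	/* Map the area at exactly the saved address: make_flags() always
++	 * sets MAP_FIXED and the old address space was emptied before the
++	 * VMAs are replayed, so any mismatch indicates a real failure. */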
++	down_write(&mm->mmap_sem);
++	addr = do_mmap_pgoff(file, vmai->cpt_start,
++			     vmai->cpt_end-vmai->cpt_start,
++			     prot, make_flags(vmai),
++			     vmai->cpt_pgoff);
++
++	if (addr != vmai->cpt_start) {
++		up_write(&mm->mmap_sem);
++
++		err = -EINVAL;
++		if (IS_ERR((void*)addr))
++			err = addr;
++		goto out;
++	}
++
++	vma = find_vma(mm, vmai->cpt_start);
++	if (vma == NULL) {
++		up_write(&mm->mmap_sem);
++		eprintk_ctx("cannot find mmapped vma\n");
++		err = -ESRCH;
++		goto out;
++	}
++
++	/* do_mmap_pgoff() can merge the new area into the previous one (not
++	 * into the next one: we mmap in order, so the rest of the mm is still
++	 * unmapped). This can happen e.g. if flags are to be adjusted later,
++	 * or if two adjacent regions had different anon_vmas. Split it by
++	 * brute force. */
++	if (vma->vm_start != vmai->cpt_start) {
++		dprintk_ctx("vma %Ld merged, split\n", vmapos);
++		err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
++		if (err) {
++			up_write(&mm->mmap_sem);
++			eprintk_ctx("cannot split vma\n");
++			goto out;
++		}
++	}
++	up_write(&mm->mmap_sem);
++
++	if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) {
++		err = verify_create_anonvma(mm, vmai, ctx);
++		if (err) {
++			eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos);
++			goto out;
++		}
++	}
++
++	if (vmai->cpt_type == CPT_VMA_VDSO) {
++		struct page *page;
++		void *maddr;
++
++		err = get_user_pages(current, current->mm,
++				(unsigned long)vmai->cpt_start,
++				1, 1, 1, &page, NULL);
++		if (err == 0)
++			err = -EFAULT;
++		if (err < 0) {
++			eprintk_ctx("can't get vdso: get_user_pages: %d\n", err);
++			goto out;
++		}
++		err = 0;
++		maddr = kmap(page);
++		memcpy(maddr, ctx->vdso, PAGE_SIZE);
++		set_page_dirty_lock(page);
++		kunmap(page);
++		page_cache_release(page);
++		goto out;
++	}
++
++	if (vmai->cpt_next > vmai->cpt_hdrlen) {
++		loff_t offset = vmapos + vmai->cpt_hdrlen;
++
++		do {
++			union {
++				struct cpt_page_block pb;
++				struct cpt_remappage_block rpb;
++				struct cpt_copypage_block cpb;
++				struct cpt_lazypage_block lpb;
++				struct cpt_iterpage_block ipb;
++			} u;
++			loff_t pos;
++
++			err = rst_get_object(-1, offset, &u, ctx);
++			if (err) {
++				eprintk_ctx("vma fix object: %d\n", err);
++				goto out;
++			}
++			if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) {
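++				/* Non-linear mapping: re-establish the saved
++				 * file offsets with remap_file_pages(). */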
++				err = sc_remap_file_pages(u.rpb.cpt_start,
++							  u.rpb.cpt_end-u.rpb.cpt_start,
++							  0, u.rpb.cpt_pgoff, 0);
++				if (err < 0) {
++					eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err,
++					       (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), 
++					       (__u32)u.rpb.cpt_pgoff);
++					goto out;
++				}
++				offset += u.rpb.cpt_next;
++				continue;
++			} else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) {
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++				unsigned long ptr = u.lpb.cpt_start;
++
++				down_read(&mm->mmap_sem);
++				if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
++					up_read(&mm->mmap_sem);
++					eprintk_ctx("lost vm_area_struct\n");
++					err = -ESRCH;
++					goto out;
++				}
++				err = anon_vma_prepare(vma);
++				if (err) {
++					up_read(&mm->mmap_sem);
++					goto out;
++				}
++				while (ptr < u.lpb.cpt_end) {
++					err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE,
++							 ptr, ctx);
++					if (err)
++						break;
++					ptr += PAGE_SIZE;
++				}
++				up_read(&mm->mmap_sem);
++#else
++				err = -EINVAL;
++#endif
++				if (err)
++					goto out;
++				offset += u.cpb.cpt_next;
++				continue;
++			} else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) {
++				struct vm_area_struct *vma, *vma1;
++				struct mm_struct *src;
++				struct anon_vma *src_anon;
++				cpt_object_t *mobj;
++
++				if (!vmai->cpt_anonvmaid) {
++					err = -EINVAL;
++					eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n");
++					goto out;
++				}
++
++				mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx);
++				if (!mobj) {
++					eprintk_ctx("lost mm_struct to clone pages from\n");
++					err = -ESRCH;
++					goto out;
++				}
++				src = mobj->o_obj;
++
++				down_read(&src->mmap_sem);
++				src_anon = NULL;
++				vma1 = find_vma(src, u.cpb.cpt_start);
++				if (vma1)
++					src_anon = vma1->anon_vma;
++				up_read(&src->mmap_sem);
++
++				if (!vma1) {
++					eprintk_ctx("lost src vm_area_struct\n");
++					err = -ESRCH;
++					goto out;
++				}
++
++				down_read(&mm->mmap_sem);
++				if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) {
++					up_read(&mm->mmap_sem);
++					eprintk_ctx("lost vm_area_struct\n");
++					err = -ESRCH;
++					goto out;
++				}
++
++				if (!src_anon ||
++				    !vma->anon_vma ||
++				    vma->anon_vma != src_anon ||
++				    vma->vm_start - vma1->vm_start !=
++				    (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) {
++					up_read(&mm->mmap_sem);
++					wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos);
++					err = copy_mm_pages(mobj->o_obj,
++							    u.cpb.cpt_start,
++							    u.cpb.cpt_end);
++				} else {
++					err = __copy_page_range(vma, vma1,
++								u.cpb.cpt_start,
++								u.cpb.cpt_end-u.cpb.cpt_start);
++					up_read(&mm->mmap_sem);
++				}
++				if (err) {
++					eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err,
++						(__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), 
++						(long)u.cpb.cpt_source);
++					goto out;
++				}
++
++				offset += u.cpb.cpt_next;
++				continue;
++			} else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES ||
++				   u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES
++				   ) {
++#ifdef CONFIG_VZ_CHECKPOINT_ITER
++				unsigned long ptr = u.lpb.cpt_start;
++				u64 page_pos[16];
++				pos = offset + sizeof(u.pb);
++
++				err = ctx->pread(&page_pos,
++						 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE,
++						 ctx,
++						 pos);
++				if (err) {
++					eprintk_ctx("Oops\n");
++					goto out;
++				}
++
++				down_read(&mm->mmap_sem);
++				if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
++					up_read(&mm->mmap_sem);
++					eprintk_ctx("lost vm_area_struct\n");
++					err = -ESRCH;
++					goto out;
++				}
++				err = anon_vma_prepare(vma);
++				if (err) {
++					up_read(&mm->mmap_sem);
++					goto out;
++				}
++				while (ptr < u.lpb.cpt_end) {
++					err = rst_iter(vma,
++						       page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE],
++						       ptr,
++						       ctx);
++					if (err)
++						break;
++					ptr += PAGE_SIZE;
++				}
++				if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) {
++					make_pages_present((unsigned long)u.lpb.cpt_start,
++							   (unsigned long)u.lpb.cpt_end);
++				}
++				up_read(&mm->mmap_sem);
++#else
++				err = -EINVAL;
++#endif
++				if (err)
++					goto out;
++				offset += u.cpb.cpt_next;
++				continue;
++			}
++			if (u.pb.cpt_object != CPT_OBJ_PAGES) {
++				eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object);
++				err = -EINVAL;
++				goto out;
++			}
++			pos = offset + sizeof(u.pb);
++			if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) {
++				/* Presumably get_user_pages() messed things up;
++				 * this happens e.g. when gdb inserts breakpoints.
++				 */
++				int i;
++				for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) {
++					struct page *page;
++					void *maddr;
++					err = get_user_pages(current, current->mm,
++							     (unsigned long)u.pb.cpt_start + i*PAGE_SIZE,
++							     1, 1, 1, &page, NULL);
++					if (err == 0)
++						err = -EFAULT;
++					if (err < 0) {
++						eprintk_ctx("get_user_pages: %d\n", err);
++						goto out;
++					}
++					err = 0;
++					maddr = kmap(page);
++					if (u.pb.cpt_content == CPT_CONTENT_VOID) {
++						memset(maddr, 0, PAGE_SIZE);
++					} else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
++						err = ctx->pread(maddr, PAGE_SIZE,
++								 ctx, pos + i*PAGE_SIZE);
++						if (err) {
++							kunmap(page);
++							goto out;
++						}
++					} else {
++						err = -EINVAL;
++						kunmap(page);
++						goto out;
++					}
++					set_page_dirty_lock(page);
++					kunmap(page);
++					page_cache_release(page);
++				}
++			} else {
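++				/* Common case: temporarily grant PROT_WRITE,
++				 * then fill the mapping directly from user
++				 * context (zeroes for VOID content, the dump
++				 * file's bytes for DATA), and finally restore
++				 * the original protection. */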
++				if (!(prot&PROT_WRITE))
++					sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
++				if (u.pb.cpt_content == CPT_CONTENT_VOID) {
++					int i;
++					for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) {
++						err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i);
++						if (err) {
++							eprintk_ctx("__put_user 2 %d\n", err);
++							goto out;
++						}
++					}
++				} else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
++					loff_t tpos = pos;
++					err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start),
++							 u.pb.cpt_end-u.pb.cpt_start,
++							 &tpos);
++					if (err != u.pb.cpt_end-u.pb.cpt_start) {
++						if (err >= 0)
++							err = -EIO;
++						goto out;
++					}
++				} else {
++					err = -EINVAL;
++					goto out;
++				}
++				if (!(prot&PROT_WRITE))
++					sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
++			}
++			err = 0;
++			offset += u.pb.cpt_next;
++		} while (offset < vmapos + vmai->cpt_next);
++	}
++
++check:
++	do {
++		struct vm_area_struct *vma;
++		down_read(&mm->mmap_sem);
++		vma = find_vma(mm, addr);
++		if (vma) {
++			if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) {
++				VM_ClearReadHint(vma);
++				vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK;
++			}
++			if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) {
++				dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos);
++				up_read(&mm->mmap_sem);
++				if (vma->vm_flags&VM_LOCKED)
++					err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
++				else
++					err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
++				/* When mlock fails with EFAULT, it means
++				 * that it could not bring in pages.
++				 * This can happen after mlock() on unreadable
++				 * VMAs. But the VMA is correctly locked,
++				 * so the error can be ignored. */
++				if (err == -EFAULT)
++					err = 0;
++				if (err)
++					goto out;
++				goto check;
++			}
++			if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX)
++				wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
++					    (unsigned long long)vma->vm_page_prot.pgprot,
++					    (unsigned long long)vmai->cpt_pgprot);
++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++			if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) &&
++			    (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE))
++				wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
++				       (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot);
++#endif
++			if (vma->vm_flags != vmai->cpt_flags) {
++				unsigned long x = vma->vm_flags ^ vmai->cpt_flags;
++				if (x & VM_EXEC) {
++					/* On i386 this is OK: such a state is
++					 * impossible to recreate via
++					 * mmap/mprotect, since exec.c clears
++					 * VM_EXEC on the stack VMA. */
++					vma->vm_flags &= ~VM_EXEC;
++				} else if ((x & VM_ACCOUNT) && !checked) {
++					checked = 1;
++					if (!(prot&PROT_WRITE)) {
++						up_read(&mm->mmap_sem);
++						sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
++						sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
++						goto check;
++					}
++					wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
++					       (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
++				} else {
++					wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
++					       (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
++				}
++			}
++		} else {
++			wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos);
++		}
++		up_read(&mm->mmap_sem);
++	} while (0);
++
++out:
++	if (file)
++		fput(file);
++	return err;
++}
++
++#ifndef CONFIG_IA64
++#define TASK_UNMAP_START	0
++#else
++/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping
++ * used to accelerate speculative dereferences of a NULL pointer. */
++#define TASK_UNMAP_START	PAGE_SIZE
++#endif
++
++static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx)
++{
++	int err = 0;
++	unsigned int def_flags;
++	struct mm_struct *mm = current->mm;
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *bc;
++#endif
++
++	down_write(&mm->mmap_sem);
++	do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START);
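++	/* Drop the whole existing address space; the saved VMAs are then
++	 * replayed from the image in address order. */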
++
++#ifdef CONFIG_BEANCOUNTERS
++	/*
++	 * The MM beancounter is usually correct from fork time,
++	 * but not for init, for example.
++	 * Luckily, mm_ub can be changed for a completely empty MM.
++	 */
++	bc = rst_lookup_ubc(vmi->cpt_mmub, ctx);
++	err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc);
++	if (err & NOTIFY_FAIL) {
++		up_write(&mm->mmap_sem);
++		return -ECHRNG;
++	}
++	if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) {
++		struct user_beancounter *old_bc;
++
++		old_bc = mm->mm_ub;
++		mm->mm_ub = bc;
++		bc = old_bc;
++	}
++	err = 0;
++	put_beancounter(bc);
++#endif
++
++	mm->start_code = vmi->cpt_start_code;
++	mm->end_code = vmi->cpt_end_code;
++	mm->start_data = vmi->cpt_start_data;
++	mm->end_data = vmi->cpt_end_data;
++	mm->start_brk = vmi->cpt_start_brk;
++	mm->brk = vmi->cpt_brk;
++	mm->start_stack = vmi->cpt_start_stack;
++	mm->arg_start = vmi->cpt_start_arg;
++	mm->arg_end = vmi->cpt_end_arg;
++	mm->env_start = vmi->cpt_start_env;
++	mm->env_end = vmi->cpt_end_env;
++	mm->def_flags = 0;
++	def_flags = vmi->cpt_def_flags;
++
++	mm->flags = vmi->cpt_dumpable;
++	if (ctx->image_version < CPT_VERSION_24)
++		mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS;
++
++	mm->vps_dumpable = vmi->cpt_vps_dumpable;
++#ifndef CONFIG_IA64
++	if (ctx->image_version >= CPT_VERSION_9) {
++		mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso);
++		current_thread_info()->sysenter_return = 
++			VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN);
++	}
++#endif
++
++#if 0 /* def CONFIG_HUGETLB_PAGE*/
++/* NB: ? */
++	int used_hugetlb;
++#endif
++	up_write(&mm->mmap_sem);
++
++	if (vmi->cpt_next > vmi->cpt_hdrlen) {
++		loff_t offset = pos + vmi->cpt_hdrlen;
++		do {
++			union {
++				struct cpt_vma_image vmai;
++				struct cpt_aio_ctx_image aioi;
++				struct cpt_obj_bits bits;
++			} u;
++			err = rst_get_object(-1, offset, &u, ctx);
++			if (err)
++				goto out;
++			if (u.vmai.cpt_object == CPT_OBJ_VMA) {
++#ifdef CONFIG_IA64
++				/* Later... */
++				if (u.vmai.cpt_start)
++#endif
++				err = do_rst_vma(&u.vmai, offset, pos, ctx);
++				if (err)
++					goto out;
++#ifdef CONFIG_X86
++			} else if (u.bits.cpt_object == CPT_OBJ_BITS &&
++				   u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) {
++				err = do_rst_ldt(&u.bits, offset, ctx);
++				if (err)
++					goto out;
++#endif
++			} else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) {
++				err = do_rst_aio(&u.aioi, offset, ctx);
++				if (err)
++					goto out;
++			} else {
++				eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object);
++				err = -EINVAL;
++				goto out;
++			}
++			offset += u.vmai.cpt_next;
++		} while (offset < pos + vmi->cpt_next);
++	}
++
++	down_write(&mm->mmap_sem);
++	mm->def_flags = def_flags;
++	up_write(&mm->mmap_sem);
++
++out:
++	return err;
++}
++
++extern void exit_mm(struct task_struct * tsk);
++
++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	int err = 0;
++	cpt_object_t *mobj;
++	void *tmp = (void*)__get_free_page(GFP_KERNEL);
++	struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp;
++
++	if (!tmp)
++		return -ENOMEM;
++
++	if (ti->cpt_mm == CPT_NULL) {
++		if (current->mm) {
++			virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT,
++					current);
++			exit_mm(current);
++		}
++		goto out;
++	}
++
++	mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
++	if (mobj) {
++		BUG_ON(current->mm != mobj->o_obj);
++		goto out;
++	}
++
++	if (current->mm == NULL) {
++		struct mm_struct *mm = mm_alloc();
++		if (mm == NULL) {
++			err = -ENOMEM;
++			goto out;
++		}
++		err = init_new_context(current, mm);
++		if (err) {
++			mmdrop(mm);
++			goto out;
++		}
++		current->mm = mm;
++	}
++
++	if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0)
++		goto out;
++	if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) {
++		eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm);
++		goto out;
++	}
++	err = -ENOMEM;
++	mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx);
++	if (mobj != NULL) {
++		err = 0;
++		cpt_obj_setpos(mobj, ti->cpt_mm, ctx);
++	}
++
++out:
++	if (tmp)
++		free_page((unsigned long)tmp);
++	return err;
++}
++
++/* This is part of mm setup, done in the parent context. Mostly, it is the
++ * place where we graft the mm of another process onto the child.
++ */
++
++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	struct task_struct *tsk = obj->o_obj;
++	cpt_object_t *mobj;
++
++	/* Task without an mm. Just get rid of it. */
++	if (ti->cpt_mm == CPT_NULL) {
++		if (tsk->mm) {
++			virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT,
++					tsk);
++			mmput(tsk->mm);
++			tsk->mm = NULL;
++		}
++		return 0;
++	}
++
++	mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
++	if (mobj) {
++		struct mm_struct *newmm = mobj->o_obj;
++		/* Good, the MM is already created. */
++		if (newmm == tsk->mm) {
++			/* Already done by clone(). */
++			return 0;
++		}
++		mmput(tsk->mm);
++		atomic_inc(&newmm->mm_users);
++		tsk->mm = newmm;
++		tsk->active_mm = newmm;
++	}
++	return 0;
++}
++
++/* We use CLONE_VM when the child's mm is going to be shared with the parent.
++ * Otherwise the mm is copied.
++ */
++
++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	if (ti->cpt_mm == CPT_NULL ||
++	    lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx))
++		return CLONE_VM;
++	return 0;
++}
+diff --git a/kernel/cpt/rst_net.c b/kernel/cpt/rst_net.c
+new file mode 100644
+index 0000000..b246ddb
+--- /dev/null
++++ b/kernel/cpt/rst_net.c
+@@ -0,0 +1,746 @@
++/*
++ *
++ *  kernel/cpt/rst_net.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/nsproxy.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <net/route.h>
++#include <net/ip_fib.h>
++#include <net/addrconf.h>
++#include <linux/if_tun.h>
++#include <linux/veth.h>
++#include <linux/nfcalls.h>
++#include <linux/venet.h>
++#include <linux/fdtable.h>
++#include <net/net_namespace.h>
++#include <net/netns/generic.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++#include "cpt_net.h"
++#include "cpt_files.h"
++
++#include "cpt_syscalls.h"
++
++extern struct in_ifaddr *inet_alloc_ifa(void);
++extern int inet_insert_ifa(struct in_ifaddr *ifa);
++extern struct in_device *inetdev_init(struct net_device *dev);
++
++int rst_restore_ifaddr(struct cpt_context *ctx)
++{
++	struct net *net = get_exec_env()->ve_netns;
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_ifaddr_image di;
++	struct net_device *dev;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
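++	/* Replay every saved address: IPv4 entries are rebuilt by hand
++	 * and inserted with inet_insert_ifa(), IPv6 entries go through
++	 * inet6_addr_add(); -EEXIST is tolerated for both. */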
++	while (sec < endsec) {
++		int cindex = -1;
++		int err;
++		err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx);
++		if (err)
++			return err;
++		cindex = di.cpt_index;
++		rtnl_lock();
++		dev = __dev_get_by_index(net, cindex);
++		if (dev && di.cpt_family == AF_INET) {
++			struct in_device *in_dev;
++			struct in_ifaddr *ifa;
++			if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
++				in_dev = inetdev_init(dev);
++			ifa = inet_alloc_ifa();
++			if (ifa) {
++				ifa->ifa_local = di.cpt_address[0];
++				ifa->ifa_address = di.cpt_peer[0];
++				ifa->ifa_broadcast = di.cpt_broadcast[0];
++				ifa->ifa_prefixlen = di.cpt_masklen;
++				ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
++				ifa->ifa_flags = di.cpt_flags;
++				ifa->ifa_scope = di.cpt_scope;
++				memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ);
++				in_dev_hold(in_dev);
++				ifa->ifa_dev   = in_dev;
++				err = inet_insert_ifa(ifa);
++				if (err && err != -EEXIST) {
++					rtnl_unlock();
++					eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
++					return err;
++				}
++			}
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++		} else if (dev && di.cpt_family == AF_INET6) {
++			__u32 prefered_lft;
++			__u32 valid_lft;
++			struct net *net = get_exec_env()->ve_ns->net_ns;
++			prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ?
++				0 : di.cpt_prefered_lft;
++			valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ?
++				0xFFFFFFFF : di.cpt_valid_lft;
++			err = inet6_addr_add(net, dev->ifindex,
++					     (struct in6_addr *)di.cpt_address,
++					     di.cpt_masklen, 0,
++					     prefered_lft,
++					     valid_lft);
++			if (err && err != -EEXIST) {
++				rtnl_unlock();
++				eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
++				return err;
++			}
++#endif
++		} else {
++			rtnl_unlock();
++			eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index);
++			return -EINVAL;
++		}
++		rtnl_unlock();
++		sec += di.cpt_next;
++	}
++	return 0;
++}
++
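++/* Classify a saved route message: 0 means replay it as-is, 1 means it is
++ * a kernel route (-EEXIST on replay is then expected), and 2 means the
++ * kernel recreates it by itself, so the message must be skipped. */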
++static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx)
++{
++	int min_len = NLMSG_LENGTH(sizeof(struct rtmsg));
++	struct rtmsg *rtm = NLMSG_DATA(nlh);
++	__u32 prefix0 = 0;
++
++	if (nlh->nlmsg_len > min_len) {
++		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
++		struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len);
++
++		while (RTA_OK(rta, attrlen)) {
++			if (rta->rta_type == RTA_DST) {
++				prefix0 = *(__u32*)RTA_DATA(rta);
++			}
++			rta = RTA_NEXT(rta, attrlen);
++		}
++	}
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	if (rtm->rtm_family == AF_INET6) {
++		if (rtm->rtm_type == RTN_LOCAL)
++			return 2;
++		if (rtm->rtm_flags & RTM_F_CLONED)
++			return 2;
++		if (rtm->rtm_protocol == RTPROT_UNSPEC ||
++		    rtm->rtm_protocol == RTPROT_RA ||
++		    rtm->rtm_protocol == RTPROT_REDIRECT ||
++		    rtm->rtm_protocol == RTPROT_KERNEL)
++			return 2;
++		if (rtm->rtm_protocol == RTPROT_BOOT &&
++		    ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) ||
++		     (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000))))
++			return 2;
++	}
++#endif
++	return rtm->rtm_protocol == RTPROT_KERNEL;
++}
++
++int rst_restore_route(struct cpt_context *ctx)
++{
++	int err;
++	struct socket *sock;
++	struct msghdr msg;
++	struct iovec iov;
++	struct sockaddr_nl nladdr;
++	mm_segment_t oldfs;
++	loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_object_hdr v;
++	char *pg;
++
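++	/* Routes were dumped as raw rtnetlink messages; replay them one
++	 * by one into an in-kernel NETLINK_ROUTE socket and read back
++	 * the acknowledgement to catch errors. */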
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	if (h.cpt_hdrlen >= h.cpt_next)
++		return 0;
++
++	sec += h.cpt_hdrlen;
++	err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx);
++	if (err < 0)
++		return err;
++
++	err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
++	if (err)
++		return err;
++
++	pg = (char*)__get_free_page(GFP_KERNEL);
++	if (pg == NULL) {
++		err = -ENOMEM;
++		goto out_sock;
++	}
++
++	memset(&nladdr, 0, sizeof(nladdr));
++	nladdr.nl_family = AF_NETLINK;
++
++	endsec = sec + v.cpt_next;
++	sec += v.cpt_hdrlen;
++
++	while (sec < endsec) {
++		struct nlmsghdr *n;
++		struct nlmsghdr nh;
++		int kernel_flag;
++
++		if (endsec - sec < sizeof(nh))
++			break;
++
++		err = ctx->pread(&nh, sizeof(nh), ctx, sec);
++		if (err)
++			goto out_sock_pg;
++		if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE ||
++		    endsec - sec < nh.nlmsg_len) {
++			err = -EINVAL;
++			goto out_sock_pg;
++		}
++		err = ctx->pread(pg, nh.nlmsg_len, ctx, sec);
++		if (err)
++			goto out_sock_pg;
++
++		n = (struct nlmsghdr*)pg;
++		n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE;
++
++		err = rewrite_rtmsg(n, ctx);
++		if (err < 0)
++			goto out_sock_pg;
++		kernel_flag = err;
++
++		if (kernel_flag == 2)
++			goto do_next;
++
++		iov.iov_base = n;
++		iov.iov_len = nh.nlmsg_len;
++		msg.msg_name = &nladdr;
++		msg.msg_namelen = sizeof(nladdr);
++		msg.msg_iov = &iov;
++		msg.msg_iovlen = 1;
++		msg.msg_control = NULL;
++		msg.msg_controllen = 0;
++		msg.msg_flags = MSG_DONTWAIT;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_sendmsg(sock, &msg, nh.nlmsg_len);
++		set_fs(oldfs);
++
++		if (err < 0)
++			goto out_sock_pg;
++		err = 0;
++
++		iov.iov_base = pg;
++		iov.iov_len = PAGE_SIZE;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
++		set_fs(oldfs);
++		if (err != -EAGAIN) {
++			if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) &&
++			    n->nlmsg_type == NLMSG_ERROR) {
++				struct nlmsgerr *e = NLMSG_DATA(n);
++				if (e->error != -EEXIST || !kernel_flag)
++					eprintk_ctx("NLMERR: %d\n", e->error);
++			} else {
++				eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type);
++			}
++		}
++do_next:
++		err = 0;
++		sec += NLMSG_ALIGN(nh.nlmsg_len);
++	}
++
++out_sock_pg:
++	free_page((unsigned long)pg);
++out_sock:
++	sock_release(sock);
++	return err;
++}
++
++int rst_resume_network(struct cpt_context *ctx)
++{
++	struct ve_struct *env;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++	env->disable_net = 0;
++	put_ve(env);
++	return 0;
++}
++
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++extern unsigned int tun_net_id;
++#endif
++
++/* We do not restore the skb queue; we just reinit it */
++static int rst_restore_tuntap(loff_t start, struct cpt_netdev_image *di,
++			struct cpt_context *ctx)
++{
++	int err = -ENODEV;
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++	struct cpt_tuntap_image ti;
++	struct net_device *dev;
++	struct file *bind_file = NULL;
++	struct net *net;
++	struct tun_struct *tun;
++	struct tun_net *tn;
++	loff_t pos;
++
++	pos = start + di->cpt_hdrlen;
++	err = rst_get_object(CPT_OBJ_NET_TUNTAP, pos, &ti, ctx);
++	if (err)
++		return err;
++
++	pos += ti.cpt_next;
++	if (ti.cpt_bindfile) {
++		bind_file = rst_file(ti.cpt_bindfile, -1, ctx);
++		if (IS_ERR(bind_file)) {
++			eprintk_ctx("rst_restore_tuntap: "
++				"rst_file: %Ld\n",
++				(unsigned long long)ti.cpt_bindfile);
++			return PTR_ERR(bind_file);
++		}
++	}
++
++	rtnl_lock();
++	err = -ENOMEM;
++	dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup);
++	if (!dev)
++		goto out;
++
++	tun = netdev_priv(dev);
++
++	tun->dev = dev;
++	tun->owner = ti.cpt_owner;
++	tun->flags = ti.cpt_flags;
++	tun->attached = ti.cpt_attached;
++	tun->if_flags = ti.cpt_if_flags;
++	tun_net_init(dev);
++	BUG_ON(sizeof(ti.cpt_dev_addr) != sizeof(tun->dev_addr));
++	memcpy(tun->dev_addr, ti.cpt_dev_addr, sizeof(ti.cpt_dev_addr));
++	BUG_ON(sizeof(ti.cpt_chr_filter) != sizeof(tun->chr_filter));
++	memcpy(tun->chr_filter, ti.cpt_chr_filter, sizeof(ti.cpt_chr_filter));
++	BUG_ON(sizeof(ti.cpt_net_filter) != sizeof(tun->net_filter));
++	memcpy(tun->net_filter, ti.cpt_net_filter, sizeof(ti.cpt_net_filter));
++
++	err = register_netdevice(dev);
++	if (err < 0) {
++		free_netdev(dev);
++		eprintk_ctx("failed to register tun/tap net device\n");
++		goto out;
++	}
++	if (pos < start + di->cpt_next) {
++		struct cpt_hwaddr_image hw;
++		/* Restore hardware address */
++		err = rst_get_object(CPT_OBJ_NET_HWADDR, pos,
++				&hw, ctx);
++		if (err)
++			goto out;
++		BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr));
++		memcpy(dev->dev_addr, hw.cpt_dev_addr,
++				sizeof(hw.cpt_dev_addr));
++	}
++	net = get_exec_env()->ve_ns->net_ns;
++	tn = net_generic(net, tun_net_id);
++	list_add(&tun->list, &tn->dev_list);
++
++	if (bind_file) {
++		bind_file->private_data = tun;
++		tun->bind_file = bind_file;
++	}
++
++out:
++	if (bind_file)
++		fput(bind_file);
++	rtnl_unlock();
++#endif
++	return err;
++}
++
++static int rst_restore_veth(loff_t pos, struct net_device *dev,
++			struct cpt_context *ctx)
++{
++	int err = -ENODEV;
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++	struct cpt_veth_image vi;
++	struct veth_struct *veth;
++
++	if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open)) {
++		eprintk_ctx("Module vzethdev is not loaded, "
++			    "or device %s is not a veth device\n", dev->name);
++		return -EINVAL;
++	}
++	err = rst_get_object(CPT_OBJ_NET_VETH, pos, &vi, ctx);
++	if (err)
++		return err;
++	veth = veth_from_netdev(dev);
++	veth->allow_mac_change = vi.cpt_allow_mac_change;
++#endif
++	return err;
++}
++
++static int rst_restore_netstats(loff_t pos, struct net_device *dev,
++			struct cpt_context * ctx)
++{
++	struct cpt_netstats_image *n;
++	struct net_device_stats *stats = NULL;
++	struct net_device *lo = get_exec_env()->ve_netns->loopback_dev;
++	int err;
++
++	if (!dev->get_stats)
++		return 0;
++
++	n = cpt_get_buf(ctx);
++	err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx);
++	if (err)
++		goto out_buf;
++	BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen);
++	preempt_disable();
++	if (dev == lo)
++		stats = &lo->stats;
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++	else if (KSYMREF(veth_open) && dev->open == KSYMREF(veth_open))
++		stats = veth_stats(dev, smp_processor_id());
++#endif
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++	else if (dev == get_exec_env()->_venet_dev)
++		stats = venet_stats(dev, smp_processor_id());
++#endif
++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
++	if (dev->open == tun_net_open)
++		stats = &dev->stats;
++#endif
++	if (!stats) {
++		err = -ENODEV;
++		eprintk_ctx("Network device %s is not supported\n", dev->name);
++		goto out;
++	}
++
++	stats->rx_packets = n->cpt_rx_packets;
++	stats->tx_packets = n->cpt_tx_packets;
++	stats->rx_bytes = n->cpt_rx_bytes;
++	stats->tx_bytes = n->cpt_tx_bytes;
++	stats->rx_errors = n->cpt_rx_errors;
++	stats->tx_errors = n->cpt_tx_errors;
++	stats->rx_dropped = n->cpt_rx_dropped;
++	stats->tx_dropped = n->cpt_tx_dropped;
++	stats->multicast = n->cpt_multicast;
++	stats->collisions = n->cpt_collisions;
++	stats->rx_length_errors = n->cpt_rx_length_errors;
++	stats->rx_over_errors = n->cpt_rx_over_errors;
++	stats->rx_crc_errors = n->cpt_rx_crc_errors;
++	stats->rx_frame_errors = n->cpt_rx_frame_errors;
++	stats->rx_fifo_errors = n->cpt_rx_fifo_errors;
++	stats->rx_missed_errors = n->cpt_rx_missed_errors;
++	stats->tx_aborted_errors = n->cpt_tx_aborted_errors;
++	stats->tx_carrier_errors = n->cpt_tx_carrier_errors;
++	stats->tx_fifo_errors = n->cpt_tx_fifo_errors;
++	stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors;
++	stats->tx_window_errors = n->cpt_tx_window_errors;
++	stats->rx_compressed = n->cpt_rx_compressed;
++	stats->tx_compressed = n->cpt_tx_compressed;
++
++out:
++	preempt_enable();
++out_buf:
++	cpt_release_buf(ctx);
++	return err;
++}
++
++int rst_restore_netdev(struct cpt_context *ctx)
++{
++	struct net *net = get_exec_env()->ve_netns;
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_netdev_image di;
++	struct net_device *dev;
++
++	get_exec_env()->disable_net = 1;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		loff_t pos;
++		struct net_device *dev_new;
++		err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx);
++		if (err)
++			return err;
++
++		pos = sec + di.cpt_hdrlen;
++		if (di.cpt_next > sizeof(di)) {
++			struct cpt_object_hdr hdr;
++			err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
++					ctx, sec + di.cpt_hdrlen);
++			if (err)
++				return err;
++			if (hdr.cpt_object == CPT_OBJ_NET_TUNTAP) {
++				err = rst_restore_tuntap(sec, &di, ctx);
++				if (err) {
++					eprintk_ctx("restore tuntap %s: %d\n",
++							di.cpt_name, err);
++					return err;
++				}
++				pos += hdr.cpt_next;
++			}
++		}
++
++		rtnl_lock();
++		dev = __dev_get_by_name(net, di.cpt_name);
++		if (dev) {
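++			/* The device must keep its saved ifindex. If that
++			 * index is currently held by another device, swap
++			 * the two indices; otherwise simply re-hash this
++			 * device under the saved index. */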
++			if (dev->ifindex != di.cpt_index) {
++				dev_new = __dev_get_by_index(net, di.cpt_index);
++				if (!dev_new) {
++					write_lock_bh(&dev_base_lock);
++					hlist_del(&dev->index_hlist);
++					if (dev->iflink == dev->ifindex)
++						dev->iflink = di.cpt_index;
++					dev->ifindex = di.cpt_index;
++					hlist_add_head(&dev->index_hlist,
++							dev_index_hash(net, dev->ifindex));
++					write_unlock_bh(&dev_base_lock);
++				} else {
++					write_lock_bh(&dev_base_lock);
++					hlist_del(&dev->index_hlist);
++					hlist_del(&dev_new->index_hlist);
++					if (dev_new->iflink == dev_new->ifindex)
++						dev_new->iflink = dev->ifindex;
++					dev_new->ifindex = dev->ifindex;
++					if (dev->iflink == dev->ifindex)
++						dev->iflink = di.cpt_index;
++					dev->ifindex = di.cpt_index;
++					hlist_add_head(&dev->index_hlist,
++							dev_index_hash(net, dev->ifindex));
++					hlist_add_head(&dev_new->index_hlist,
++							dev_index_hash(net, dev_new->ifindex));
++					write_unlock_bh(&dev_base_lock);
++				}
++			}
++			if (di.cpt_flags^dev->flags) {
++				err = dev_change_flags(dev, di.cpt_flags);
++				if (err)
++					eprintk_ctx("dev_change_flags err: %d\n", err);
++			}
++			while (pos < sec + di.cpt_next) {
++				struct cpt_object_hdr hdr;
++				err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
++						ctx, pos);
++				if (err)
++					goto out;
++				if (hdr.cpt_object == CPT_OBJ_NET_VETH) {
++					err = rst_restore_veth(pos, dev, ctx);
++					if (err) {
++						eprintk_ctx("restore veth %s: %d\n",
++								di.cpt_name, err);
++						goto out;
++					}
++				} else if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) {
++					/* Restore hardware address */
++					struct cpt_hwaddr_image hw;
++					err = rst_get_object(CPT_OBJ_NET_HWADDR,
++							pos, &hw, ctx);
++					if (err)
++						goto out;
++				BUG_ON(sizeof(hw.cpt_dev_addr) !=
++					BUG_ON(sizeof(hw.cpt_dev_addr) !=
++							sizeof(dev->dev_addr));
++							sizeof(hw.cpt_dev_addr));
++				} else if (hdr.cpt_object == CPT_OBJ_NET_STATS) {
++					err = rst_restore_netstats(pos, dev, ctx);
++					if (err) {
++						eprintk_ctx("rst stats %s: %d\n",
++								di.cpt_name, err);
++						goto out;
++					}
++				}
++				pos += hdr.cpt_next;
++			}
++		} else {
++			eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
++		}
++		rtnl_unlock();
++		sec += di.cpt_next;
++	}
++	return 0;
++out:
++	rtnl_unlock();
++	return err;
++}
++
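++/* iptables state is restored by spawning an iptables-restore process
++ * from the kernel and feeding it the ruleset saved in the image over
++ * a pipe. */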
++static int dumpfn(void *arg)
++{
++	int i;
++	int *pfd = arg;
++	char *argv[] = { "iptables-restore", "-c", NULL };
++
++	if (pfd[0] != 0)
++		sc_dup2(pfd[0], 0);
++
++	for (i=1; i<current->files->fdt->max_fds; i++)
++		sc_close(i);
++
++	module_put(THIS_MODULE);
++
++	set_fs(KERNEL_DS);
++	i = sc_execve("/sbin/iptables-restore", argv, NULL);
++	if (i == -ENOENT)
++		i = sc_execve("/usr/sbin/iptables-restore", argv, NULL);
++	eprintk("failed to exec iptables-restore: %d\n", i);
++	return 255 << 8;
++}
++
++static int rst_restore_iptables(struct cpt_context * ctx)
++{
++	int err;
++	int pfd[2];
++	struct file *f;
++	struct cpt_object_hdr v;
++	int n;
++	struct cpt_section_hdr h;
++	loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES];
++	loff_t end;
++	int pid;
++	int status;
++	mm_segment_t oldfs;
++	sigset_t ignore, blocked;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	if (h.cpt_hdrlen == h.cpt_next)
++		return 0;
++	if (h.cpt_hdrlen > h.cpt_next)
++		return -EINVAL;
++	sec += h.cpt_hdrlen;
++	err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx);
++	if (err < 0)
++		return err;
++
++	err = sc_pipe(pfd);
++	if (err < 0)
++		return err;
++	ignore.sig[0] = CPT_SIG_IGNORE_MASK;
++	sigprocmask(SIG_BLOCK, &ignore, &blocked);
++	pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
++	if (err < 0) {
++		eprintk_ctx("iptables local_kernel_thread: %d\n", err);
++		goto out;
++	}
++	f = fget(pfd[1]);
++	sc_close(pfd[1]);
++	sc_close(pfd[0]);
++
++	ctx->file->f_pos = sec + v.cpt_hdrlen;
++	end = sec + v.cpt_next;
++	do {
++		char *p;
++		char buf[16];
++
++		n = end - ctx->file->f_pos;
++		if (n > sizeof(buf))
++			n = sizeof(buf);
++
++		if (ctx->read(buf, n, ctx))
++			break;
++		if ((p = memchr(buf, 0, n)) != NULL)
++			n = p - buf;
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		f->f_op->write(f, buf, n, &f->f_pos);
++		set_fs(oldfs);
++	} while (ctx->file->f_pos < end);
++
++	fput(f);
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	if ((err = sc_waitx(pid, 0, &status)) < 0)
++		eprintk_ctx("wait4: %d\n", err);
++	else if ((status & 0x7f) == 0) {
++		err = (status & 0xff00) >> 8;
++		if (err != 0) {
++			eprintk_ctx("iptables-restore exited with %d\n", err);
++			err = -EINVAL;
++		}
++	} else {
++		eprintk_ctx("iptables-restore terminated\n");
++		err = -EINVAL;
++	}
++	set_fs(oldfs);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++
++	return err;
++
++out:
++	if (pfd[1] >= 0)
++		sc_close(pfd[1]);
++	if (pfd[0] >= 0)
++		sc_close(pfd[0]);
++	sigprocmask(SIG_SETMASK, &blocked, NULL);
++	return err;
++}
++
++int rst_restore_net(struct cpt_context *ctx)
++{
++	int err;
++
++	err = rst_restore_netdev(ctx);
++	if (!err)
++		err = rst_restore_ifaddr(ctx);
++	if (!err)
++		err = rst_restore_route(ctx);
++	if (!err)
++		err = rst_restore_iptables(ctx);
++	if (!err)
++		err = rst_restore_ip_conntrack(ctx);
++	return err;
++}
+diff --git a/kernel/cpt/rst_proc.c b/kernel/cpt/rst_proc.c
+new file mode 100644
+index 0000000..189649f
+--- /dev/null
++++ b/kernel/cpt/rst_proc.c
+@@ -0,0 +1,580 @@
++/*
++ *
++ *  kernel/cpt/rst_proc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_ioctl.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++
++MODULE_AUTHOR("Alexey Kuznetsov <alexey at sw.ru>");
++MODULE_LICENSE("GPL");
++
++/* List of contexts and lock protecting the list */
++static struct list_head cpt_context_list;
++static spinlock_t cpt_context_lock;
++
++static int proc_read(char *buffer, char **start, off_t offset,
++		     int length, int *eof, void *data)
++{
++	off_t pos = 0;
++	off_t begin = 0;
++	int len = 0;
++	cpt_context_t *ctx;
++
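++	/* Classic read_proc bookkeeping: format everything into the page
++	 * buffer, then clip the result to the [offset, offset+length)
++	 * window the caller asked for. */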
++	len += sprintf(buffer, "Ctx      Id       VE       State\n");
++
++	spin_lock(&cpt_context_lock);
++
++	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++		len += sprintf(buffer+len,"%p %08x %-8u %d",
++			       ctx,
++			       ctx->contextid,
++			       ctx->ve_id,
++			       ctx->ctx_state
++			       );
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++		len += pagein_info_printf(buffer+len, ctx);
++#endif
++
++		buffer[len++] = '\n';
++
++		pos = begin+len;
++		if (pos < offset) {
++			len = 0;
++			begin = pos;
++		}
++		if (pos > offset+length)
++			goto done;
++	}
++	*eof = 1;
++
++done:
++	spin_unlock(&cpt_context_lock);
++	*start = buffer + (offset - begin);
++	len -= (offset - begin);
++	if (len > length)
++		len = length;
++	if (len < 0)
++		len = 0;
++	return len;
++}
++
++void rst_context_release(cpt_context_t *ctx)
++{
++	list_del(&ctx->ctx_list);
++	spin_unlock(&cpt_context_lock);
++
++	if (ctx->ctx_state > 0)
++		rst_resume(ctx);
++	ctx->ctx_state = CPT_CTX_ERROR;
++
++	rst_close_dumpfile(ctx);
++
++	if (ctx->anonvmas) {
++		int h;
++		for (h = 0; h < CPT_ANONVMA_HSIZE; h++) {
++			while (!hlist_empty(&ctx->anonvmas[h])) {
++				struct hlist_node *elem = ctx->anonvmas[h].first;
++				hlist_del(elem);
++				kfree(elem);
++			}
++		}
++		free_page((unsigned long)ctx->anonvmas);
++	}
++	cpt_flush_error(ctx);
++	if (ctx->errorfile) {
++		fput(ctx->errorfile);
++		ctx->errorfile = NULL;
++	}
++	if (ctx->error_msg) {
++		free_page((unsigned long)ctx->error_msg);
++		ctx->error_msg = NULL;
++	}
++#ifdef CONFIG_VZ_CHECKPOINT_ITER
++	rst_drop_iter_dir(ctx);
++#endif
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	if (ctx->pagein_file_out)
++		fput(ctx->pagein_file_out);
++	if (ctx->pagein_file_in)
++		fput(ctx->pagein_file_in);
++	if (ctx->pgin_task)
++		put_task_struct(ctx->pgin_task);
++#endif
++	if (ctx->filejob_queue)
++		rst_flush_filejobs(ctx);
++	if (ctx->vdso)
++		free_page((unsigned long)ctx->vdso);
++	if (ctx->objcount)
++		eprintk_ctx("%d objects leaked\n", ctx->objcount);
++	kfree(ctx);
++
++	spin_lock(&cpt_context_lock);
++}
++
++static void __cpt_context_put(cpt_context_t *ctx)
++{
++	if (!--ctx->refcount)
++		rst_context_release(ctx);
++}
++
++static void cpt_context_put(cpt_context_t *ctx)
++{
++	spin_lock(&cpt_context_lock);
++	__cpt_context_put(ctx);
++	spin_unlock(&cpt_context_lock);
++}
++
++cpt_context_t * rst_context_open(void)
++{
++	cpt_context_t *ctx;
++
++	if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
++		rst_context_init(ctx);
++		spin_lock(&cpt_context_lock);
++		list_add_tail(&ctx->ctx_list, &cpt_context_list);
++		spin_unlock(&cpt_context_lock);
++		ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
++		if (ctx->error_msg != NULL)
++			ctx->error_msg[0] = 0;
++	}
++	return ctx;
++}
++
++void rst_report_error(int err, cpt_context_t *ctx)
++{
++	if (ctx->statusfile) {
++		mm_segment_t oldfs;
++		int status = 7 /* VZ_ENVCREATE_ERROR */;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		if (ctx->statusfile->f_op && ctx->statusfile->f_op->write)
++			ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos);
++		set_fs(oldfs);
++		fput(ctx->statusfile);
++		ctx->statusfile = NULL;
++	}
++}
++
++
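++/*
++ * Look up a restore context by its user-visible id, taking a
++ * reference that the caller drops with cpt_context_put().  A context
++ * made "sticky" by CPT_GET_CONTEXT holds one extra self-reference, so
++ * it survives the close of the file descriptor that created it until
++ * CPT_PUT_CONTEXT releases it.
++ */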
++static cpt_context_t * cpt_context_lookup(unsigned int ctxid)
++{
++	cpt_context_t *ctx;
++
++	spin_lock(&cpt_context_lock);
++	list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++		if (ctx->contextid == ctxid) {
++			ctx->refcount++;
++			spin_unlock(&cpt_context_lock);
++			return ctx;
++		}
++	}
++	spin_unlock(&cpt_context_lock);
++	return NULL;
++}
++
++static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
++{
++	int err = 0;
++	cpt_context_t *ctx;
++	struct file *dfile = NULL;
++
++	unlock_kernel();
++
++	if (cmd == CPT_TEST_CAPS) {
++		err = test_cpu_caps();
++		goto out_lock;
++	}
++
++	if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
++		cpt_context_t *old_ctx;
++
++		ctx = NULL;
++		if (cmd == CPT_JOIN_CONTEXT) {
++			err = -ENOENT;
++			ctx = cpt_context_lookup(arg);
++			if (!ctx)
++				goto out_lock;
++		}
++
++		spin_lock(&cpt_context_lock);
++		old_ctx = (cpt_context_t*)file->private_data;
++		file->private_data = ctx;
++
++		if (old_ctx) {
++			if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
++				old_ctx->sticky = 0;
++				old_ctx->refcount--;
++			}
++			__cpt_context_put(old_ctx);
++		}
++		spin_unlock(&cpt_context_lock);
++		err = 0;
++		goto out_lock;
++	}
++
++	spin_lock(&cpt_context_lock);
++	ctx = (cpt_context_t*)file->private_data;
++	if (ctx)
++		ctx->refcount++;
++	spin_unlock(&cpt_context_lock);
++
++	if (!ctx) {
++		cpt_context_t *old_ctx;
++
++		err = -ENOMEM;
++		ctx = rst_context_open();
++		if (!ctx)
++			goto out_lock;
++
++		spin_lock(&cpt_context_lock);
++		old_ctx = (cpt_context_t*)file->private_data;
++		if (!old_ctx) {
++			ctx->refcount++;
++			file->private_data = ctx;
++		} else {
++			old_ctx->refcount++;
++		}
++		if (old_ctx) {
++			__cpt_context_put(ctx);
++			ctx = old_ctx;
++		}
++		spin_unlock(&cpt_context_lock);
++	}
++
++	if (cmd == CPT_GET_CONTEXT) {
++		unsigned int contextid = (unsigned int)arg;
++
++		err = -EINVAL;
++		if (ctx->contextid && ctx->contextid != contextid)
++			goto out_nosem;
++		if (!ctx->contextid) {
++			cpt_context_t *c1 = cpt_context_lookup(contextid);
++			if (c1) {
++				cpt_context_put(c1);
++				err = -EEXIST;
++				goto out_nosem;
++			}
++			ctx->contextid = contextid;
++		}
++		spin_lock(&cpt_context_lock);
++		if (!ctx->sticky) {
++			ctx->sticky = 1;
++			ctx->refcount++;
++		}
++		spin_unlock(&cpt_context_lock);
++		err = 0;
++		goto out_nosem;
++	}
++
++	down(&ctx->main_sem);
++
++	err = -EBUSY;
++	if (ctx->ctx_state < 0)
++		goto out;
++
++	err = 0;
++	switch (cmd) {
++	case CPT_SET_DUMPFD:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			err = -EBADF;
++			dfile = fget(arg);
++			if (dfile == NULL)
++				break;
++			if (dfile->f_op == NULL ||
++			    dfile->f_op->read == NULL) {
++				fput(dfile);
++				break;
++			}
++			err = 0;
++		}
++		if (ctx->file)
++			fput(ctx->file);
++		ctx->file = dfile;
++		break;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	case CPT_SET_PAGEINFDIN:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->pagein_file_in)
++			fput(ctx->pagein_file_in);
++		ctx->pagein_file_in = dfile;
++		break;
++	case CPT_SET_PAGEINFDOUT:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->pagein_file_out)
++			fput(ctx->pagein_file_out);
++		ctx->pagein_file_out = dfile;
++		break;
++	case CPT_PAGEIND:
++		err = rst_pageind(ctx);
++		break;
++#endif
++#ifdef CONFIG_VZ_CHECKPOINT_ITER
++	case CPT_ITER:
++		err = rst_iteration(ctx);
++		break;
++#endif
++	case CPT_SET_LOCKFD:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->lockfile)
++			fput(ctx->lockfile);
++		ctx->lockfile = dfile;
++		break;
++	case CPT_SET_STATUSFD:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->statusfile)
++			fput(ctx->statusfile);
++		ctx->statusfile = dfile;
++		break;
++	case CPT_SET_ERRORFD:
++		if (arg >= 0) {
++			dfile = fget(arg);
++			if (dfile == NULL) {
++				err = -EBADF;
++				break;
++			}
++		}
++		if (ctx->errorfile)
++			fput(ctx->errorfile);
++		ctx->errorfile = dfile;
++		break;
++	case CPT_SET_VEID:
++		if (ctx->ctx_state > 0) {
++			err = -EBUSY;
++			break;
++		}
++		ctx->ve_id = arg;
++		break;
++	case CPT_UNDUMP:
++		if (ctx->ctx_state > 0) {
++			err = -ENOENT;
++			break;
++		}
++		ctx->ctx_state = CPT_CTX_UNDUMPING;
++		err = vps_rst_undump(ctx);
++		if (err) {
++			rst_report_error(err, ctx);
++			if (rst_kill(ctx) == 0)
++				ctx->ctx_state = CPT_CTX_IDLE;
++		} else {
++			ctx->ctx_state = CPT_CTX_UNDUMPED;
++		}
++		break;
++	case CPT_RESUME:
++		if (!ctx->ctx_state) {
++			err = -ENOENT;
++			break;
++		}
++		err = rst_resume(ctx);
++		if (!err)
++			ctx->ctx_state = CPT_CTX_IDLE;
++		break;
++	case CPT_KILL:
++		if (!ctx->ctx_state) {
++			err = -ENOENT;
++			break;
++		}
++		err = rst_kill(ctx);
++		if (!err)
++			ctx->ctx_state = CPT_CTX_IDLE;
++		break;
++	default:
++		err = -EINVAL;
++		break;
++	}
++
++out:
++	cpt_flush_error(ctx);
++	up(&ctx->main_sem);
++out_nosem:
++	cpt_context_put(ctx);
++out_lock:
++	lock_kernel();
++	if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
++	    err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
++		err = -EINTR;
++	return err;
++}
++
++static int rst_open(struct inode * inode, struct file * file)
++{
++	if (!try_module_get(THIS_MODULE))
++		return -EBUSY;
++
++	return 0;
++}
++
++static int rst_release(struct inode * inode, struct file * file)
++{
++	cpt_context_t *ctx;
++
++	spin_lock(&cpt_context_lock);
++	ctx = (cpt_context_t*)file->private_data;
++	file->private_data = NULL;
++	if (ctx)
++		__cpt_context_put(ctx);
++	spin_unlock(&cpt_context_lock);
++
++
++	module_put(THIS_MODULE);
++	return 0;
++}
++
++static struct file_operations rst_fops =
++{
++	.owner		= THIS_MODULE,
++	.ioctl		= rst_ioctl,
++	.open		= rst_open,
++	.release	= rst_release,
++};
++
++
++static struct proc_dir_entry *proc_ent;
++extern void *schedule_tail_p;
++extern void schedule_tail_hook(void);
++
++static struct ctl_table_header *ctl_header;
++
++static ctl_table debug_table[] = {
++	{
++		.procname	= "rst",
++		.data		= &debug_level,
++		.maxlen		= sizeof(debug_level),
++		.mode		= 0644,
++		.proc_handler	= &proc_dointvec,
++	},
++	{ .ctl_name = 0 }
++};
++static ctl_table root_table[] = {
++	{
++		.ctl_name	= CTL_DEBUG,
++		.procname	= "debug",
++		.mode		= 0555,
++		.child		= debug_table,
++	},
++	{ .ctl_name = 0 }
++};
++
++static int __init init_rst(void)
++{
++	int err;
++
++	err = -ENOMEM;
++	ctl_header = register_sysctl_table(root_table);
++	if (!ctl_header)
++		goto err_mon;
++
++	spin_lock_init(&cpt_context_lock);
++	INIT_LIST_HEAD(&cpt_context_list);
++
++	err = -EINVAL;
++	proc_ent = proc_create("rst", 0600, NULL, NULL);
++	if (!proc_ent)
++		goto err_out;
++
++	rst_fops.read = proc_ent->proc_fops->read;
++	rst_fops.write = proc_ent->proc_fops->write;
++	rst_fops.llseek = proc_ent->proc_fops->llseek;
++	proc_ent->proc_fops = &rst_fops;
++
++	proc_ent->read_proc = proc_read;
++	proc_ent->data = NULL;
++	proc_ent->owner = THIS_MODULE;
++	return 0;
++
++err_out:
++	unregister_sysctl_table(ctl_header);
++err_mon:
++	return err;
++}
++module_init(init_rst);
++
++static void __exit exit_rst(void)
++{
++	remove_proc_entry("rst", NULL);
++	unregister_sysctl_table(ctl_header);
++
++	spin_lock(&cpt_context_lock);
++	while (!list_empty(&cpt_context_list)) {
++		cpt_context_t *ctx;
++		ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
++
++		if (!ctx->sticky)
++			ctx->refcount++;
++		ctx->sticky = 0;
++
++		BUG_ON(ctx->refcount != 1);
++
++		__cpt_context_put(ctx);
++	}
++	spin_unlock(&cpt_context_lock);
++}
++module_exit(exit_rst);
+diff --git a/kernel/cpt/rst_process.c b/kernel/cpt/rst_process.c
+new file mode 100644
+index 0000000..0f60a06
+--- /dev/null
++++ b/kernel/cpt/rst_process.c
+@@ -0,0 +1,1630 @@
++/*
++ *
++ *  kernel/cpt/rst_process.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/virtinfo.h>
++#include <linux/virtinfoscp.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/ptrace.h>
++#include <linux/tty.h>
++#include <linux/nsproxy.h>
++#include <linux/securebits.h>
++#ifdef CONFIG_X86
++#include <asm/desc.h>
++#endif
++#include <asm/unistd.h>
++
++#include <bc/beancounter.h>
++#include <bc/misc.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_ubc.h"
++#include "cpt_process.h"
++#include "cpt_kernel.h"
++
++
++#define HOOK_RESERVE	256
++
++struct resume_info
++{
++	asmlinkage void (*hook)(struct resume_info *);
++	unsigned long	hooks;
++#define HOOK_TID	0
++#define HOOK_CONT	1
++#define HOOK_LSI	2
++#define HOOK_RESTART	3
++	unsigned long	tid_ptrs[2];
++	siginfo_t	last_siginfo;
++};
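++
++/*
++ * The resume_info lives in the HOOK_RESERVE bytes carved out just
++ * below the restored pt_regs on the task's kernel stack (see
++ * restore_registers() below).  When the restored thread runs for the
++ * first time, the arch "ret from resume" stub (i386_ret_from_resume /
++ * ia64_ret_from_resume) is expected to call ri->hook, i.e.
++ * rst_resume_work(), which replays the state that cannot be set up
++ * from outside the task: TID pointers, stop-state completion,
++ * last_siginfo delivery and syscall restart.
++ */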
++
++#ifdef CONFIG_X86_32
++
++#define IN_SYSCALL(regs)	((long)(regs)->orig_ax >= 0)
++#define IN_ERROR(regs)		((long)(regs)->ax < 0)
++#define SYSCALL_ERRNO(regs)	(-(long)((regs)->ax))
++#define SYSCALL_RETVAL(regs)	((regs)->ax)
++#define SYSCALL_NR(regs)	((regs)->orig_ax)
++
++#define SYSCALL_SETRET(regs,val)	do { (regs)->ax = (val); } while (0)
++
++#define SYSCALL_RESTART2(regs,new)	do { (regs)->ax = (new); \
++					     (regs)->ip -= 2; } while (0) 
++
++#define syscall_is(tsk,regs,name)	(SYSCALL_NR(regs) == __NR_##name)
++
++/* In new kernels task_pt_regs() is defined to something inappropriate */
++#undef task_pt_regs
++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
++
++#elif defined(CONFIG_X86_64)
++
++#define IN_SYSCALL(regs)	((long)(regs)->orig_ax >= 0)
++#define IN_ERROR(regs)		((long)(regs)->ax < 0)
++#define SYSCALL_ERRNO(regs)	(-(long)((regs)->ax))
++#define SYSCALL_RETVAL(regs)	((regs)->ax)
++#define SYSCALL_NR(regs)	((regs)->orig_ax)
++
++#define SYSCALL_SETRET(regs,val)	do { (regs)->ax = (val); } while (0)
++
++#define SYSCALL_RESTART2(regs,new)	do { (regs)->ax = (new); \
++					     (regs)->ip -= 2; } while (0) 
++
++#define __NR32_restart_syscall	0
++#define __NR32_rt_sigtimedwait	177
++#define __NR32_pause		29
++#define __NR32_futex		240
++
++#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \
++				    SYSCALL_NR(regs) == __NR_##name) || \
++				   ((task_thread_info(tsk)->flags&_TIF_IA32) && \
++				    SYSCALL_NR(regs) == __NR32_##name))
++
++#elif defined (CONFIG_IA64)
++
++#define IN_SYSCALL(regs)	((long)(regs)->cr_ifs >= 0)
++#define IN_ERROR(regs)		((long)(regs)->r10 == -1)
++#define SYSCALL_ERRNO(regs)	((regs)->r10 == -1 ? (long)((regs)->r8) : 0)
++#define SYSCALL_RETVAL(regs)	((regs)->r8)
++#define SYSCALL_NR(regs)	((regs)->cr_ifs >= 0 ? (regs)->r15 : -1)
++
++#define SYSCALL_SETRET(regs,val)	do { (regs)->r8 = (val); } while (0)
++
++#define SYSCALL_RESTART2(regs,new)	do { (regs)->r15 = (new); \
++					     (regs)->r10 = 0; \
++					     ia64_decrement_ip(regs); } while (0) 
++
++#define syscall_is(tsk,regs,name)	(SYSCALL_NR(regs) == __NR_##name)
++
++#else
++
++#error This arch is not supported
++
++#endif
++
++#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs))
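++
++/*
++ * SYSCALL_RESTART rewinds the instruction pointer over the trapping
++ * instruction so the syscall is re-executed when the task resumes.
++ * On x86 "int $0x80", "syscall" and "sysenter" are all two bytes
++ * long, hence ip -= 2; on ia64 the slot number in psr.ri is stepped
++ * back instead (see ia64_decrement_ip() below).
++ */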
++
++pid_t vpid_to_pid(pid_t nr)
++{
++	pid_t vnr;
++	struct pid *pid;
++
++	rcu_read_lock();
++	pid = find_vpid(nr);
++	vnr = (pid == NULL ? -1 : pid->numbers[0].nr);
++	rcu_read_unlock();
++	return vnr;
++}
++
++static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si)
++{
++	memset(info, 0, sizeof(*info));
++	switch(si->cpt_code & __SI_MASK) {
++	case __SI_TIMER:
++		info->si_tid = si->cpt_pid;
++		info->si_overrun = si->cpt_uid;
++		info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval);
++		info->si_sys_private = si->cpt_utime;
++		break;
++	case __SI_POLL:
++		info->si_band = si->cpt_pid;
++		info->si_fd = si->cpt_uid;
++		break;
++	case __SI_FAULT:
++		info->si_addr = cpt_ptr_import(si->cpt_sigval);
++#ifdef __ARCH_SI_TRAPNO
++		info->si_trapno = si->cpt_pid;
++#endif
++		break;
++	case __SI_CHLD:
++		info->si_pid = si->cpt_pid;
++		info->si_uid = si->cpt_uid;
++		info->si_status = si->cpt_sigval;
++		info->si_stime = si->cpt_stime;
++		info->si_utime = si->cpt_utime;
++		break;
++	case __SI_KILL:
++	case __SI_RT:
++	case __SI_MESGQ:
++	default:
++		info->si_pid = si->cpt_pid;
++		info->si_uid = si->cpt_uid;
++		info->si_ptr = cpt_ptr_import(si->cpt_sigval);
++		break;
++	}
++	info->si_signo = si->cpt_signo;
++	info->si_errno = si->cpt_errno;
++	info->si_code = si->cpt_code;
++}
++
++static int restore_sigqueue(struct task_struct *tsk,
++			    struct sigpending *queue, unsigned long start,
++			    unsigned long end)
++{
++	while (start < end) {
++		struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start;
++		if (si->cpt_object == CPT_OBJ_SIGINFO) {
++			struct sigqueue *q = NULL;
++			struct user_struct *up;
++
++			up = alloc_uid(get_exec_env()->ve_ns->user_ns, si->cpt_user);
++			if (!up)
++				return -ENOMEM;
++			q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
++			if (!q) {
++				free_uid(up);
++				return -ENOMEM;
++			}
++			if (ub_siginfo_charge(q, get_exec_ub())) {
++				kmem_cache_free(sigqueue_cachep, q);
++				free_uid(up);
++				return -ENOMEM;
++			}
++
++			INIT_LIST_HEAD(&q->list);
++			/* Preallocated elements (posix timers) are not
++			 * supported yet.  It is safe to replace them
++			 * with private ones. */
++			q->flags = 0;
++			q->user = up;
++			atomic_inc(&q->user->sigpending);
++
++			decode_siginfo(&q->info, si);
++			list_add_tail(&q->list, &queue->list);
++		}
++		start += si->cpt_next;
++	}
++	return 0;
++}
++
++int rst_process_linkage(cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		struct cpt_task_image *ti = obj->o_image;
++
++		if (tsk == NULL) {
++			eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm);
++			return -EINVAL;
++		}
++
++		if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) {
++			struct pid *pid;
++
++			rcu_read_lock();
++			pid = find_vpid(ti->cpt_pgrp);
++			if (!pid) {
++				eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk));
++				return -EINVAL;
++			}
++
++			write_lock_irq(&tasklist_lock);
++			if (task_pgrp_nr(tsk) != pid_nr(pid)) {
++				detach_pid(tsk, PIDTYPE_PGID);
++				set_task_pgrp(tsk, pid_nr(pid));
++				if (thread_group_leader(tsk))
++					attach_pid(tsk, PIDTYPE_PGID, pid);
++			}
++			write_unlock_irq(&tasklist_lock);
++			if (task_pgrp_nr(tsk) != pid_nr(pid)) {
++				eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk));
++				return -EINVAL;
++			}
++			rcu_read_unlock();
++		}
++		if (task_session_vnr(tsk) != ti->cpt_session) {
++			struct pid *pid;
++
++			rcu_read_lock();
++			pid = find_vpid(ti->cpt_session);
++			if (!pid) {
++				eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk));
++				return -EINVAL;
++			}
++
++			write_lock_irq(&tasklist_lock);
++			if (task_session_nr(tsk) != pid_nr(pid)) {
++				detach_pid(tsk, PIDTYPE_SID);
++				set_task_session(tsk, pid_nr(pid));
++				if (thread_group_leader(tsk))
++					attach_pid(tsk, PIDTYPE_SID, pid);
++			}
++			write_unlock_irq(&tasklist_lock);
++			if (task_session_nr(tsk) != pid_nr(pid)) {
++				eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk));
++				return -EINVAL;
++			}
++			rcu_read_unlock();
++		}
++		if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) {
++			struct pid *pid;
++
++			rcu_read_lock();
++			pid = get_pid(find_vpid(ti->cpt_old_pgrp));
++			if (!pid) {
++				eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk));
++				return -EINVAL;
++			}
++			tsk->signal->tty_old_pgrp = pid;
++			rcu_read_unlock();
++		}
++	}
++
++	return 0;
++}
++
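++/*
++ * Allocate a struct pid with the given number in the current pid
++ * namespace.  If that number is already in use, e.g. another restored
++ * task instantiated it first, fall back to the existing struct pid so
++ * the caller can still attach to the right pgrp or session.
++ */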
++struct pid *alloc_vpid_safe(pid_t vnr)
++{
++	struct pid *pid;
++
++	pid = alloc_pid(current->nsproxy->pid_ns, vnr);
++	if (!pid)
++		pid = find_vpid(vnr);
++	return pid;
++}
++
++static int
++restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx)
++{
++	int err;
++	struct cpt_signal_image *si = cpt_get_buf(ctx);
++
++	current->signal->tty = NULL;
++
++	err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx);
++	if (err) {
++		cpt_release_buf(ctx);
++		return err;
++	}
++
++	if (task_pgrp_vnr(current) != si->cpt_pgrp) {
++		struct pid * pid = NULL, *free = NULL;
++
++		rcu_read_lock();
++		if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) {
++#if 0
++			if (!is_virtual_pid(si->cpt_pgrp)) {
++				eprintk_ctx("external process group " CPT_FID, CPT_TID(current));
++				cpt_release_buf(ctx);
++				return -EINVAL;
++			}
++#endif
++			pid = alloc_vpid_safe(si->cpt_pgrp);
++			free = pid;
++		}
++		write_lock_irq(&tasklist_lock);
++		if (pid != NULL) {
++			if (task_pgrp_nr(current) != pid_nr(pid)) {
++				detach_pid(current, PIDTYPE_PGID);
++				set_task_pgrp(current, pid_nr(pid));
++				if (thread_group_leader(current)) {
++					attach_pid(current, PIDTYPE_PGID, pid);
++					free = NULL;
++				}
++			}
++		}
++		write_unlock_irq(&tasklist_lock);
++		if (free != NULL)
++			free_pid(free);
++		rcu_read_unlock();
++	}
++
++	current->signal->tty_old_pgrp = NULL;
++	if ((int)si->cpt_old_pgrp > 0) {
++		if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) {
++			current->signal->tty_old_pgrp =
++					alloc_pid(current->nsproxy->pid_ns, 0);
++			if (!current->signal->tty_old_pgrp) {
++				eprintk_ctx("failed to allocate stray tty_old_pgrp\n");
++				cpt_release_buf(ctx);
++				return -EINVAL;
++			}
++		} else {
++			rcu_read_lock();
++			current->signal->tty_old_pgrp =
++				get_pid(alloc_vpid_safe(si->cpt_old_pgrp));
++			rcu_read_unlock();
++			if (!current->signal->tty_old_pgrp) {
++				dprintk_ctx("forward old tty PGID\n");
++				current->signal->tty_old_pgrp = NULL;
++			}
++		}
++	}
++
++	if (task_session_vnr(current) != si->cpt_session) {
++		struct pid * pid = NULL, *free = NULL;
++
++		rcu_read_lock();
++		if (si->cpt_session_type == CPT_PGRP_ORPHAN) {
++#if 0
++			if (!is_virtual_pid(si->cpt_session)) {
++				eprintk_ctx("external process session " CPT_FID, CPT_TID(current));
++				cpt_release_buf(ctx);
++				return -EINVAL;
++			}
++#endif
++			pid = alloc_vpid_safe(si->cpt_session);
++			free = pid;
++		}
++		write_lock_irq(&tasklist_lock);
++		if (pid == NULL)
++			pid = find_vpid(si->cpt_session);
++		if (pid != NULL) {
++			if (task_session_nr(current) != pid_nr(pid)) {
++				detach_pid(current, PIDTYPE_SID);
++				set_task_session(current, pid_nr(pid));
++				if (thread_group_leader(current)) {
++					attach_pid(current, PIDTYPE_SID, pid);
++					free = NULL;
++				}
++			}
++		}
++		write_unlock_irq(&tasklist_lock);
++		if (free != NULL)
++			free_pid(free);
++		rcu_read_unlock();
++	}
++
++	cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending);
++	current->signal->leader = si->cpt_leader;
++	if (si->cpt_ctty != CPT_NULL) {
++		cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx);
++		if (obj) {
++			struct tty_struct *tty = obj->o_obj;
++			if (!tty->session || tty->session ==
++					task_session(current)) {
++				tty->session = task_session(current);
++				current->signal->tty = tty;
++			} else {
++				wprintk_ctx("tty session mismatch\n");
++			}
++		}
++	}
++
++	if (si->cpt_curr_target)
++		current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target);
++	current->signal->flags = 0;
++	*exiting = si->cpt_group_exit;
++	current->signal->group_exit_code = si->cpt_group_exit_code;
++	if (si->cpt_group_exit_task) {
++		current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task);
++		if (current->signal->group_exit_task == NULL) {
++			eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task);
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	current->signal->notify_count = si->cpt_notify_count;
++	current->signal->group_stop_count = si->cpt_group_stop_count;
++
++	if (si->cpt_next > si->cpt_hdrlen) {
++		char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
++		if (buf == NULL) {
++			cpt_release_buf(ctx);
++			return -ENOMEM;
++		}
++		err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
++				 ti->cpt_signal + si->cpt_hdrlen);
++		if (err) {
++			kfree(buf);
++			cpt_release_buf(ctx);
++			return err;
++		}
++		restore_sigqueue(current,
++				 &current->signal->shared_pending, (unsigned long)buf,
++				 (unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
++		kfree(buf);
++	}
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_sighand_image si;
++	int i;
++	loff_t pos, endpos;
++
++	err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
++	if (err)
++		return err;
++
++	for (i=0; i<_NSIG; i++) {
++		current->sighand->action[i].sa.sa_handler = SIG_DFL;
++#ifndef CONFIG_IA64
++		current->sighand->action[i].sa.sa_restorer = 0;
++#endif
++		current->sighand->action[i].sa.sa_flags = 0;
++		memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
++	}
++
++	pos = ti->cpt_sighand + si.cpt_hdrlen;
++	endpos = ti->cpt_sighand + si.cpt_next;
++	while (pos < endpos) {
++		struct cpt_sighandler_image shi;
++
++		err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
++		if (err)
++			return err;
++		current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
++#ifndef CONFIG_IA64
++		current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
++#endif
++		current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags;
++		cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask);
++		pos += shi.cpt_next;
++	}
++
++	return 0;
++}
++
++
++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	__u32 flag = 0;
++
++	if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx))
++		flag |= CLONE_THREAD;
++	if (ti->cpt_sighand == CPT_NULL ||
++	    lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx))
++		flag |= CLONE_SIGHAND;
++	return flag;
++}
++
++int
++rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++
++	if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) {
++		return -EINVAL;
++	}
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx);
++	if (obj) {
++		struct sighand_struct *sig = current->sighand;
++		if (obj->o_obj != sig) {
++			return -EINVAL;
++		}
++	} else {
++		obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx);
++		if (obj == NULL)
++			return -ENOMEM;
++		cpt_obj_setpos(obj, ti->cpt_sighand, ctx);
++		err = restore_one_sighand_struct(ti, ctx);
++		if (err)
++			return err;
++	}
++
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx);
++	if (obj) {
++		struct signal_struct *sig = current->signal;
++		if (obj->o_obj != sig) {
++			return -EINVAL;
++		}
++/*		if (current->signal) {
++			pid_t session;
++
++			session = process_session(current);
++			set_process_vgroup(current, session);
++			set_signal_vsession(current->signal, session);
++		}*/
++	} else {
++		obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx);
++		if (obj == NULL)
++			return -ENOMEM;
++		cpt_obj_setpos(obj, ti->cpt_signal, ctx);
++		err = restore_one_signal_struct(ti, exiting, ctx);
++		if (err)
++			return err;
++	}
++
++	return 0;
++}
++
++#ifdef CONFIG_X86
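++/*
++ * Map a checkpointed segment id back to a real x86 selector.  A
++ * selector is (descriptor index << 3) | table bit | RPL: the TLS
++ * entries are GDT descriptors with RPL 3, while LDT entries also set
++ * bit 2 (TI), hence the "| 7": LDT table, ring 3.
++ */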
++static u32 decode_segment(u32 segid)
++{
++	if (segid == CPT_SEG_ZERO)
++		return 0;
++
++	/* TLS descriptors */
++	if (segid <= CPT_SEG_TLS3)
++		return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3;
++
++	/* LDT descriptor, it is just an index to LDT array */
++	if (segid >= CPT_SEG_LDT)
++		return ((segid - CPT_SEG_LDT) << 3) | 7;
++
++	/* Check for one of standard descriptors */
++#ifdef CONFIG_X86_64
++	if (segid == CPT_SEG_USER32_DS)
++		return __USER32_DS;
++	if (segid == CPT_SEG_USER32_CS)
++		return __USER32_CS;
++	if (segid == CPT_SEG_USER64_DS)
++		return __USER_DS;
++	if (segid == CPT_SEG_USER64_CS)
++		return __USER_CS;
++#else
++	if (segid == CPT_SEG_USER32_DS)
++		return __USER_DS;
++	if (segid == CPT_SEG_USER32_CS)
++		return __USER_CS;
++#endif
++	wprintk("Invalid segment reg %d\n", segid);
++	return 0;
++}
++#endif
++
++#if defined (CONFIG_IA64)
++void ia64_decrement_ip (struct pt_regs *regs)
++{
++	unsigned long w0, ri = ia64_psr(regs)->ri - 1;
++
++	if (ia64_psr(regs)->ri == 0) {
++		regs->cr_iip -= 16;
++		ri = 2;
++		get_user(w0, (char __user *) regs->cr_iip + 0);
++		if (((w0 >> 1) & 0xf) == 2) {
++			/*
++			 * rfi'ing to slot 2 of an MLX bundle causes
++			 * an illegal operation fault.  We don't want
++			 * that to happen...
++			 */
++			ri = 1;
++		}
++	}
++	ia64_psr(regs)->ri = ri;
++}
++#endif
++
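++/*
++ * Re-arm the CLONE_CHILD_SETTID/CLONE_CHILD_CLEARTID pointers:
++ * set_child_tid is where the kernel reports the thread's tid, and
++ * clear_child_tid is zeroed and futex-woken when the thread exits,
++ * which is what keeps pthread_join() working across a
++ * checkpoint/restore cycle.
++ */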
++static void rst_child_tid(unsigned long *child_tids)
++{
++	dprintk("rct: " CPT_FID "\n", CPT_TID(current));
++	current->clear_child_tid = (void*)child_tids[0];
++	current->set_child_tid = (void*)child_tids[1];
++}
++
++static void rst_last_siginfo(void)
++{
++	int signr;
++	siginfo_t *info = current->last_siginfo;
++	struct pt_regs *regs = task_pt_regs(current);
++	struct k_sigaction *ka;
++	int ptrace_id;
++
++	dprintk("rlsi: " CPT_FID "\n", CPT_TID(current));
++
++	spin_lock_irq(&current->sighand->siglock);
++	current->last_siginfo = NULL;
++	recalc_sigpending();
++
++	ptrace_id = current->pn_state;
++	clear_pn_state(current);
++
++	switch (ptrace_id) {
++	case PN_STOP_TF:
++	case PN_STOP_TF_RT:
++		/* frame_*signal */
++		dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n",
++		       task_pid_vnr(current), current->pid, current->comm,
++		       info->si_signo, info->si_code,
++		       current->exit_code, SYSCALL_NR(regs),
++		       current->ptrace, current->ptrace_message);
++		goto out;
++	case PN_STOP_ENTRY:
++	case PN_STOP_LEAVE:
++		/* do_syscall_trace */
++		spin_unlock_irq(&current->sighand->siglock);
++		dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code);
++		if (current->exit_code) {
++			send_sig(current->exit_code, current, 1);
++			current->exit_code = 0;
++		}
++		if (IN_SYSCALL(regs)) {
++			if (ptrace_id == PN_STOP_ENTRY
++#ifdef CONFIG_X86
++			    && SYSCALL_ERRNO(regs) == ENOSYS
++#endif
++			    )
++				SYSCALL_RESTART(regs);
++			else if (IN_ERROR(regs) &&
++				 syscall_is(current, regs, rt_sigtimedwait) &&
++				 (SYSCALL_ERRNO(regs) == EAGAIN ||
++				  SYSCALL_ERRNO(regs) == EINTR))
++				SYSCALL_RESTART(regs);
++		}
++		return;
++	case PN_STOP_FORK:
++		/* fork */
++		SYSCALL_SETRET(regs, current->ptrace_message);
++		dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs));
++		goto out;
++	case PN_STOP_VFORK:
++		/* after vfork */
++		SYSCALL_SETRET(regs, current->ptrace_message);
++		dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs));
++		goto out;
++	case PN_STOP_SIGNAL:
++		/* normal case : dequeue signal */
++		break;
++	case PN_STOP_EXIT:
++		dprintk("ptrace exit caught\n");
++		current->ptrace &= ~PT_TRACE_EXIT;
++		spin_unlock_irq(&current->sighand->siglock);
++		module_put(THIS_MODULE);
++		complete_and_exit(NULL, current->ptrace_message);
++		BUG();
++	case PN_STOP_EXEC:
++		eprintk("ptrace after exec caught: must not happen\n");
++		BUG();
++	default:
++		eprintk("ptrace with unknown identity %d\n", ptrace_id);
++		BUG();
++	}
++
++	signr = current->exit_code;
++	if (signr == 0) {
++		dprintk("rlsi: canceled signal %d\n", info->si_signo);
++		goto out;
++	}
++	current->exit_code = 0;
++
++	if (signr != info->si_signo) {
++		info->si_signo = signr;
++		info->si_errno = 0;
++		info->si_code = SI_USER;
++		info->si_pid = task_pid_vnr(current->parent);
++		info->si_uid = current->parent->uid;
++	}
++
++	/* If the (new) signal is now blocked, requeue it.  */
++	if (sigismember(&current->blocked, signr)) {
++		dprintk("going to requeue signal %d\n", signr);
++		goto out_resend_sig;
++	}
++
++	ka = &current->sighand->action[signr-1];
++	if (ka->sa.sa_handler == SIG_IGN) {
++		dprintk("going to resend signal %d (ignored)\n", signr);
++		goto out;
++	}
++	if (ka->sa.sa_handler != SIG_DFL) {
++		dprintk("going to resend signal %d (not SIG_DFL)\n", signr);
++		goto out_resend_sig;
++	}
++	if (signr == SIGCONT ||
++	    signr == SIGCHLD ||
++	    signr == SIGWINCH ||
++	    signr == SIGURG ||
++	    current->pid == 1)
++		goto out;
++
++	/* All the rest, which we cannot handle are requeued. */
++	dprintk("going to resend signal %d (sigh)\n", signr);
++out_resend_sig:
++	spin_unlock_irq(&current->sighand->siglock);
++	send_sig_info(signr, info, current);
++	return;
++
++out:
++	spin_unlock_irq(&current->sighand->siglock);
++}
++
++static void rst_finish_stop(void)
++{
++	/* ...
++	 * do_signal() ->
++	 *   get_signal_to_deliver() ->
++	 *     do_signal_stop() ->
++	 *       finish_stop()
++	 *
++	 * Normally after SIGCONT it will dequeue the next signal. If no signal
++	 * is found, do_signal restarts syscall unconditionally.
++	 * Otherwise signal handler is pushed on user stack.
++	 */
++
++	dprintk("rfs: " CPT_FID "\n", CPT_TID(current));
++
++	clear_stop_state(current);
++	current->exit_code = 0;
++}
++
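++/*
++ * The ERESTART* pseudo-errors never reach userspace; normally the
++ * signal-delivery path consumes them.  This task was frozen before
++ * that could happen, so replay the decision here: ERESTARTSYS,
++ * ERESTARTNOINTR and ERESTARTNOHAND re-execute the syscall directly,
++ * while ERESTART_RESTARTBLOCK must go through sys_restart_syscall()
++ * so that the saved restart_block is honoured.
++ */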
++static void rst_restart_sys(void)
++{
++	struct pt_regs *regs = task_pt_regs(current);
++
++	/* This hook is supposed to be executed when we have
++	 * to complete an interrupted syscall.
++	 */
++	dprintk("rrs: " CPT_FID "\n", CPT_TID(current));
++
++	if (!IN_SYSCALL(regs) || !IN_ERROR(regs))
++		return;
++
++#ifdef __NR_pause
++	if (syscall_is(current,regs,pause)) {
++		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
++			current->state = TASK_INTERRUPTIBLE;
++			schedule();
++		}
++	} else
++#else
++	/* On this arch pause() is simulated with sigsuspend(). */
++	if (syscall_is(current,regs,rt_sigsuspend)) {
++		if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
++			current->state = TASK_INTERRUPTIBLE;
++			schedule();
++		}
++	} else
++#endif
++	if (syscall_is(current,regs,rt_sigtimedwait)) {
++		if (SYSCALL_ERRNO(regs) == EAGAIN ||
++		    SYSCALL_ERRNO(regs) == EINTR) {
++			SYSCALL_RESTART(regs);
++		}
++	} else if (syscall_is(current,regs,futex)) {
++		if (SYSCALL_ERRNO(regs) == EINTR &&
++		    !signal_pending(current)) {
++			SYSCALL_RESTART(regs);
++		}
++	}
++
++	if (!signal_pending(current) &&
++	    !(current_thread_info()->status & TS_RESTORE_SIGMASK)) {
++		if (SYSCALL_ERRNO(regs) == ERESTARTSYS ||
++		    SYSCALL_ERRNO(regs) == ERESTARTNOINTR ||
++		    SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
++			SYSCALL_RESTART(regs);
++		} else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) {
++			int new = __NR_restart_syscall;
++#ifdef CONFIG_X86_64
++			if (task_thread_info(current)->flags&_TIF_IA32)
++				new = __NR32_restart_syscall;
++#endif
++			SYSCALL_RESTART2(regs, new);
++		}
++	}
++}
++
++#ifdef CONFIG_X86_32
++
++static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
++			     struct cpt_task_image *ti, struct cpt_x86_regs *b,
++			     struct resume_info **rip, struct cpt_context *ctx)
++{
++	extern char i386_ret_from_resume;
++
++	if (b->cpt_object != CPT_OBJ_X86_REGS)
++		return -EINVAL;
++
++	tsk->thread.sp = (unsigned long) regs;
++	tsk->thread.sp0 = (unsigned long) (regs+1);
++	tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
++
++	tsk->thread.gs = decode_segment(b->cpt_gs);
++	tsk->thread.debugreg0 = b->cpt_debugreg[0];
++	tsk->thread.debugreg1 = b->cpt_debugreg[1];
++	tsk->thread.debugreg2 = b->cpt_debugreg[2];
++	tsk->thread.debugreg3 = b->cpt_debugreg[3];
++	tsk->thread.debugreg6 = b->cpt_debugreg[6];
++	tsk->thread.debugreg7 = b->cpt_debugreg[7];
++
++	regs->bx = b->cpt_ebx;
++	regs->cx = b->cpt_ecx;
++	regs->dx = b->cpt_edx;
++	regs->si = b->cpt_esi;
++	regs->di = b->cpt_edi;
++	regs->bp = b->cpt_ebp;
++	regs->ax = b->cpt_eax;
++	regs->ds = b->cpt_xds;
++	regs->es = b->cpt_xes;
++	regs->orig_ax = b->cpt_orig_eax;
++	regs->ip = b->cpt_eip;
++	regs->cs = b->cpt_xcs;
++	regs->flags = b->cpt_eflags;
++	regs->sp = b->cpt_esp;
++	regs->ss = b->cpt_xss;
++
++	regs->cs = decode_segment(b->cpt_xcs);
++	regs->ss = decode_segment(b->cpt_xss);
++	regs->ds = decode_segment(b->cpt_xds);
++	regs->es = decode_segment(b->cpt_xes);
++	regs->fs = decode_segment(b->cpt_fs);
++
++	tsk->thread.sp -= HOOK_RESERVE;
++	memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
++	*rip = (void*)tsk->thread.sp;
++
++	return 0;
++}
++
++#elif defined(CONFIG_X86_64)
++
++static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s)
++{
++	memset(d, 0, sizeof(struct pt_regs));
++	d->bp = s->cpt_ebp;
++	d->bx = s->cpt_ebx;
++	d->ax = (s32)s->cpt_eax;
++	d->cx = s->cpt_ecx;
++	d->dx = s->cpt_edx;
++	d->si = s->cpt_esi;
++	d->di = s->cpt_edi;
++	d->orig_ax = (s32)s->cpt_orig_eax;
++	d->ip = s->cpt_eip;
++	d->cs = s->cpt_xcs;
++	d->flags = s->cpt_eflags;
++	d->sp = s->cpt_esp;
++	d->ss = s->cpt_xss;
++}
++
++static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
++			     struct cpt_task_image *ti, struct cpt_obj_bits *hdr,
++			     struct resume_info **rip, struct cpt_context *ctx)
++{
++	if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) {
++		struct cpt_x86_64_regs *b = (void*)hdr;
++
++		tsk->thread.sp = (unsigned long) regs;
++		tsk->thread.sp0 = (unsigned long) (regs+1);
++
++		tsk->thread.fs = b->cpt_fsbase;
++		tsk->thread.gs = b->cpt_gsbase;
++		tsk->thread.fsindex = decode_segment(b->cpt_fsindex);
++		tsk->thread.gsindex = decode_segment(b->cpt_gsindex);
++		tsk->thread.ds = decode_segment(b->cpt_ds);
++		tsk->thread.es = decode_segment(b->cpt_es);
++		tsk->thread.debugreg0 = b->cpt_debugreg[0];
++		tsk->thread.debugreg1 = b->cpt_debugreg[1];
++		tsk->thread.debugreg2 = b->cpt_debugreg[2];
++		tsk->thread.debugreg3 = b->cpt_debugreg[3];
++		tsk->thread.debugreg6 = b->cpt_debugreg[6];
++		tsk->thread.debugreg7 = b->cpt_debugreg[7];
++
++		memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs));
++
++		tsk->thread.usersp = regs->sp;
++		regs->cs = decode_segment(b->cpt_cs);
++		regs->ss = decode_segment(b->cpt_ss);
++	} else if (hdr->cpt_object == CPT_OBJ_X86_REGS) {
++		struct cpt_x86_regs *b = (void*)hdr;
++
++		tsk->thread.sp = (unsigned long) regs;
++		tsk->thread.sp0 = (unsigned long) (regs+1);
++
++		tsk->thread.fs = 0;
++		tsk->thread.gs = 0;
++		tsk->thread.fsindex = decode_segment(b->cpt_fs);
++		tsk->thread.gsindex = decode_segment(b->cpt_gs);
++		tsk->thread.debugreg0 = b->cpt_debugreg[0];
++		tsk->thread.debugreg1 = b->cpt_debugreg[1];
++		tsk->thread.debugreg2 = b->cpt_debugreg[2];
++		tsk->thread.debugreg3 = b->cpt_debugreg[3];
++		tsk->thread.debugreg6 = b->cpt_debugreg[6];
++		tsk->thread.debugreg7 = b->cpt_debugreg[7];
++
++		xlate_ptregs_32_to_64(regs, b);
++
++		tsk->thread.usersp = regs->sp;
++		regs->cs = decode_segment(b->cpt_xcs);
++		regs->ss = decode_segment(b->cpt_xss);
++		tsk->thread.ds = decode_segment(b->cpt_xds);
++		tsk->thread.es = decode_segment(b->cpt_xes);
++	} else {
++		return -EINVAL;
++	}
++
++	tsk->thread.sp -= HOOK_RESERVE;
++	memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
++	*rip = (void*)tsk->thread.sp;
++	return 0;
++}
++
++#elif defined(CONFIG_IA64)
++
++#define MASK(nbits)	((1UL << (nbits)) - 1)	/* mask with NBITS bits set */
++
++#define PUT_BITS(first, last, nat)					\
++	({								\
++		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
++		unsigned long nbits = (last - first + 1);		\
++		unsigned long mask = MASK(nbits) << first;		\
++		long dist;						\
++		if (bit < first)					\
++			dist = 64 + bit - first;			\
++		else							\
++			dist = bit - first;				\
++		ia64_rotl(nat & mask, dist);				\
++	})
++
++unsigned long
++ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
++{
++	unsigned long scratch_unat;
++
++	/*
++	 * Registers that are stored consecutively in struct pt_regs
++	 * can be handled in parallel.  If the register order in
++	 * struct_pt_regs changes, this code MUST be updated.
++	 */
++	scratch_unat  = PUT_BITS( 1,  1, nat);
++	scratch_unat |= PUT_BITS( 2,  3, nat);
++	scratch_unat |= PUT_BITS(12, 13, nat);
++	scratch_unat |= PUT_BITS(14, 14, nat);
++	scratch_unat |= PUT_BITS(15, 15, nat);
++	scratch_unat |= PUT_BITS( 8, 11, nat);
++	scratch_unat |= PUT_BITS(16, 31, nat);
++
++	return scratch_unat;
++
++}
++
++static unsigned long
++ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat)
++{
++	unsigned long scratch_unat;
++
++	scratch_unat  = PUT_BITS( 4,  7, nat);
++
++	return scratch_unat;
++
++}
++
++#undef PUT_BITS
++
++
++static int restore_registers(struct task_struct *tsk, struct pt_regs *pt,
++			     struct cpt_task_image *ti,
++			     struct cpt_ia64_regs *r,
++			     struct resume_info **rip,
++			     struct cpt_context *ctx)
++{
++	extern char ia64_ret_from_resume;
++	struct switch_stack *sw;
++	struct resume_info *ri;
++	struct ia64_psr *psr = ia64_psr(pt);
++	void *krbs = (void *)tsk + IA64_RBS_OFFSET;
++	unsigned long reg;
++
++	if (r->cpt_object != CPT_OBJ_IA64_REGS)
++		return -EINVAL;
++
++	if (r->num_regs > 96) {
++		eprintk(CPT_FID " too much RSE regs %lu\n",
++			CPT_TID(tsk), r->num_regs);
++		return -EINVAL;
++	}
++
++	*rip = ri = ((void*)pt) - HOOK_RESERVE;
++	sw = ((struct switch_stack *) ri) - 1;
++
++	memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack));
++	memset(ri, 0, HOOK_RESERVE);
++
++	/* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
++	memcpy(&pt->r1,  &r->gr[1],  8*(2-1));
++	memcpy(&pt->r2,  &r->gr[2],  8*(4-2));
++	memcpy(&pt->r8,  &r->gr[8],  8*(12-8));
++	memcpy(&pt->r12, &r->gr[12], 8*(14-12));
++	memcpy(&pt->r14, &r->gr[14], 8*(15-14));
++	memcpy(&pt->r15, &r->gr[15], 8*(16-15));
++	memcpy(&pt->r16, &r->gr[16], 8*(32-16));
++
++	pt->b0 = r->br[0];
++	pt->b6 = r->br[6];
++	pt->b7 = r->br[7];
++
++	pt->ar_bspstore	= r->ar_bspstore;
++	pt->ar_unat	= r->ar_unat;
++	pt->ar_pfs	= r->ar_pfs;
++	pt->ar_ccv	= r->ar_ccv;
++	pt->ar_fpsr	= r->ar_fpsr;
++	pt->ar_csd	= r->ar_csd;
++	pt->ar_ssd	= r->ar_ssd;
++	pt->ar_rsc	= r->ar_rsc;
++
++	pt->cr_iip	= r->cr_iip;
++	pt->cr_ipsr	= r->cr_ipsr;
++
++	pt->pr = r->pr;
++
++	pt->cr_ifs = r->cfm;
++
++	/* fpregs 6..9,10..11 are in pt_regs */
++	memcpy(&pt->f6,  &r->fr[2*6],  16*(10-6));
++	memcpy(&pt->f10, &r->fr[2*10], 16*(12-10));
++	/* fpreg 12..15 are on switch stack */
++	memcpy(&sw->f12, &r->fr[2*12], 16*(16-12));
++	/* fpregs 32...127 */
++	tsk->thread.flags |= IA64_THREAD_FPH_VALID;
++	memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32));
++	ia64_drop_fpu(tsk);
++	psr->dfh = 1;
++
++	memcpy(&sw->r4, &r->gr[4], 8*(8-4));
++	memcpy(&sw->b1, &r->br[1], 8*(6-1));
++	sw->ar_lc = r->ar_lc;
++
++	memcpy(&sw->f2, &r->fr[2*2], 16*(6-2));
++	memcpy(&sw->f16, &r->fr[2*16], 16*(32-16));
++
++	sw->caller_unat = 0;
++	sw->ar_fpsr = pt->ar_fpsr;
++	sw->ar_unat = 0;
++	if (r->nat[0] & 0xFFFFFF0FUL)
++		sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]);
++	if (r->nat[0] & 0xF0)
++		sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]);
++
++	sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs);
++	memset(krbs, 0, (void*)sw->ar_bspstore - krbs);
++	sw->ar_rnat = 0;
++	sw->ar_pfs = 0;
++
++	/* This is tricky. When we are in a syscall, we have a frame
++	 * of output registers (and sometimes one input register as well).
++	 * Such a frame is not easy to restore: the RSE optimizes and
++	 * does not fetch those regs from the backing store. So we restore
++	 * the whole frame as local registers, and then repartition it
++	 * in ia64_ret_from_resume().
++	 */
++	if ((long)pt->cr_ifs >= 0) {
++		unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F);
++		sw->ar_pfs = out | (out<<7);
++	}
++	if (r->ar_ec)
++		sw->ar_pfs |= (r->ar_ec & 0x3F) << 52;
++
++	for (reg = 0; reg < r->num_regs; reg++) {
++		unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
++		unsigned long *rnatp;
++		unsigned long set_rnat = 0;
++
++		*ptr = r->gr[32+reg];
++
++		if (reg < 32)
++			set_rnat = (r->nat[0] & (1UL<<(reg+32)));
++		else
++			set_rnat = (r->nat[1] & (1UL<<(reg-32)));
++
++		if (set_rnat) {
++			rnatp = ia64_rse_rnat_addr(ptr);
++			if ((unsigned long)rnatp >= sw->ar_bspstore)
++				rnatp = &sw->ar_rnat;
++			*rnatp |= (1UL<<ia64_rse_slot_num(ptr));
++		}
++	}
++	
++	sw->b0 = (unsigned long) &ia64_ret_from_resume;
++	tsk->thread.ksp = (unsigned long) sw - 16;
++
++#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
++#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
++#define PRED_USER_STACK		3 /* returning to user-stacks? */
++#define PRED_SYSCALL		4 /* inside a system call? */
++#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
++
++	pt->loadrs = r->loadrs;
++	sw->pr = 0;
++	sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL);
++	sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL));
++	sw->pr &= ~(1UL << PRED_KERNEL_STACK);
++	sw->pr |= (1UL << PRED_USER_STACK);
++	if ((long)pt->cr_ifs < 0) {
++		sw->pr |= (1UL << PRED_NON_SYSCALL);
++	} else {
++		sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
++	}
++
++	return 0;
++}
++#endif
++
++asmlinkage void rst_resume_work(struct resume_info *ri)
++{
++	if (ri->hooks & (1<<HOOK_TID))
++		rst_child_tid(ri->tid_ptrs);
++	if (ri->hooks & (1<<HOOK_CONT))
++		rst_finish_stop();
++	if (ri->hooks & (1<<HOOK_LSI))
++		rst_last_siginfo();
++	if (ri->hooks & (1<<HOOK_RESTART))
++		rst_restart_sys();
++	module_put(THIS_MODULE);
++}
++
++static void rst_apply_mxcsr_mask(struct task_struct *tsk)
++{
++#ifdef CONFIG_X86_32
++	unsigned int flags;
++
++	flags = test_cpu_caps();
++
++	/* If the CPU does not support SSE2, mask bit 6 (the DAZ flag) and
++	   bits 16-31 in MXCSR to avoid a general protection fault. */
++	if (!(flags & (1 << CPT_CPU_X86_SSE2)))
++		tsk->thread.xstate->fxsave.mxcsr &= 0x0000ffbf;
++#endif
++}
++
++int rst_restore_process(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		struct cpt_task_image *ti = obj->o_image;
++		struct pt_regs * regs;
++		struct cpt_object_hdr *b;
++		struct cpt_siginfo_image *lsi = NULL;
++		struct group_info *gids, *ogids;
++		struct resume_info *ri = NULL;
++		int i;
++		int err = 0;
++#ifdef CONFIG_BEANCOUNTERS
++		struct task_beancounter *tbc;
++		struct user_beancounter *new_bc, *old_bc;
++#endif
++
++		if (tsk == NULL) {
++			eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm);
++			return -EFAULT;
++		}
++
++		wait_task_inactive(tsk);
++#ifdef CONFIG_BEANCOUNTERS
++		tbc = &tsk->task_bc;
++		new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx);
++		err = virtinfo_notifier_call(VITYPE_SCP,
++				VIRTINFO_SCP_RSTTSK, new_bc);
++		if (err & NOTIFY_FAIL) {
++			put_beancounter(new_bc);
++			return -ECHRNG; 
++		}
++		old_bc = tbc->exec_ub;
++		if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) {
++			dprintk(" *** replacing ub %p by %p for %p (%d %s)\n",
++					old_bc, new_bc, tsk,
++					tsk->pid, tsk->comm);
++			tbc->exec_ub = new_bc;
++			new_bc = old_bc;
++		}
++		put_beancounter(new_bc);
++#endif
++		regs = task_pt_regs(tsk);
++
++		if (!tsk->exit_state) {
++			tsk->lock_depth = -1;
++#ifdef CONFIG_PREEMPT
++			task_thread_info(tsk)->preempt_count--;
++#endif
++		}
++
++		if (tsk->static_prio != ti->cpt_static_prio)
++			set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio));
++
++		cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked);
++		cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked);
++		cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked);
++		cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending);
++
++		tsk->uid = ti->cpt_uid;
++		tsk->euid = ti->cpt_euid;
++		tsk->suid = ti->cpt_suid;
++		tsk->fsuid = ti->cpt_fsuid;
++		tsk->gid = ti->cpt_gid;
++		tsk->egid = ti->cpt_egid;
++		tsk->sgid = ti->cpt_sgid;
++		tsk->fsgid = ti->cpt_fsgid;
++#ifdef CONFIG_IA64
++		SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac);
++		SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu);
++#endif
++		memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective));
++		memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable));
++		memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted));
++		if (ctx->image_version < CPT_VERSION_26)
++			tsk->securebits = (ti->cpt_keepcap != 0) ?
++				issecure_mask(SECURE_KEEP_CAPS) : 0;
++		else
++			tsk->securebits = ti->cpt_keepcap;
++		tsk->did_exec = (ti->cpt_did_exec != 0);
++		gids = groups_alloc(ti->cpt_ngids);
++		ogids = tsk->group_info;
++		if (gids) {
++			int i;
++			for (i=0; i<32; i++)
++				gids->small_block[i] = ti->cpt_gids[i];
++			tsk->group_info = gids;
++		}
++		if (ogids)
++			put_group_info(ogids);
++		tsk->utime = ti->cpt_utime;
++		tsk->stime = ti->cpt_stime;
++		if (ctx->image_version == CPT_VERSION_8)
++			tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC);
++		else
++			cpt_timespec_import(&tsk->start_time, ti->cpt_starttime);
++		_set_normalized_timespec(&tsk->start_time,
++					tsk->start_time.tv_sec +
++					VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec,
++					tsk->start_time.tv_nsec +
++					VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec);
++
++		tsk->nvcsw = ti->cpt_nvcsw;
++		tsk->nivcsw = ti->cpt_nivcsw;
++		tsk->min_flt = ti->cpt_min_flt;
++		tsk->maj_flt = ti->cpt_maj_flt;
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
++		tsk->cutime = ti->cpt_cutime;
++		tsk->cstime = ti->cpt_cstime;
++		tsk->cnvcsw = ti->cpt_cnvcsw;
++		tsk->cnivcsw = ti->cpt_cnivcsw;
++		tsk->cmin_flt = ti->cpt_cmin_flt;
++		tsk->cmaj_flt = ti->cpt_cmaj_flt;
++
++		BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
++
++		for (i=0; i<RLIM_NLIMITS; i++) {
++			tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
++			tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i];
++		}
++#else
++		if (thread_group_leader(tsk) && tsk->signal) {
++			tsk->signal->utime = ti->cpt_utime;
++			tsk->signal->stime = ti->cpt_stime;
++			tsk->signal->cutime = ti->cpt_cutime;
++			tsk->signal->cstime = ti->cpt_cstime;
++			tsk->signal->nvcsw = ti->cpt_nvcsw;
++			tsk->signal->nivcsw = ti->cpt_nivcsw;
++			tsk->signal->cnvcsw = ti->cpt_cnvcsw;
++			tsk->signal->cnivcsw = ti->cpt_cnivcsw;
++			tsk->signal->min_flt = ti->cpt_min_flt;
++			tsk->signal->maj_flt = ti->cpt_maj_flt;
++			tsk->signal->cmin_flt = ti->cpt_cmin_flt;
++			tsk->signal->cmaj_flt = ti->cpt_cmaj_flt;
++
++			for (i=0; i<RLIM_NLIMITS; i++) {
++				tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
++				tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i];
++			}
++		}
++#endif
++
++#ifdef CONFIG_X86
++		for (i=0; i<3; i++) {
++			if (i >= GDT_ENTRY_TLS_ENTRIES) {
++				eprintk_ctx("too many tls descs\n");
++			} else {
++				tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF;
++				tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32;
++			}
++		}
++#endif
++
++		clear_stopped_child_used_math(tsk);
++
++		b = (void *)(ti+1);
++		while ((void*)b < ((void*)ti) + ti->cpt_next) {
++			/* Siginfo objects are at the end of obj array */
++			if (b->cpt_object == CPT_OBJ_SIGINFO) {
++				struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
++				restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next);
++				set_exec_env(env);
++				break;
++			}
++
++			switch (b->cpt_object) {
++#ifdef CONFIG_X86
++			case CPT_OBJ_BITS:
++				if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE &&
++				    cpu_has_fxsr) {
++					memcpy(tsk->thread.xstate,
++					       (void*)b + b->cpt_hdrlen,
++					       sizeof(struct i387_fxsave_struct));
++					rst_apply_mxcsr_mask(tsk);
++					if (ti->cpt_used_math)
++						set_stopped_child_used_math(tsk);
++				}
++#ifndef CONFIG_X86_64
++				else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
++					 !cpu_has_fxsr) {		
++					memcpy(tsk->thread.xstate,
++					       (void*)b + b->cpt_hdrlen,
++					       sizeof(struct i387_fsave_struct));
++					if (ti->cpt_used_math)
++						set_stopped_child_used_math(tsk);
++				}
++#endif
++				break;
++#endif
++			case CPT_OBJ_LASTSIGINFO:
++				lsi = (void*)b;
++				break;
++			case CPT_OBJ_X86_REGS:
++			case CPT_OBJ_X86_64_REGS:
++			case CPT_OBJ_IA64_REGS:
++				if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) {
++					eprintk_ctx("cannot restore registers: image is corrupted\n");
++					return -EINVAL;
++				}
++				break;
++			case CPT_OBJ_SIGALTSTACK: {
++				struct cpt_sigaltstack_image *sas;
++				sas = (struct cpt_sigaltstack_image *)b;
++				tsk->sas_ss_sp = sas->cpt_stack;
++				tsk->sas_ss_size = sas->cpt_stacksize;
++				break;
++			    }
++			case CPT_OBJ_TASK_AUX: {
++				struct cpt_task_aux_image *ai;
++				ai = (struct cpt_task_aux_image *)b;
++				tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list);
++#ifdef CONFIG_X86_64
++#ifdef CONFIG_COMPAT
++				if (task_thread_info(tsk)->flags&_TIF_IA32) {
++					tsk->robust_list = (void __user *)NULL;
++					tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list);
++				}
++#endif
++#endif
++				break;
++			    }
++			}
++			b = ((void*)b) + b->cpt_next;
++		}
++
++		if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++			eprintk_ctx("missing register info\n");
++			return -EINVAL;
++		}
++
++		if (ti->cpt_ppid != ti->cpt_rppid) {
++			struct task_struct *parent;
++			struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
++			write_lock_irq(&tasklist_lock);
++			parent = find_task_by_vpid(ti->cpt_ppid);
++			if (parent && parent != tsk->parent) {
++				list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children);
++				remove_parent(tsk);
++				tsk->parent = parent;
++				add_parent(tsk);
++			}
++			write_unlock_irq(&tasklist_lock);
++			set_exec_env(env);
++		}
++
++		tsk->ptrace_message = ti->cpt_ptrace_message;
++		tsk->pn_state = ti->cpt_pn_state;
++		tsk->stopped_state = ti->cpt_stopped_state;
++		task_thread_info(tsk)->flags = ti->cpt_thrflags;
++
++		/* The image was created with a kernel < 2.6.16, while
++		 * the task hung in sigsuspend -> do_signal.
++		 *
++		 * FIXME! This needs more thought...
++		 */
++		if (ti->cpt_sigsuspend_state) {
++			set_restore_sigmask();
++		}
++
++#ifdef CONFIG_X86_64
++		task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME;
++		if (!ti->cpt_64bit)
++			task_thread_info(tsk)->flags |= _TIF_IA32;
++#endif
++
++#ifdef CONFIG_X86_32
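++		/*
++		 * select()/_newselect() keep their timeout in user memory
++		 * and it counts wall-clock time, so a task restored after
++		 * a migration would otherwise sleep too long.  Rewind the
++		 * user-space timeval by delta_time, borrowing one second
++		 * when the microseconds go negative, and clamp at zero.
++		 */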
++		do {
++			if (regs->orig_ax == __NR__newselect && regs->di) {
++				struct timeval tv;
++				if (access_process_vm(tsk, regs->di, &tv, 
++						sizeof(tv), 0) != sizeof(tv)) {
++					wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n",
++						task_pid_vnr(tsk), tsk->pid, tsk->comm,
++					       regs->di);
++					break;
++				}
++				dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n",
++				       task_pid_vnr(tsk), tsk->pid, tsk->comm,
++				       tv.tv_sec, tv.tv_usec);
++				tv.tv_sec -= ctx->delta_time.tv_sec;
++				if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
++					tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
++					tv.tv_sec--;
++				} else {
++					tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
++				}
++				if (tv.tv_sec < 0) {
++					tv.tv_sec = 0;
++					tv.tv_usec = 0;
++				}
++				dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n",
++					task_pid_vnr(tsk), tsk->pid, tsk->comm,
++				       tv.tv_sec, tv.tv_usec);
++				if (access_process_vm(tsk, regs->di, &tv, 
++						sizeof(tv), 1) != sizeof(tv)) {
++					wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n",
++						task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di);
++				}
++				
++			} else if (regs->orig_ax == __NR_select && regs->di) {
++				struct {
++					unsigned long n;
++					fd_set __user *inp, *outp, *exp;
++					struct timeval __user *tvp;
++				} a;
++				struct timeval tv;
++				if (access_process_vm(tsk, regs->bx, &a, 
++						sizeof(a), 0) != sizeof(a)) {
++					wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid);
++					break;
++				}
++				if (access_process_vm(tsk, (unsigned long)a.tvp,
++						&tv, sizeof(tv), 0) != sizeof(tv)) {
++					wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid);
++					break;
++				}
++				dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n",
++					tsk->pid, tv.tv_sec, tv.tv_usec);
++				tv.tv_sec -= ctx->delta_time.tv_sec;
++				if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
++					tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
++					tv.tv_sec--;
++				} else {
++					tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
++				}
++				if (tv.tv_sec < 0) {
++					tv.tv_sec = 0;
++					tv.tv_usec = 0;
++				}
++				dprintk_ctx("task %d: New timeval in select: %ld.%ld\n",
++					tsk->pid, tv.tv_sec, tv.tv_usec);
++				if (access_process_vm(tsk, (unsigned long)a.tvp,
++						&tv, sizeof(tv), 1) != sizeof(tv)) {
++					wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid);
++				}
++			}
++		} while (0);
++#endif
++
++		if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) {
++			switch (SYSCALL_ERRNO(regs)) {
++			case ERESTARTSYS:
++			case ERESTARTNOINTR:
++			case ERESTARTNOHAND:
++			case ERESTART_RESTARTBLOCK:
++			case EAGAIN:
++			case EINTR:
++				ri->hooks |= (1<<HOOK_RESTART);
++			}
++		}
++
++		if (ri && (lsi || tsk->pn_state)) {
++			/* ... -> ptrace_notify()
++			 * or
++			 * ... -> do_signal() -> get_signal_to_deliver() ->
++			 *   ptrace stop
++			 */
++			tsk->last_siginfo = &ri->last_siginfo;
++			ri->hooks |= (1<<HOOK_LSI);
++			if (lsi)
++				decode_siginfo(tsk->last_siginfo, lsi);
++		}
++
++		tsk->ptrace = ti->cpt_ptrace;
++		tsk->flags = ti->cpt_flags & ~PF_FROZEN;
++		clear_tsk_thread_flag(tsk, TIF_FREEZE);
++		tsk->exit_signal = ti->cpt_exit_signal;
++
++		if (ri && tsk->stopped_state) {
++			dprintk_ctx("finish_stop\n");
++			if (ti->cpt_state != TASK_STOPPED)
++				eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state);
++			ri->hooks |= (1<<HOOK_CONT);
++		}
++
++		if (ri && (ti->cpt_set_tid || ti->cpt_clear_tid)) {
++			ri->hooks |= (1<<HOOK_TID);
++			ri->tid_ptrs[0] = ti->cpt_clear_tid;
++			ri->tid_ptrs[1] = ti->cpt_set_tid;
++			dprintk_ctx("settids\n");
++		}
++
++		if (ri && ri->hooks &&
++		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++			if (try_module_get(THIS_MODULE))
++				ri->hook = rst_resume_work;
++		}
++
++		if (ti->cpt_state == TASK_TRACED)
++			tsk->state = TASK_TRACED;
++		else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) {
++			tsk->signal->it_virt_expires = 0;
++			tsk->signal->it_prof_expires = 0;
++			if (tsk->state != EXIT_DEAD)
++				eprintk_ctx("oops, schedule() did not make us dead\n");
++		}
++
++		if (thread_group_leader(tsk) &&
++		    ti->cpt_it_real_value &&
++		    !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++			ktime_t val;
++			s64 nsec;
++
++			nsec = ti->cpt_it_real_value;
++			val.tv64 = 0;
++
++			if (ctx->image_version < CPT_VERSION_9)
++				nsec *= TICK_NSEC;
++
++			val = ktime_add_ns(val, nsec - ctx->delta_nsec);
++			if (val.tv64 <= 0)
++				val.tv64 = NSEC_PER_USEC;
++			dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk),
++				(long long)val.tv64,
++				(unsigned long long)ti->cpt_it_real_value);
++
++			spin_lock_irq(&tsk->sighand->siglock);
++			if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) {
++				/* FIXME. Check!!!! */
++				hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL);
++			} else {
++				wprintk_ctx("Timer clash. Impossible?\n");
++			}
++			spin_unlock_irq(&tsk->sighand->siglock);
++
++			dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk),
++				    (unsigned long long)val.tv64);
++		}
++
++		module_put(THIS_MODULE);
++	}
++	return 0;
++}
+diff --git a/kernel/cpt/rst_socket.c b/kernel/cpt/rst_socket.c
+new file mode 100644
+index 0000000..d90488e
+--- /dev/null
++++ b/kernel/cpt/rst_socket.c
+@@ -0,0 +1,918 @@
++/*
++ *
++ *  kernel/cpt/rst_socket.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/socket.h>
++#include <linux/un.h>
++#include <net/tcp.h>
++#include <net/sock.h>
++#include <net/scm.h>
++#include <net/af_unix.h>
++
++#include <bc/kmem.h>
++#include <bc/sock_orphan.h>
++#include <bc/net.h>
++#include <bc/tcp.h>
++
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
++#include "cpt_syscalls.h"
++
++
++static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si,
++			     loff_t pos, struct cpt_context *ctx)
++{
++	struct timeval tmptv;
++
++	if (sk->sk_socket) {
++		sk->sk_socket->flags = si->cpt_ssflags;
++		sk->sk_socket->state = si->cpt_sstate;
++	}
++	sk->sk_reuse = si->cpt_reuse;
++	sk->sk_shutdown = si->cpt_shutdown;
++	sk->sk_userlocks = si->cpt_userlocks;
++	sk->sk_no_check = si->cpt_no_check;
++	sock_reset_flag(sk, SOCK_DBG);
++	if (si->cpt_debug)
++		sock_set_flag(sk, SOCK_DBG);
++	sock_reset_flag(sk, SOCK_RCVTSTAMP);
++	if (si->cpt_rcvtstamp)
++		sock_set_flag(sk, SOCK_RCVTSTAMP);
++	sock_reset_flag(sk, SOCK_LOCALROUTE);
++	if (si->cpt_localroute)
++		sock_set_flag(sk, SOCK_LOCALROUTE);
++	sk->sk_protocol = si->cpt_protocol;
++	sk->sk_err = si->cpt_err;
++	sk->sk_err_soft = si->cpt_err_soft;
++	sk->sk_priority = si->cpt_priority;
++	sk->sk_rcvlowat = si->cpt_rcvlowat;
++	sk->sk_rcvtimeo = si->cpt_rcvtimeo;
++	if (si->cpt_rcvtimeo == CPT_NULL)
++		sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
++	sk->sk_sndtimeo = si->cpt_sndtimeo;
++	if (si->cpt_sndtimeo == CPT_NULL)
++		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
++	sk->sk_rcvbuf = si->cpt_rcvbuf;
++	sk->sk_sndbuf = si->cpt_sndbuf;
++	sk->sk_bound_dev_if = si->cpt_bound_dev_if;
++	sk->sk_flags = si->cpt_flags;
++	sk->sk_lingertime = si->cpt_lingertime;
++	if (si->cpt_lingertime == CPT_NULL)
++		sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
++	sk->sk_peercred.pid = si->cpt_peer_pid;
++	sk->sk_peercred.uid = si->cpt_peer_uid;
++	sk->sk_peercred.gid = si->cpt_peer_gid;
++	cpt_timeval_import(&tmptv, si->cpt_stamp);
++	sk->sk_stamp = timeval_to_ktime(tmptv);
++	return 0;
++}
++
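++/* Attach a struct file to a freshly created socket: sock_map_fd()
++ * allocates both an fd and the file; we take an extra reference on
++ * the file and close the fd, keeping only the file reference. */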
++static struct file *sock_mapfile(struct socket *sock)
++{
++	int fd = sock_map_fd(sock);
++
++	if (fd >= 0) {
++		struct file *file = sock->file;
++		get_file(file);
++		sc_close(fd);
++		return file;
++	}
++	return ERR_PTR(fd);
++}
++
++/* We assume that /tmp exists and is writable.
++ * In previous versions we assumed that listen() would autobind
++ * the socket. It does not do this for AF_UNIX for an evident reason:
++ * a socket in the abstract namespace is accessible, unlike a socket
++ * bound to a deleted FS object.
++ */
++
++static int
++select_deleted_name(char * name, cpt_context_t *ctx)
++{
++	int i;
++
++	for (i = 0; i < 100; i++) {
++		struct nameidata nd;
++		unsigned int rnd = net_random();
++
++		sprintf(name, "/tmp/SOCK.%08x", rnd);
++
++		if (path_lookup(name, 0, &nd) != 0)
++			return 0;
++
++		path_put(&nd.path);
++	}
++
++	eprintk_ctx("failed to allocate deleted socket inode\n");
++	return -ELOOP;
++}
++
++static int
++bind_unix_socket(struct socket *sock, struct cpt_sock_image *si,
++		 cpt_context_t *ctx)
++{
++	int err;
++	char *name;
++	struct sockaddr* addr;
++	int addrlen;
++	struct sockaddr_un sun;
++	struct nameidata nd;
++
++	if ((addrlen = si->cpt_laddrlen) <= 2)
++		return 0;
++
++	nd.path.dentry = NULL;
++	name = ((char*)si->cpt_laddr) + 2;
++	addr = (struct sockaddr *)si->cpt_laddr;
++
++	if (name[0]) {
++		if (path_lookup(name, 0, &nd))
++			nd.path.dentry = NULL;
++
++		if (si->cpt_deleted) {
++			if (nd.path.dentry == NULL &&
++			    sock->ops->bind(sock, addr, addrlen) == 0) {
++				sc_unlink(name);
++				return 0;
++			}
++
++			addr = (struct sockaddr*)&sun;
++			addr->sa_family = AF_UNIX;
++			name = ((char*)addr) + 2;
++			err = select_deleted_name(name, ctx);
++			if (err)
++				goto out;
++			addrlen = 2 + strlen(name);
++		} else if (nd.path.dentry) {
++			if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) {
++				eprintk_ctx("bind_unix_socket: not a socket dentry\n");
++				err = -EINVAL;
++				goto out;
++			}
++			sc_unlink(name);
++		}
++	}
++
++	err = sock->ops->bind(sock, addr, addrlen);
++
++	if (!err && name[0]) {
++		if (nd.path.dentry) {
++			sc_chown(name, nd.path.dentry->d_inode->i_uid,
++				 nd.path.dentry->d_inode->i_gid);
++			sc_chmod(name, nd.path.dentry->d_inode->i_mode);
++		}
++		if (si->cpt_deleted)
++			sc_unlink(name);
++	}
++
++out:
++	if (nd.path.dentry)
++		path_put(&nd.path);
++	return err;
++}
++
++static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si,
++			      struct cpt_context *ctx)
++{
++	struct sock *sk = sock->sk;
++	cpt_object_t *obj;
++	struct sock *parent;
++
++	if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN)
++		return 0;
++
++	if (si->cpt_parent == -1)
++		return bind_unix_socket(sock, si, ctx);
++
++	obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++	if (!obj)
++		return 0;
++
++	parent = obj->o_obj;
++	if (unix_sk(parent)->addr) {
++		if (unix_sk(sk)->addr &&
++		    atomic_dec_and_test(&unix_sk(sk)->addr->refcnt))
++			kfree(unix_sk(sk)->addr);
++		atomic_inc(&unix_sk(parent)->addr->refcnt);
++		unix_sk(sk)->addr = unix_sk(parent)->addr;
++	}
++	return 0;
++}
++
++static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si,
++				  loff_t pos, struct cpt_context *ctx)
++{
++	loff_t endpos;
++
++	pos = pos + si->cpt_hdrlen;
++	endpos = pos + si->cpt_next;
++	while (pos < endpos) {
++		struct sk_buff *skb;
++		__u32 type;
++
++		skb = rst_skb(&pos, NULL, &type, ctx);
++		if (IS_ERR(skb)) {
++			if (PTR_ERR(skb) == -EINVAL) {
++				int err;
++
++				err = rst_sock_attr(&pos, sk, ctx);
++				if (err)
++					return err;
++			}
++			return PTR_ERR(skb);
++		}
++
++		if (type == CPT_SKB_RQ) {
++			skb_set_owner_r(skb, sk);
++			skb_queue_tail(&sk->sk_receive_queue, skb);
++		} else {
++			wprintk_ctx("strange socket queue type %u\n", type);
++			kfree_skb(skb);
++		}
++	}
++	return 0;
++}
++
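++/* Second-pass worker: create the socket (and its socketpair peer, if
++ * any), re-attach files, then restore per-family state and queues. */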
++static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si,
++		       struct cpt_context *ctx)
++{
++	int err;
++	struct socket *sock;
++	struct socket *sock2 = NULL;
++	struct file *file;
++	cpt_object_t *fobj;
++	cpt_object_t *pobj = NULL;
++
++	err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol,
++			       &sock);
++	if (err)
++		return err;
++
++	if (si->cpt_socketpair) {
++		err = sock_create_kern(si->cpt_family, si->cpt_type,
++				       si->cpt_protocol, &sock2);
++		if (err)
++			goto err_out;
++
++		err = sock->ops->socketpair(sock, sock2);
++		if (err < 0)
++			goto err_out;
++
++		/* Socketpair with a peer outside our environment.
++		 * So, we create a real half-open pipe and do not worry
++		 * about the dead end anymore. */
++		if (si->cpt_peer == -1) {
++			sock_release(sock2);
++			sock2 = NULL;
++		}
++	}
++
++	cpt_obj_setobj(obj, sock->sk, ctx);
++
++	if (si->cpt_file != CPT_NULL) {
++		file = sock_mapfile(sock);
++		err = PTR_ERR(file);
++		if (IS_ERR(file))
++			goto err_out;
++
++		err = -ENOMEM;
++
++		obj->o_parent = file;
++
++		if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
++			goto err_out;
++		cpt_obj_setpos(fobj, si->cpt_file, ctx);
++		cpt_obj_setindex(fobj, si->cpt_index, ctx);
++	}
++
++	if (sock2) {
++		struct file *file2;
++
++		pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx);
++		BUG_ON(!pobj);
++		BUG_ON(pobj->o_obj);
++		cpt_obj_setobj(pobj, sock2->sk, ctx);
++
++		if (pobj->o_ppos != CPT_NULL) {
++			file2 = sock_mapfile(sock2);
++			err = PTR_ERR(file2);
++			if (IS_ERR(file2))
++				goto err_out;
++
++			err = -ENOMEM;
++			if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL)
++				goto err_out;
++			cpt_obj_setpos(fobj, pobj->o_ppos, ctx);
++			cpt_obj_setindex(fobj, si->cpt_peer, ctx);
++
++			pobj->o_parent = file2;
++		}
++	}
++
++	setup_sock_common(sock->sk, si, obj->o_pos, ctx);
++	if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) {
++		int saved_reuse = sock->sk->sk_reuse;
++
++		inet_sk(sock->sk)->freebind = 1;
++		sock->sk->sk_reuse = 2;
++		if (si->cpt_laddrlen) {
++			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++			if (err) {
++				dprintk_ctx("binding failed: %d, do not worry\n", err);
++			}
++		}
++		sock->sk->sk_reuse = saved_reuse;
++		rst_socket_in(si, obj->o_pos, sock->sk, ctx);
++	} else if (sock->sk->sk_family == AF_NETLINK) {
++		struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr;
++		if (nl->nl_pid) {
++			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++			if (err) {
++				eprintk_ctx("AF_NETLINK binding failed: %d\n", err);
++			}
++		}
++		if (si->cpt_raddrlen && nl->nl_pid) {
++			err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK);
++			if (err) {
++				eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err);
++			}
++		}
++		generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
++	} else if (sock->sk->sk_family == PF_PACKET) {
++		struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr;
++		if (ll->sll_protocol || ll->sll_ifindex) {
++			int alen = si->cpt_laddrlen;
++			if (alen < sizeof(struct sockaddr_ll))
++				alen = sizeof(struct sockaddr_ll);
++			err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen);
++			if (err) {
++				eprintk_ctx("AF_PACKET binding failed: %d\n", err);
++			}
++		}
++		generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
++	}
++	fixup_unix_address(sock, si, ctx);
++
++	if (sock2) {
++		err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx);
++		if (err)
++			return err;
++		setup_sock_common(sock2->sk, si, pobj->o_pos, ctx);
++		fixup_unix_address(sock2, si, ctx);
++	}
++
++	if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
++	    && (int)si->cpt_parent != -1) {
++		cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++		if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0)
++			sock->sk = NULL;
++	}
++
++
++	if (si->cpt_file == CPT_NULL && sock->sk &&
++	    sock->sk->sk_family == AF_INET) {
++		struct sock *sk = sock->sk;
++
++		if (sk) {
++			sock->sk = NULL;
++
++			local_bh_disable();
++			bh_lock_sock(sk);
++			if (sock_owned_by_user(sk))
++				eprintk_ctx("oops, sock is locked by user\n");
++
++			sock_hold(sk);
++			sock_orphan(sk);
++			ub_inc_orphan_count(sk);
++			bh_unlock_sock(sk);
++			local_bh_enable();
++			sock_put(sk);
++			dprintk_ctx("orphaning socket %p\n", sk);
++		}
++	}
++
++	if (si->cpt_file == CPT_NULL && sock->sk == NULL)
++		sock_release(sock);
++
++	return 0;
++
++err_out:
++	if (sock2)
++		sock_release(sock2);
++	sock_release(sock);
++	return err;
++}
++
++static int open_listening_socket(loff_t pos, struct cpt_sock_image *si,
++				 struct cpt_context *ctx)
++{
++	int err;
++	struct socket *sock;
++	struct file *file;
++	cpt_object_t *obj, *fobj;
++
++	err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol,
++			       &sock);
++	if (err) {
++		eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err);
++		return err;
++	}
++
++	sock->sk->sk_reuse = 2;
++	sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if;
++
++	if (sock->sk->sk_family == AF_UNIX) {
++		err = bind_unix_socket(sock, si, ctx);
++	} else if (si->cpt_laddrlen) {
++		if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
++			inet_sk(sock->sk)->freebind = 1;
++
++		err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++
++		if (err) {
++			eprintk_ctx("open_listening_socket: bind: %d\n", err);
++			goto err_out;
++		}
++	}
++
++	err = sock->ops->listen(sock, si->cpt_max_ack_backlog);
++	if (err) {
++		eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted);
++		goto err_out;
++	}
++
++	/* Now we may access the socket body directly and fix up everything. */
++
++	file = sock_mapfile(sock);
++	err = PTR_ERR(file);
++	if (IS_ERR(file)) {
++		eprintk_ctx("open_listening_socket: map: %d\n", err);
++		goto err_out;
++	}
++
++	err = -ENOMEM;
++	if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
++		goto err_out;
++	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL)
++		goto err_out;
++	cpt_obj_setpos(obj, pos, ctx);
++	cpt_obj_setindex(obj, si->cpt_index, ctx);
++	obj->o_parent = file;
++	cpt_obj_setpos(fobj, si->cpt_file, ctx);
++	cpt_obj_setindex(fobj, si->cpt_index, ctx);
++
++	setup_sock_common(sock->sk, si, pos, ctx);
++
++	if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6)
++		rst_restore_synwait_queue(sock->sk, si, pos, ctx);
++
++	return 0;
++
++err_out:
++	sock_release(sock);
++	return err;
++}
++
++static int
++rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++	int err;
++	loff_t pos = *pos_p;
++	struct cpt_sockmc_image v;
++
++	err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx);
++	if (err)
++		return err;
++
++	*pos_p += v.cpt_next;
++
++	if (v.cpt_family == AF_INET)
++		return rst_sk_mcfilter_in(sk, &v, pos, ctx);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	else if (v.cpt_family == AF_INET6)
++		return rst_sk_mcfilter_in6(sk, &v, pos, ctx); 
++#endif
++	else
++		return -EAFNOSUPPORT;
++}
++
++
++static int
++rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++	int err;
++	struct sk_filter *fp, *old_fp; 
++	loff_t pos = *pos_p;
++	struct cpt_obj_bits v;
++
++	err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx);
++	if (err)
++		return err;
++
++	*pos_p += v.cpt_next;
++
++	if (v.cpt_size % sizeof(struct sock_filter))
++		return -EINVAL;
++
++	fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC);
++	if (fp == NULL)
++		return -ENOMEM;
++	atomic_set(&fp->refcnt, 1);
++	fp->len = v.cpt_size/sizeof(struct sock_filter);
++
++	err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen);
++	if (err) {
++		sk_filter_uncharge(sk, fp);
++		return err;
++	}
++
++	old_fp = sk->sk_filter;
++	sk->sk_filter = fp;
++	if (old_fp)
++		sk_filter_uncharge(sk, old_fp);
++	return 0;
++}
++
++
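++/* A sock attribute record is either a socket filter or a multicast
++ * filter: try the skfilter parser first and, if it failed without
++ * consuming the record (position unchanged), try the mcfilter one. */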
++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++	int err;
++	loff_t pos = *pos_p;
++
++	err = rst_sock_attr_skfilter(pos_p, sk, ctx);
++	if (err && pos == *pos_p)
++		err = rst_sock_attr_mcfilter(pos_p, sk, ctx);
++	return err;
++}
++
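++/* Rebuild an sk_buff from its image: layout, control block and
++ * timestamps come from the header; trailing CPT_OBJ_BITS records
++ * carry the data and CPT_OBJ_FILEDESC records carry SCM_RIGHTS
++ * descriptors passed over AF_UNIX sockets. */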
++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx)
++{
++	int err;
++	struct sk_buff *skb;
++	struct cpt_skb_image v;
++	loff_t pos = *pos_p;
++	struct scm_fp_list *fpl = NULL;
++	struct timeval tmptv;
++
++	err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx);
++	if (err)
++		return ERR_PTR(err);
++	*pos_p = pos + v.cpt_next;
++
++	if (owner)
++		*owner = v.cpt_owner;
++	if (queue)
++		*queue = v.cpt_queue;
++
++	skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL);
++	if (skb == NULL)
++		return ERR_PTR(-ENOMEM);
++	skb_reserve(skb, v.cpt_hspace);
++	skb_put(skb, v.cpt_len);
++#ifdef NET_SKBUFF_DATA_USES_OFFSET
++	skb->transport_header = v.cpt_h;
++	skb->network_header = v.cpt_nh;
++	skb->mac_header = v.cpt_mac;
++#else
++	skb->transport_header = skb->head + v.cpt_h;
++	skb->network_header = skb->head + v.cpt_nh;
++	skb->mac_header = skb->head + v.cpt_mac;
++#endif
++	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb));
++	memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb));
++	skb->mac_len = v.cpt_mac_len;
++
++	skb->csum = v.cpt_csum;
++	skb->local_df = v.cpt_local_df;
++	skb->pkt_type = v.cpt_pkt_type;
++	skb->ip_summed = v.cpt_ip_summed;
++	skb->priority = v.cpt_priority;
++	skb->protocol = v.cpt_protocol;
++	cpt_timeval_import(&tmptv, v.cpt_stamp);
++	skb->tstamp = timeval_to_ktime(tmptv);
++
++	skb_shinfo(skb)->gso_segs = v.cpt_gso_segs;
++	skb_shinfo(skb)->gso_size = v.cpt_gso_size;
++	if (ctx->image_version == 0) {
++		skb_shinfo(skb)->gso_segs = 1;
++		skb_shinfo(skb)->gso_size = 0;
++	}
++
++	if (v.cpt_next > v.cpt_hdrlen) {
++		pos = pos + v.cpt_hdrlen;
++		while (pos < *pos_p) {
++			union {
++				struct cpt_obj_bits b;
++				struct cpt_fd_image f;
++			} u;
++
++			err = rst_get_object(-1, pos, &u, ctx);
++			if (err) {
++				kfree_skb(skb);
++				return ERR_PTR(err);
++			}
++			if (u.b.cpt_object == CPT_OBJ_BITS) {
++				if (u.b.cpt_size != v.cpt_hspace + skb->len) {
++					eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len);
++					kfree_skb(skb);
++					return ERR_PTR(-EINVAL);
++				}
++
++				err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen);
++				if (err) {
++					kfree_skb(skb);
++					return ERR_PTR(err);
++				}
++			} else if (u.f.cpt_object == CPT_OBJ_FILEDESC) {
++				if (!fpl) {
++					fpl = kmalloc(sizeof(struct scm_fp_list),
++							GFP_KERNEL_UBC);
++					if (!fpl) {
++						kfree_skb(skb);
++						return ERR_PTR(-ENOMEM);
++					}
++					fpl->count = 0;
++					UNIXCB(skb).fp = fpl;
++				}
++				fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx);
++				if (!IS_ERR(fpl->fp[fpl->count]))
++					fpl->count++;
++			}
++			pos += u.b.cpt_next;
++		}
++	}
++
++	return skb;
++}
++
++static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si,
++			       loff_t pos, struct cpt_context *ctx)
++{
++	loff_t endpos;
++
++	pos = pos + si->cpt_hdrlen;
++	endpos = pos + si->cpt_next;
++	while (pos < endpos) {
++		struct sk_buff *skb;
++		struct sock *owner_sk;
++		__u32 owner;
++
++		skb = rst_skb(&pos, &owner, NULL, ctx);
++		if (IS_ERR(skb)) {
++			if (PTR_ERR(skb) == -EINVAL) {
++				int err;
++
++				err = rst_sock_attr(&pos, sk, ctx);
++				if (err)
++					return err;
++			}
++			return PTR_ERR(skb);
++		}
++
++		owner_sk = unix_peer(sk);
++		if (owner != -1) {
++			cpt_object_t *pobj;
++			pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx);
++			if (pobj == NULL) {
++				eprintk_ctx("orphan af_unix skb?\n");
++				kfree_skb(skb);
++				continue;
++			}
++			owner_sk = pobj->o_obj;
++		}
++		if (owner_sk == NULL) {
++			dprintk_ctx("orphan af_unix skb 2?\n");
++			kfree_skb(skb);
++			continue;
++		}
++		skb_set_owner_w(skb, owner_sk);
++		if (UNIXCB(skb).fp)
++			skb->destructor = unix_destruct_fds;
++		skb_queue_tail(&sk->sk_receive_queue, skb);
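++		/* On a listening socket the queued skbs hold embryonic
++		 * connections: drop the temporary struct socket created
++		 * during restore and keep only the struct sock. */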
++		if (sk->sk_state == TCP_LISTEN) {
++			struct socket *sock = skb->sk->sk_socket;
++			BUG_ON(sock == NULL);
++			BUG_ON(sock->file);
++			skb->sk->sk_socket = NULL;
++			skb->sk->sk_sleep = NULL;
++			sock->sk = NULL;
++			sock_release(sock);
++		}
++	}
++	return 0;
++}
++
++
++/* All the sockets are created before we start to open files */
++
++int rst_sockets(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_SOCKET];
++	loff_t endsec;
++	cpt_object_t *obj;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err) {
++		eprintk_ctx("rst_sockets: ctx->pread: %d\n", err);
++		return err;
++	}
++	if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) {
++		eprintk_ctx("rst_sockets: hdr err\n");
++		return -EINVAL;
++	}
++
++	/* The first pass: create the socket index and open listening sockets. */
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
++		if (err) {
++			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
++			cpt_release_buf(ctx);
++			return err;
++		}
++		if (sbuf->cpt_state == TCP_LISTEN) {
++			err = open_listening_socket(sec, sbuf, ctx); 
++			cpt_release_buf(ctx);
++			if (err) {
++				eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err);
++				return err;
++			}
++		} else {
++			cpt_release_buf(ctx);
++			obj = alloc_cpt_object(GFP_KERNEL, ctx);
++			if (obj == NULL)
++				return -ENOMEM;
++			cpt_obj_setindex(obj, sbuf->cpt_index, ctx);
++			cpt_obj_setpos(obj, sec, ctx);
++			obj->o_ppos  = sbuf->cpt_file;
++			intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx);
++		}
++		sec += sbuf->cpt_next;
++	}
++
++	/* Pass 2: really restore sockets */
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct cpt_sock_image *sbuf;
++		if (obj->o_obj != NULL)
++			continue;
++		sbuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++		if (err) {
++			eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
++			cpt_release_buf(ctx);
++			return err;
++		}
++		BUG_ON(sbuf->cpt_state == TCP_LISTEN);
++		err = open_socket(obj, sbuf, ctx); 
++		cpt_release_buf(ctx);
++		if (err) {
++			eprintk_ctx("rst_sockets: open_socket: %d\n", err);
++			return err;
++		}
++	}
++
++	return 0;
++}
++
++int rst_orphans(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_ORPHANS];
++	loff_t endsec;
++	cpt_object_t *obj;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++		obj = alloc_cpt_object(GFP_KERNEL, ctx);
++		if (obj == NULL) {
++			cpt_release_buf(ctx);
++			return -ENOMEM;
++		}
++		obj->o_pos = sec;
++		obj->o_ppos  = sbuf->cpt_file;
++		err = open_socket(obj, sbuf, ctx);
++		dprintk_ctx("Restoring orphan: %d\n", err);
++		free_cpt_object(obj, ctx);
++		cpt_release_buf(ctx);
++		if (err)
++			return err;
++		sec += sbuf->cpt_next;
++	}
++
++	return 0;
++}
++
++
++/* Pass 3: I understand, this is not funny anymore :-),
++ * but we have to do another pass to establish links between
++ * unpaired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX
++ * skb queues with proper skb->sk links.
++ *
++ * This could be done at the end of rst_sockets(), but we defer
++ * restoring af_unix queues until the end of restoring files to
++ * make restoring passed FDs cleaner.
++ */
++
++int rst_sockets_complete(struct cpt_context *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct cpt_sock_image *sbuf;
++		struct sock *sk = obj->o_obj;
++		struct sock *peer;
++
++		BUG_ON(!sk);
++
++		if (sk->sk_family != AF_UNIX)
++			continue;
++
++		sbuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++
++		if (sbuf->cpt_next > sbuf->cpt_hdrlen)
++			restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx);
++
++		cpt_release_buf(ctx);
++
++		if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) {
++			cpt_object_t *pobj;
++
++			sbuf = cpt_get_buf(ctx);
++			err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++			if (err) {
++				cpt_release_buf(ctx);
++				return err;
++			}
++
++			if (sbuf->cpt_peer != -1) {
++				pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx);
++				if (pobj) {
++					peer = pobj->o_obj;
++					sock_hold(peer);
++					unix_peer(sk) = peer;
++				}
++			}
++			cpt_release_buf(ctx);
++		}
++	}
++
++	rst_orphans(ctx);
++
++	return 0;
++}
++
+diff --git a/kernel/cpt/rst_socket_in.c b/kernel/cpt/rst_socket_in.c
+new file mode 100644
+index 0000000..ddc2d5a
+--- /dev/null
++++ b/kernel/cpt/rst_socket_in.c
+@@ -0,0 +1,489 @@
++/*
++ *
++ *  kernel/cpt/rst_socket_in.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <linux/jhash.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <linux/ipv6.h>
++#include <linux/igmp.h>
++#include <net/addrconf.h>
++#include <net/inet6_connection_sock.h>
++#include <linux/nsproxy.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
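++/* Timeouts are saved as deltas relative to the moment of checkpoint;
++ * convert them back into absolute jiffies on the restoring host. */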
++static inline unsigned long jiffies_import(__u32 tmo)
++{
++	__s32 delta = tmo;
++	return jiffies + (long)delta;
++}
++
++static inline __u32 tcp_jiffies_import(__u32 tmo)
++{
++	return ((__u32)jiffies) + tmo;
++}
++
++
++static int restore_queues(struct sock *sk, struct cpt_sock_image *si,
++			  loff_t pos, struct cpt_context *ctx)
++{
++	loff_t endpos;
++
++	pos = pos + si->cpt_hdrlen;
++	endpos = pos + si->cpt_next;
++	while (pos < endpos) {
++		struct sk_buff *skb;
++		__u32 type;
++
++		skb = rst_skb(&pos, NULL, &type, ctx);
++		if (IS_ERR(skb)) {
++			if (PTR_ERR(skb) == -EINVAL) {
++				int err;
++
++				err = rst_sock_attr(&pos, sk, ctx);
++				if (err)
++					return err;
++			}
++			return PTR_ERR(skb);
++		}
++
++		if (sk->sk_type == SOCK_STREAM) {
++			if (type == CPT_SKB_RQ) {
++				skb_set_owner_r(skb, sk);
++				ub_tcprcvbuf_charge_forced(sk, skb);
++				skb_queue_tail(&sk->sk_receive_queue, skb);
++			} else if (type == CPT_SKB_OFOQ) {
++				struct tcp_sock *tp = tcp_sk(sk);
++				skb_set_owner_r(skb, sk);
++				ub_tcprcvbuf_charge_forced(sk, skb);
++				skb_queue_tail(&tp->out_of_order_queue, skb);
++			} else if (type == CPT_SKB_WQ) {
++				sk->sk_wmem_queued += skb->truesize;
++				sk->sk_forward_alloc -= skb->truesize;
++				ub_tcpsndbuf_charge_forced(sk, skb);
++				skb_queue_tail(&sk->sk_write_queue, skb);
++			} else {
++				wprintk_ctx("strange stream queue type %u\n", type);
++				kfree_skb(skb);
++			}
++		} else {
++			if (type == CPT_SKB_RQ) {
++				skb_set_owner_r(skb, sk);
++				skb_queue_tail(&sk->sk_receive_queue, skb);
++			} else if (type == CPT_SKB_WQ) {
++				struct inet_sock *inet = inet_sk(sk);
++				if (inet->cork.fragsize) {
++					skb_set_owner_w(skb, sk);
++					skb_queue_tail(&sk->sk_write_queue, skb);
++				} else {
++					eprintk_ctx("cork skb is dropped\n");
++					kfree_skb(skb);
++				}
++			} else {
++				wprintk_ctx("strange dgram queue type %u\n", type);
++				kfree_skb(skb);
++			}
++		}
++	}
++	return 0;
++}
++
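++/* Scan the restored sockets for a listening parent bound to the same
++ * source port, so established children can inherit the port binding. */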
++static struct sock *find_parent(__u16 sport, cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++		if (sk &&
++		    sk->sk_state == TCP_LISTEN &&
++		    (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
++		    inet_sk(sk)->sport == sport)
++			return sk;
++	}
++	return NULL;
++}
++
++static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
++			  struct cpt_context *ctx)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++	struct sk_buff *skb;
++	tp->pred_flags = si->cpt_pred_flags;
++	tp->rcv_nxt = si->cpt_rcv_nxt;
++	tp->snd_nxt = si->cpt_snd_nxt;
++	tp->snd_una = si->cpt_snd_una;
++	tp->snd_sml = si->cpt_snd_sml;
++	tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
++	tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
++	tp->tcp_header_len = si->cpt_tcp_header_len;
++	inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending;
++	inet_csk(sk)->icsk_ack.quick = si->cpt_quick;
++	inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong;
++	inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked;
++	inet_csk(sk)->icsk_ack.ato = si->cpt_ato;
++	inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout);
++	inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime);
++	inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size;
++	inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss;
++	tp->snd_wl1 = si->cpt_snd_wl1;
++	tp->snd_wnd = si->cpt_snd_wnd;
++	tp->max_window = si->cpt_max_window;
++	inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie;
++	tp->mss_cache = si->cpt_mss_cache;
++	tp->rx_opt.mss_clamp = si->cpt_mss_clamp;
++	inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
++	inet_csk(sk)->icsk_ca_state = si->cpt_ca_state;
++	inet_csk(sk)->icsk_retransmits = si->cpt_retransmits;
++	tp->reordering = si->cpt_reordering;
++	tp->frto_counter = si->cpt_frto_counter;
++	tp->frto_highmark = si->cpt_frto_highmark;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++	// // tp->adv_cong = si->cpt_adv_cong;
++#endif
++	inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
++	inet_csk(sk)->icsk_backoff = si->cpt_backoff;
++	tp->srtt = si->cpt_srtt;
++	tp->mdev = si->cpt_mdev;
++	tp->mdev_max = si->cpt_mdev_max;
++	tp->rttvar = si->cpt_rttvar;
++	tp->rtt_seq = si->cpt_rtt_seq;
++	inet_csk(sk)->icsk_rto = si->cpt_rto;
++	tp->packets_out = si->cpt_packets_out;
++	tp->retrans_out = si->cpt_retrans_out;
++	tp->lost_out = si->cpt_lost_out;
++	tp->sacked_out = si->cpt_sacked_out;
++	tp->fackets_out = si->cpt_fackets_out;
++	tp->snd_ssthresh = si->cpt_snd_ssthresh;
++	tp->snd_cwnd = si->cpt_snd_cwnd;
++	tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt;
++	tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp;
++	tp->snd_cwnd_used = si->cpt_snd_cwnd_used;
++	tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp);
++	inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout);
++	tp->rcv_wnd = si->cpt_rcv_wnd;
++	tp->rcv_wup = si->cpt_rcv_wup;
++	tp->write_seq = si->cpt_write_seq;
++	tp->pushed_seq = si->cpt_pushed_seq;
++	tp->copied_seq = si->cpt_copied_seq;
++	tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok;
++	tp->rx_opt.wscale_ok = si->cpt_wscale_ok;
++	tp->rx_opt.sack_ok = si->cpt_sack_ok;
++	tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp;
++	tp->rx_opt.snd_wscale = si->cpt_snd_wscale;
++	tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale;
++	tp->nonagle = si->cpt_nonagle;
++	tp->keepalive_probes = si->cpt_keepalive_probes;
++	tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval;
++	tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr;
++	tp->rx_opt.ts_recent = si->cpt_ts_recent;
++	tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp;
++	tp->rx_opt.user_mss = si->cpt_user_mss;
++	tp->rx_opt.dsack = si->cpt_dsack;
++	tp->rx_opt.eff_sacks = si->cpt_num_sacks;
++	tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0];
++	tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1];
++	tp->selective_acks[0].start_seq = si->cpt_sack_array[2];
++	tp->selective_acks[0].end_seq = si->cpt_sack_array[3];
++	tp->selective_acks[1].start_seq = si->cpt_sack_array[4];
++	tp->selective_acks[1].end_seq = si->cpt_sack_array[5];
++	tp->selective_acks[2].start_seq = si->cpt_sack_array[6];
++	tp->selective_acks[2].end_seq = si->cpt_sack_array[7];
++	tp->selective_acks[3].start_seq = si->cpt_sack_array[8];
++	tp->selective_acks[3].end_seq = si->cpt_sack_array[9];
++
++	tp->window_clamp = si->cpt_window_clamp;
++	tp->rcv_ssthresh = si->cpt_rcv_ssthresh;
++	inet_csk(sk)->icsk_probes_out = si->cpt_probes_out;
++	tp->rx_opt.num_sacks = si->cpt_num_sacks;
++	tp->advmss = si->cpt_advmss;
++	inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
++	tp->ecn_flags = si->cpt_ecn_flags;
++	tp->prior_ssthresh = si->cpt_prior_ssthresh;
++	tp->high_seq = si->cpt_high_seq;
++	tp->retrans_stamp = si->cpt_retrans_stamp;
++	tp->undo_marker = si->cpt_undo_marker;
++	tp->undo_retrans = si->cpt_undo_retrans;
++	tp->urg_seq = si->cpt_urg_seq;
++	tp->urg_data = si->cpt_urg_data;
++	inet_csk(sk)->icsk_pending = si->cpt_pending;
++	tp->urg_mode = si->cpt_urg_mode;
++	tp->snd_up = si->cpt_snd_up;
++	tp->keepalive_time = si->cpt_keepalive_time;
++	tp->keepalive_intvl = si->cpt_keepalive_intvl;
++	tp->linger2 = si->cpt_linger2;
++
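++	/* Rebuild sk_send_head: the first skb in the write queue that has
++	 * not been sent yet, i.e. whose seq is not below snd_nxt. */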
++	sk->sk_send_head = NULL;
++	for (skb = skb_peek(&sk->sk_write_queue);
++	     skb && skb != (struct sk_buff*)&sk->sk_write_queue;
++	     skb = skb->next) {
++		if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) {
++			sk->sk_send_head = skb;
++			break;
++		}
++	}
++
++	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) {
++		struct inet_sock *inet = inet_sk(sk);
++		if (inet->num == 0) {
++			cpt_object_t *lobj = NULL;
++
++			if ((int)si->cpt_parent != -1)
++				lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++
++			if (lobj && lobj->o_obj) {
++				inet->num = ntohs(inet->sport);
++				local_bh_disable();
++				__inet_inherit_port(lobj->o_obj, sk);
++				local_bh_enable();
++				dprintk_ctx("port inherited from parent\n");
++			} else {
++				struct sock *lsk = find_parent(inet->sport, ctx);
++				if (lsk) {
++					inet->num = ntohs(inet->sport);
++					local_bh_disable();
++					__inet_inherit_port(lsk, sk);
++					local_bh_enable();
++					dprintk_ctx("port inherited\n");
++				} else {
++					eprintk_ctx("we are kinda lost...\n");
++				}
++			}
++		}
++
++		sk->sk_prot->hash(sk);
++
++		if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER)
++			sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout);
++		if (inet_csk(sk)->icsk_pending)
++			sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
++				       inet_csk(sk)->icsk_timeout);
++		if (sock_flag(sk, SOCK_KEEPOPEN)) {
++			unsigned long expires = jiffies_import(si->cpt_ka_timeout);
++			if (time_after(jiffies, expires))
++				expires = jiffies + HZ;
++			sk_reset_timer(sk, &sk->sk_timer, expires);
++		}
++	}
++
++	return 0;
++}
++
++
++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
++		  struct cpt_context *ctx)
++{
++	struct inet_sock *inet = inet_sk(sk);
++	struct net *net = get_exec_env()->ve_ns->net_ns;
++
++	lock_sock(sk);
++
++	sk->sk_state = si->cpt_state;
++
++	inet->daddr = si->cpt_daddr;
++	inet->dport = si->cpt_dport;
++	inet->saddr = si->cpt_saddr;
++	inet->rcv_saddr = si->cpt_rcv_saddr;
++	inet->sport = si->cpt_sport;
++	inet->uc_ttl = si->cpt_uc_ttl;
++	inet->tos = si->cpt_tos;
++	inet->cmsg_flags = si->cpt_cmsg_flags;
++	inet->mc_index = si->cpt_mc_index;
++	inet->mc_addr = si->cpt_mc_addr;
++	inet->hdrincl = si->cpt_hdrincl;
++	inet->mc_ttl = si->cpt_mc_ttl;
++	inet->mc_loop = si->cpt_mc_loop;
++	inet->pmtudisc = si->cpt_pmtudisc;
++	inet->recverr = si->cpt_recverr;
++	inet->freebind = si->cpt_freebind;
++	inet->id = si->cpt_idcounter;
++
++	inet->cork.flags = si->cpt_cork_flags;
++	inet->cork.fragsize = si->cpt_cork_fragsize;
++	inet->cork.length = si->cpt_cork_length;
++	inet->cork.addr = si->cpt_cork_addr;
++	inet->cork.fl.fl4_src = si->cpt_cork_saddr;
++	inet->cork.fl.fl4_dst = si->cpt_cork_daddr;
++	inet->cork.fl.oif = si->cpt_cork_oif;
++	if (inet->cork.fragsize) {
++		if (ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) {
++			eprintk_ctx("failed to restore cork route\n");
++			inet->cork.fragsize = 0;
++		}
++	}
++
++	if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
++		struct udp_sock *up = udp_sk(sk);
++		up->pending = si->cpt_udp_pending;
++		up->corkflag = si->cpt_udp_corkflag;
++		up->encap_type = si->cpt_udp_encap;
++		up->len = si->cpt_udp_len;
++	}
++
++	if (sk->sk_family == AF_INET6) {
++		struct ipv6_pinfo *np = inet6_sk(sk);
++
++		memcpy(&np->saddr, si->cpt_saddr6, 16);
++		memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16);
++		memcpy(&np->daddr, si->cpt_daddr6, 16);
++		np->flow_label = si->cpt_flow_label6;
++		np->frag_size = si->cpt_frag_size6;
++		np->hop_limit = si->cpt_hop_limit6;
++		np->mcast_hops = si->cpt_mcast_hops6;
++		np->mcast_oif = si->cpt_mcast_oif6;
++		np->rxopt.all = si->cpt_rxopt6;
++		np->mc_loop = si->cpt_mc_loop6;
++		np->recverr = si->cpt_recverr6;
++		np->sndflow = si->cpt_sndflow6;
++		np->pmtudisc = si->cpt_pmtudisc6;
++		np->ipv6only = si->cpt_ipv6only6;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++		if (si->cpt_mapped) {
++			extern struct inet_connection_sock_af_ops ipv6_mapped;
++			if (sk->sk_type == SOCK_STREAM &&
++			    sk->sk_protocol == IPPROTO_TCP) {
++				inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
++				sk->sk_backlog_rcv = tcp_v4_do_rcv;
++			}
++		}
++#endif
++	}
++
++	restore_queues(sk, si, pos, ctx);
++
++	if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++		rst_socket_tcp(si, pos, sk, ctx);
++
++	release_sock(sk);
++	return 0;
++}
++
++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx)
++{
++	struct request_sock *req;
++
++	if (lsk->sk_state != TCP_LISTEN)
++		return -EINVAL;
++
++	req = reqsk_alloc(&tcp_request_sock_ops);
++	if (!req)
++		return -ENOMEM;
++
++	sk->sk_socket = NULL;
++	sk->sk_sleep = NULL;
++	inet_csk_reqsk_queue_add(lsk, req, sk);
++	return 0;
++}
++
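++/* Refill the listener's syn-wait queue: every CPT_OBJ_OPENREQ record
++ * becomes a request_sock hashed back with the initial TCP timeout. */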
++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si,
++			      loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	loff_t end = si->cpt_next;
++
++	pos += si->cpt_hdrlen;
++	while (pos < end) {
++		struct cpt_openreq_image oi;
++
++		err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx);
++		if (err) {
++			err = rst_sock_attr(&pos, sk, ctx);
++			if (err)
++				return err;
++			continue;
++		}
++
++		if (oi.cpt_object == CPT_OBJ_OPENREQ) {
++			struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops);
++			if (req == NULL)
++				return -ENOMEM;
++
++			memset(req, 0, sizeof(*req));
++			tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn;
++			tcp_rsk(req)->snt_isn = oi.cpt_snt_isn;
++			inet_rsk(req)->rmt_port = oi.cpt_rmt_port;
++			req->mss = oi.cpt_mss;
++			req->retrans = oi.cpt_retrans;
++			inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale;
++			inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale;
++			inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok;
++			inet_rsk(req)->sack_ok = oi.cpt_sack_ok;
++			inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok;
++			inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok;
++			inet_rsk(req)->acked = oi.cpt_acked;
++			req->window_clamp = oi.cpt_window_clamp;
++			req->rcv_wnd = oi.cpt_rcv_wnd;
++			req->ts_recent = oi.cpt_ts_recent;
++			req->expires = jiffies_import(oi.cpt_expires);
++
++			if (oi.cpt_family == AF_INET) {
++				memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4);
++				memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4);
++				inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++			} else {
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++				memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16);
++				memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16);
++				inet6_rsk(req)->iif = oi.cpt_iif;
++				inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++#endif
++			}
++		}
++		pos += oi.cpt_next;
++	}
++	return 0;
++}
++
++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
++		       loff_t pos, cpt_context_t *ctx)
++{
++	struct ip_mreqn imr;
++
++	if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
++		eprintk_ctx("IGMPv3 is still not supported\n");
++		return -EINVAL;
++	}
++
++	memset(&imr, 0, sizeof(imr));
++	imr.imr_ifindex = v->cpt_ifindex;
++	imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0];
++	return ip_mc_join_group(sk, &imr);
++}
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
++			loff_t pos, cpt_context_t *ctx)
++{
++
++	if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
++		eprintk_ctx("IGMPv3 is still not supported\n");
++		return -EINVAL;
++	}
++
++	return ipv6_sock_mc_join(sk, v->cpt_ifindex,
++				 (struct in6_addr*)v->cpt_mcaddr);
++}
++#endif
+diff --git a/kernel/cpt/rst_sysvipc.c b/kernel/cpt/rst_sysvipc.c
+new file mode 100644
+index 0000000..8803de5
+--- /dev/null
++++ b/kernel/cpt/rst_sysvipc.c
+@@ -0,0 +1,636 @@
++/*
++ *
++ *  kernel/cpt/rst_sysvipc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/nsproxy.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/shm.h>
++#include <linux/msg.h>
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++#include <bc/kmem.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++
++struct _warg {
++		struct file		*file;
++		struct cpt_sysvshm_image	*v;
++};
++
++static int fixup_one_shm(struct shmid_kernel *shp, void *arg)
++{
++	struct _warg *warg = arg;
++
++	if (shp->shm_file != warg->file)
++		return 0;
++	if (shp->shm_nattch)
++		return -EEXIST;
++
++	shp->shm_perm.uid = warg->v->cpt_uid;
++	shp->shm_perm.gid = warg->v->cpt_gid;
++	shp->shm_perm.cuid = warg->v->cpt_cuid;
++	shp->shm_perm.cgid = warg->v->cpt_cgid;
++	shp->shm_perm.mode = warg->v->cpt_mode;
++
++	shp->shm_atim = warg->v->cpt_atime;
++	shp->shm_dtim = warg->v->cpt_dtime;
++	shp->shm_ctim = warg->v->cpt_ctime;
++	shp->shm_cprid = warg->v->cpt_creator;
++	shp->shm_lprid = warg->v->cpt_last;
++
++	/* TODO: fix shp->mlock_user? */
++	return 1;
++}
++
++static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v)
++{
++	struct _warg warg;
++
++	warg.file = file;
++	warg.v = v;
++
++	return sysvipc_walk_shm(fixup_one_shm, &warg);
++}
++
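++/* Write the saved segment contents back through the tmpfs file's
++ * write op in page-sized chunks, under KERNEL_DS since the staging
++ * buffer lives in kernel space. */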
++static int fixup_shm_data(struct file *file, loff_t pos, loff_t end,
++			  struct cpt_context *ctx)
++{
++	struct cpt_page_block pgb;
++	ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
++
++	do_write = file->f_dentry->d_inode->i_fop->write;
++	if (do_write == NULL) {
++		eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n");
++		return -EINVAL;
++	}
++
++	while (pos < end) {
++		loff_t opos;
++		loff_t ipos;
++		int count;
++		int err;
++
++		err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
++		if (err)
++			return err;
++		dprintk_ctx("restoring SHM block: %08x-%08x\n",
++		       (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
++		ipos = pos + pgb.cpt_hdrlen;
++		opos = pgb.cpt_start;
++		count = pgb.cpt_end-pgb.cpt_start;
++		while (count > 0) {
++			mm_segment_t oldfs;
++			int copy = count;
++
++			if (copy > PAGE_SIZE)
++				copy = PAGE_SIZE;
++			(void)cpt_get_buf(ctx);
++			oldfs = get_fs(); set_fs(KERNEL_DS);
++			err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
++			set_fs(oldfs);
++			if (err) {
++				__cpt_release_buf(ctx);
++				return err;
++			}
++			oldfs = get_fs(); set_fs(KERNEL_DS);
++			ipos += copy;
++			err = do_write(file, ctx->tmpbuf, copy, &opos);
++			set_fs(oldfs);
++			__cpt_release_buf(ctx);
++			if (err != copy) {
++				eprintk_ctx("write() failure\n");
++				if (err >= 0)
++					err = -EIO;
++				return err;
++			}
++			count -= copy;
++		}
++		pos += pgb.cpt_next;
++	}
++	return 0;
++}
++
++struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx)
++{
++	struct file *file;
++	int err;
++	loff_t dpos, epos;
++	union {
++		struct cpt_file_image		fi;
++		struct cpt_sysvshm_image	shmi;
++		struct cpt_inode_image 		ii;
++	} u;
++
++	err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
++	if (err < 0)
++		goto err_out;
++	pos = u.fi.cpt_inode;
++	err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
++	if (err < 0)
++		goto err_out;
++	dpos = pos + u.ii.cpt_hdrlen;
++	epos = pos + u.ii.cpt_next;
++	err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
++	if (err < 0)
++		goto err_out;
++	dpos += u.shmi.cpt_next;
++
++	file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id,
++				 u.shmi.cpt_segsz, u.shmi.cpt_mode);
++	if (!IS_ERR(file)) {
++		err = fixup_shm(file, &u.shmi);
++		if (err != -EEXIST && dpos < epos)
++			err = fixup_shm_data(file, dpos, epos, ctx);
++	} else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) {
++		struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
++		struct shmid_kernel *shp;
++
++		shp = shm_lock(ipc_ns, u.shmi.cpt_id);
++		BUG_ON(IS_ERR(shp));
++		get_file(shp->shm_file);
++		file = shp->shm_file;
++		shm_unlock(shp);
++	}
++	return file;
++
++err_out:
++	return ERR_PTR(err);
++}
++
++struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx)
++{
++	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
++	struct file *file;
++	union {
++		struct cpt_file_image		fi;
++		struct cpt_inode_image		ii;
++		struct cpt_sysvshm_image	shmi;
++	} u;
++	struct shmid_kernel *shp;
++	struct shm_file_data *sfd;
++	struct path path;
++	mode_t f_mode;
++	loff_t pos;
++	int err;
++
++	pos = vmai->cpt_file;
++	file = rst_sysv_shm_itself(pos, ctx);
++	if (IS_ERR(file) && PTR_ERR(file) != -EEXIST)
++		return file;
++	fput(file);
++
++	err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
++	if (err < 0)
++		goto err_out;
++	pos = u.fi.cpt_inode;
++	err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
++	if (err < 0)
++		goto err_out;
++	err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
++	if (err < 0)
++		goto err_out;
++
++	shp = shm_lock(ipc_ns, u.shmi.cpt_id);
++	BUG_ON(IS_ERR(shp));
++	path.dentry = dget(shp->shm_file->f_path.dentry);
++	path.mnt    = shp->shm_file->f_path.mnt;
++	shm_unlock(shp);
++
++	err = -ENOMEM;
++	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
++	if (!sfd)
++		goto out_put_dentry;
++
++	f_mode = 0;
++	if (vmai->cpt_flags & VM_READ)
++		f_mode |= FMODE_READ;
++	if (vmai->cpt_flags & VM_WRITE)
++		f_mode |= FMODE_WRITE;
++	if (vmai->cpt_flags & VM_EXEC)
++		f_mode |= FMODE_EXEC;
++
++	err = -ENOMEM;
++	file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
++	if (!file)
++		goto out_free;
++
++	file->private_data = sfd;
++	file->f_mapping = shp->shm_file->f_mapping;
++	sfd->id = shp->shm_perm.id;
++	sfd->ns = get_ipc_ns(ipc_ns);
++	sfd->file = shp->shm_file;
++	sfd->vm_ops = NULL;
++
++	return file;
++
++out_free:
++	kfree(sfd);
++out_put_dentry:
++	dput(path.dentry);
++err_out:
++	return ERR_PTR(err);
++}
++
++static int attach_one_undo(int semid, struct sem_array *sma, void *arg)
++{
++	struct sem_undo *su = arg;
++	struct sem_undo_list *undo_list = current->sysvsem.undo_list;
++
++	if (semid != su->semid)
++		return 0;
++
++	su->proc_next = undo_list->proc_list;
++	undo_list->proc_list = su;
++
++	su->id_next = sma->undo;
++	sma->undo = su;
++
++	return 1;
++}
++
++static int attach_undo(struct sem_undo *su)
++{
++	return sysvipc_walk_sem(attach_one_undo, su);
++}
++
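++/* Recreate the task's SysV semaphore undo list and re-attach each
++ * saved undo record to its semaphore array via sysvipc_walk_sem(). */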
++static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	struct sem_undo_list *undo_list;
++
++	if (current->sysvsem.undo_list) {
++		eprintk_ctx("Funny undo_list\n");
++		return 0;
++	}
++
++	undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC);
++	if (undo_list == NULL)
++		return -ENOMEM;
++
++	atomic_set(&undo_list->refcnt, 1);
++	spin_lock_init(&undo_list->lock);
++	current->sysvsem.undo_list = undo_list;
++
++	if (sui->cpt_next > sui->cpt_hdrlen) {
++		loff_t offset = pos + sui->cpt_hdrlen;
++		do {
++			struct sem_undo *new;
++			struct cpt_sysvsem_undo_image spi;
++			err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx);
++			if (err)
++				goto out;
++			new = kmalloc(sizeof(struct sem_undo) +
++					sizeof(short)*spi.cpt_nsem,
++					GFP_KERNEL_UBC);
++			if (!new) {
++				err = -ENOMEM;
++				goto out;
++			}
++
++			memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem);
++			new->semadj = (short *) &new[1];
++			new->semid = spi.cpt_id;
++			err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen);
++			if (err) {
++				kfree(new);
++				goto out;
++			}
++			err = attach_undo(new);
++			if (err <= 0) {
++				if (err == 0)
++					err = -ENOENT;
++				kfree(new);
++				goto out;
++			}
++			offset += spi.cpt_next;
++		} while (offset < pos + sui->cpt_next);
++	}
++	err = 0;
++
++out:
++	return err;
++}
++
++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	__u32 flag = 0;
++
++#if 0
++	if (ti->cpt_sysvsem_undo == CPT_NULL ||
++	    lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo))
++		flag |= CLONE_SYSVSEM;
++#endif
++	return flag;
++}
++
++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	int err;
++	struct sem_undo_list *f = current->sysvsem.undo_list;
++	cpt_object_t *obj;
++	struct cpt_object_hdr sui;
++
++	if (ti->cpt_sysvsem_undo == CPT_NULL) {
++		exit_sem(current);
++		return 0;
++	}
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx);
++	if (obj) {
++		if (obj->o_obj != f) {
++			exit_sem(current);
++			f = obj->o_obj;
++			atomic_inc(&f->refcnt);
++			current->sysvsem.undo_list = f;
++		}
++		return 0;
++	}
++
++	if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0)
++		goto out;
++
++	if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0)
++		goto out;
++
++	err = -ENOMEM;
++	obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx);
++	if (obj) {
++		err = 0;
++		cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx);
++	}
++
++	return 0;
++
++out:
++	return err;
++}
++
++struct _sarg {
++	int semid;
++	struct cpt_sysvsem_image	*v;
++	__u32				*arr;
++};
++
++static int fixup_one_sem(int semid, struct sem_array *sma, void *arg)
++{
++	struct _sarg *warg = arg;
++
++	if (semid != warg->semid)
++		return 0;
++
++	sma->sem_perm.uid = warg->v->cpt_uid;
++	sma->sem_perm.gid = warg->v->cpt_gid;
++	sma->sem_perm.cuid = warg->v->cpt_cuid;
++	sma->sem_perm.cgid = warg->v->cpt_cgid;
++	sma->sem_perm.mode = warg->v->cpt_mode;
++	sma->sem_perm.seq = warg->v->cpt_seq;
++
++	sma->sem_ctime = warg->v->cpt_ctime;
++	sma->sem_otime = warg->v->cpt_otime;
++	memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8);
++	return 1;
++}
++
++static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr)
++{
++	struct _sarg warg;
++
++	warg.semid = semid;
++	warg.v = v;
++	warg.arr = arr;
++
++	return sysvipc_walk_sem(fixup_one_sem, &warg);
++}
++
++
++static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si,
++		       struct cpt_context *ctx)
++{
++	int err;
++	__u32 *arr;
++	int nsems = (si->cpt_next - si->cpt_hdrlen)/8;
++
++	arr = kmalloc(nsems*8, GFP_KERNEL);
++	if (!arr)
++		return -ENOMEM;
++
++	err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen);
++	if (err)
++		goto out;
++	err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode);
++	if (err < 0) {
++		eprintk_ctx("SEM 3\n");
++		goto out;
++	}
++	err = fixup_sem(si->cpt_id, si, arr);
++	if (err == 0)
++		err = -ESRCH;
++	if (err > 0)
++		err = 0;
++out:
++	kfree(arr);
++	return err;
++}
++
++static int rst_sysv_sem(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_sysvsem_image sbuf;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		int err;
++		err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx);
++		if (err)
++			return err;
++		err = restore_sem(sec, &sbuf, ctx);
++		if (err)
++			return err;
++		sec += sbuf.cpt_next;
++	}
++	return 0;
++}
++
++struct _marg {
++	int				msqid;
++	struct cpt_sysvmsg_image	*v;
++	struct msg_queue		*m;
++};
++
++static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg)
++{
++	struct _marg *warg = arg;
++
++	if (msqid != warg->msqid)
++		return 0;
++
++	msq->q_perm.uid = warg->v->cpt_uid;
++	msq->q_perm.gid = warg->v->cpt_gid;
++	msq->q_perm.cuid = warg->v->cpt_cuid;
++	msq->q_perm.cgid = warg->v->cpt_cgid;
++	msq->q_perm.mode = warg->v->cpt_mode;
++	msq->q_perm.seq = warg->v->cpt_seq;
++
++	msq->q_stime = warg->v->cpt_stime;
++	msq->q_rtime = warg->v->cpt_rtime;
++	msq->q_ctime = warg->v->cpt_ctime;
++	msq->q_lspid = warg->v->cpt_last_sender;
++	msq->q_lrpid = warg->v->cpt_last_receiver;
++	msq->q_qbytes = warg->v->cpt_qbytes;
++
++	warg->m = msq;
++	return 1;
++}
++
++struct _larg
++{
++	cpt_context_t * ctx;
++	loff_t		pos;
++};
++
++static int do_load_msg(void * dst, int len, int offset, void * data)
++{
++	struct _larg * arg = data;
++	return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset);
++}
++
++static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos,
++		     cpt_context_t * ctx)
++{
++	int err;
++	struct _marg warg;
++	loff_t endpos = pos + v->cpt_next;
++	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
++
++	pos += v->cpt_hdrlen;
++
++	warg.msqid = msqid;
++	warg.v = v;
++
++	err = sysvipc_walk_msg(fixup_one_msg, &warg);
++	if (err <= 0)
++		return err;
++
++	while (pos < endpos) {
++		struct cpt_sysvmsg_msg_image mi;
++		struct msg_msg *m;
++		struct _larg data = {
++			.ctx = ctx
++		};
++
++		err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx);
++		if (err)
++			return err;
++		data.pos = pos + mi.cpt_hdrlen;
++		m = sysv_msg_load(do_load_msg, mi.cpt_size, &data);
++		if (IS_ERR(m))
++			return PTR_ERR(m);
++		m->m_type = mi.cpt_type;
++		m->m_ts = mi.cpt_size;
++		list_add_tail(&m->m_list, &warg.m->q_messages);
++		warg.m->q_cbytes += m->m_ts;
++		warg.m->q_qnum++;
++		atomic_add(m->m_ts, &ns->msg_bytes);
++		atomic_inc(&ns->msg_hdrs);
++			
++		pos += mi.cpt_next;
++	}
++	return 1;
++}
++
++static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si,
++		       struct cpt_context *ctx)
++{
++	int err;
++
++	err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode);
++	if (err < 0) {
++		eprintk_ctx("MSG 3\n");
++		goto out;
++	}
++	err = fixup_msg(si->cpt_id, si, pos, ctx);
++	if (err == 0)
++		err = -ESRCH;
++	if (err > 0)
++		err = 0;
++out:
++	return err;
++}
++
++static int rst_sysv_msg(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_sysvmsg_image sbuf;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		int err;
++		err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx);
++		if (err)
++			return err;
++		err = restore_msg(sec, &sbuf, ctx);
++		if (err)
++			return err;
++		sec += sbuf.cpt_next;
++	}
++	return 0;
++}
++
++
++int rst_sysv_ipc(struct cpt_context *ctx)
++{
++	int err;
++
++	err = rst_sysv_sem(ctx);
++	if (!err)
++		err = rst_sysv_msg(ctx);
++
++	return err;
++}
+diff --git a/kernel/cpt/rst_tty.c b/kernel/cpt/rst_tty.c
+new file mode 100644
+index 0000000..48bc4ce
+--- /dev/null
++++ b/kernel/cpt/rst_tty.c
+@@ -0,0 +1,384 @@
++/*
++ *
++ *  kernel/cpt/rst_tty.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mount.h>
++#include <linux/tty.h>
++#include <linux/vmalloc.h>
++#include <linux/nsproxy.h>
++#include <asm/unistd.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int pty_setup(struct tty_struct *stty, loff_t pos,
++		     struct cpt_tty_image *pi, struct cpt_context *ctx)
++{
++	unsigned long flags;
++
++	stty->pgrp = NULL;
++	stty->session = NULL;
++	stty->packet = pi->cpt_packet;
++	stty->stopped = pi->cpt_stopped;
++	stty->hw_stopped = pi->cpt_hw_stopped;
++	stty->flow_stopped = pi->cpt_flow_stopped;
++#define DONOT_CHANGE ((1<<TTY_CHARGED)|(1<<TTY_CLOSING)|(1<<TTY_LDISC))
++	flags = stty->flags & DONOT_CHANGE;
++	stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE);
++	stty->ctrl_status = pi->cpt_ctrl_status;
++	stty->winsize.ws_row = pi->cpt_ws_row;
++	stty->winsize.ws_col = pi->cpt_ws_col;
++	stty->winsize.ws_ypixel = pi->cpt_ws_prow;
++	stty->winsize.ws_xpixel = pi->cpt_ws_pcol;
++	stty->canon_column = pi->cpt_canon_column;
++	stty->column = pi->cpt_column;
++	stty->raw = pi->cpt_raw;
++	stty->real_raw = pi->cpt_real_raw;
++	stty->erasing = pi->cpt_erasing;
++	stty->lnext = pi->cpt_lnext;
++	stty->icanon = pi->cpt_icanon;
++	stty->closing = pi->cpt_closing;
++	stty->minimum_to_wake = pi->cpt_minimum_to_wake;
++
++	stty->termios->c_iflag = pi->cpt_c_iflag;
++	stty->termios->c_oflag = pi->cpt_c_oflag;
++	stty->termios->c_lflag = pi->cpt_c_lflag;
++	stty->termios->c_cflag = pi->cpt_c_cflag;
++	memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS);
++	memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags));
++
++	if (pi->cpt_next > pi->cpt_hdrlen) {
++		int err;
++		struct cpt_obj_bits b;
++		err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx);
++		if (err)
++			return err;
++		if (b.cpt_size == 0)
++			return 0;
++		err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen);
++		if (err)
++			return err;
++
++		spin_lock_irq(&stty->read_lock);
++		stty->read_tail = 0;
++		stty->read_cnt = b.cpt_size;
++		stty->read_head = b.cpt_size;
++		stty->canon_head = stty->read_tail + pi->cpt_canon_head;
++		stty->canon_data = pi->cpt_canon_data;
++		spin_unlock_irq(&stty->read_lock);
++	}
++
++	return 0;
++}
++
++/* Find the slave/master tty in the image when we already know the
++ * master/slave. This could be optimized, of course. */
++static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_TTY];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_tty_image *pibuf;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return CPT_NULL;
++	if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
++		return CPT_NULL;
++	pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL);
++	if (pibuf == NULL) {
++		eprintk_ctx("cannot allocate buffer\n");
++		return CPT_NULL;
++	}
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
++			kfree(pibuf);
++			return CPT_NULL;
++		}
++		if (pibuf->cpt_index == pi->cpt_index &&
++		    !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) &&
++		    pos != sec) {
++			pty_setup(stty, sec, pibuf, ctx);
++			kfree(pibuf);
++			return sec;
++		}
++		sec += pibuf->cpt_next;
++	}
++	kfree(pibuf);
++	return CPT_NULL;
++}
++
++static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master,
++			   struct cpt_context *ctx)
++{
++	int err;
++	struct iattr newattrs;
++	struct dentry *d = master->f_dentry;
++
++	newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE;
++	newattrs.ia_uid = ii->cpt_uid;
++	newattrs.ia_gid = ii->cpt_gid;
++	newattrs.ia_mode = ii->cpt_mode;
++
++	mutex_lock(&d->d_inode->i_mutex);
++	err = notify_change(d, &newattrs);
++	mutex_unlock(&d->d_inode->i_mutex);
++
++	return err;
++}
++
++/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open
++ * /dev/ptmx until we get pty with desired index.
++ */
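++/* Masters that come back with the wrong index are parked on one page
++ * worth of struct file pointers and fput() once the search finishes,
++ * so at most PAGE_SIZE/sizeof(struct file *) ptys are probed before
++ * the allocation gives up with -EBUSY. */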
++
++struct file *ptmx_open(int index, unsigned int flags)
++{
++	struct file *file;
++	struct file **stack = NULL;
++	int depth = 0;
++
++	for (;;) {
++		struct tty_struct *tty;
++
++		file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++		if (IS_ERR(file))
++			break;
++		tty = file->private_data;
++		if (tty->index == index)
++			break;
++
++		if (depth == PAGE_SIZE/sizeof(struct file *)) {
++			fput(file);
++			file = ERR_PTR(-EBUSY);
++			break;
++		}
++		if (stack == NULL) {
++			stack = (struct file **)__get_free_page(GFP_KERNEL);
++			if (!stack) {
++				fput(file);
++				file = ERR_PTR(-ENOMEM);
++				break;
++			}
++		}
++		stack[depth] = file;
++		depth++;
++	}
++	while (depth > 0) {
++		depth--;
++		fput(stack[depth]);
++	}
++	if (stack)
++		free_page((unsigned long)stack);
++	return file;
++}
++
++
++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii,
++			   unsigned flags, struct cpt_context *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++	struct file *master, *slave;
++	struct tty_struct *stty;
++	struct cpt_tty_image *pi;
++	static char *a = "pqrstuvwxyzabcde";
++	static char *b = "0123456789abcdef";
++	char pairname[16];
++	unsigned master_flags, slave_flags;
++
++	if (fi->cpt_priv == CPT_NULL)
++		return ERR_PTR(-EINVAL);
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx);
++	if (obj && obj->o_parent) {
++		dprintk_ctx("obtained pty as pair to existing\n");
++		master = obj->o_parent;
++		stty = master->private_data;
++
++		if (stty->driver->subtype == PTY_TYPE_MASTER &&
++		    (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) {
++			wprintk_ctx("cloning ptmx\n");
++			get_file(master);
++			return master;
++		}
++
++		master = dentry_open(dget(master->f_dentry),
++				     mntget(master->f_vfsmnt), flags);
++		if (!IS_ERR(master)) {
++			stty = master->private_data;
++			if (stty->driver->subtype != PTY_TYPE_MASTER)
++				fixup_tty_attrs(ii, master, ctx);
++		}
++		return master;
++	}
++
++	pi = cpt_get_buf(ctx);
++	err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx);
++	if (err) {
++		cpt_release_buf(ctx);
++		return ERR_PTR(err);
++	}
++
++	master_flags = slave_flags = 0;
++	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER)
++		master_flags = flags;
++	else
++		slave_flags = flags;
++
++	/*
++	 * Open pair master/slave.
++	 */
++	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) {
++		master = ptmx_open(pi->cpt_index, master_flags);
++	} else {
++		sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]);
++		master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++	}
++	if (IS_ERR(master)) {
++		eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master));
++		cpt_release_buf(ctx);
++		return master;
++	}
++	stty = master->private_data;
++	clear_bit(TTY_PTY_LOCK, &stty->flags);
++	if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM)
++		sprintf(pairname, "/dev/pts/%d", stty->index);
++	else
++		sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]);
++	slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++	if (IS_ERR(slave)) {
++		eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave));
++		fput(master);
++		cpt_release_buf(ctx);
++		return slave;
++	}
++
++	if (pi->cpt_drv_subtype != PTY_TYPE_MASTER)
++		fixup_tty_attrs(ii, slave, ctx);
++
++	cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx);
++	cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx);
++	cpt_object_add(CPT_OBJ_FILE, master, ctx);
++	cpt_object_add(CPT_OBJ_FILE, slave, ctx);
++
++	if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) {
++		loff_t pos;
++		obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
++		obj->o_parent = master;
++		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
++		pty_setup(stty, fi->cpt_priv, pi, ctx);
++
++		obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
++		obj->o_parent = slave;
++		pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx);
++		cpt_obj_setpos(obj, pos, ctx);
++
++		obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx);
++		cpt_obj_setpos(obj, CPT_NULL, ctx);
++		get_file(master);
++		cpt_release_buf(ctx);
++		return master;
++	} else {
++		loff_t pos;
++		obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
++		obj->o_parent = slave;
++		cpt_obj_setpos(obj, fi->cpt_priv, ctx);
++		pty_setup(stty->link, fi->cpt_priv, pi, ctx);
++
++		obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
++		obj->o_parent = master;
++		pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx);
++		cpt_obj_setpos(obj, pos, ctx);
++
++		obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx);
++		cpt_obj_setpos(obj, CPT_NULL, ctx);
++		get_file(slave);
++		cpt_release_buf(ctx);
++		return slave;
++	}
++}
++
++int rst_tty_jobcontrol(struct cpt_context *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_TTY];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		cpt_object_t *obj;
++		struct cpt_tty_image *pibuf = cpt_get_buf(ctx);
++
++		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++
++		obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx);
++		if (obj) {
++			struct tty_struct *stty = obj->o_obj;
++			if ((int)pibuf->cpt_pgrp > 0) {
++				rcu_read_lock();
++				stty->pgrp = get_pid(alloc_vpid_safe(pibuf->cpt_pgrp));
++				rcu_read_unlock();
++				if (!stty->pgrp)
++					dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp);
++			} else if (pibuf->cpt_pgrp) {
++				stty->pgrp = alloc_pid(current->nsproxy->pid_ns,
++							0);
++				if (!stty->pgrp) {
++					eprintk_ctx("cannot allocate stray tty->pgrp");
++					cpt_release_buf(ctx);
++					return -EINVAL;
++				}
++			}
++			if ((int)pibuf->cpt_session > 0) {
++				struct pid *sess;
++
++				rcu_read_lock();
++				sess = get_pid(alloc_vpid_safe(pibuf->cpt_session));
++				rcu_read_unlock();
++				if (!sess) {
++					dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session);
++				} else if (!stty->session) {
++					stty->session = sess;
++				}
++			}
++		}
++		sec += pibuf->cpt_next;
++		cpt_release_buf(ctx);
++	}
++	return 0;
++}
+diff --git a/kernel/cpt/rst_ubc.c b/kernel/cpt/rst_ubc.c
+new file mode 100644
+index 0000000..a39ae28
+--- /dev/null
++++ b/kernel/cpt/rst_ubc.c
+@@ -0,0 +1,131 @@
++/*
++ *
++ *  kernel/cpt/rst_ubc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/types.h>
++#include <bc/beancounter.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx);
++	if (obj == NULL) {
++		eprintk("RST: unknown ub @%Ld\n", (long long)pos);
++		return get_beancounter(get_exec_ub());
++	}
++	return get_beancounter(obj->o_obj);
++}
++
++void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id)
++{
++	to[bc_parm_id].barrier = from[bc_parm_id].barrier;
++	to[bc_parm_id].limit = from[bc_parm_id].limit;
++}
++
++void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id)
++{
++	ubprm[bc_parm_id].barrier = UB_MAXVALUE;
++	ubprm[bc_parm_id].limit = UB_MAXVALUE;
++}
++
++static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
++		int held)
++{
++	prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier);
++	prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit);
++	if (held)
++		prm->held = dmp->held;
++	prm->maxheld = dmp->maxheld;
++	prm->minheld = dmp->minheld;
++	prm->failcnt = dmp->failcnt;
++}
++
++static int restore_one_bc(struct cpt_beancounter_image *v,
++		cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct user_beancounter *bc;
++	cpt_object_t *pobj;
++	int i;
++
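++	/* A beancounter with a parent in the image becomes a sub-beancounter
++	 * of the (already restored) parent; the root image is mapped onto
++	 * the topmost ancestor of the current exec beancounter. */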
++	if (v->cpt_parent != CPT_NULL) {
++		pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx);
++		if (pobj == NULL)
++			return -ESRCH;
++		bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1);
++	} else {
++		bc = get_exec_ub();
++		while (bc->parent)
++			bc = bc->parent;
++		get_beancounter(bc);
++	}
++	if (bc == NULL)
++		return -ENOMEM;
++	obj->o_obj = bc;
++
++	if (ctx->image_version < CPT_VERSION_18 &&
++			CPT_VERSION_MINOR(ctx->image_version) < 1)
++		goto out;
++
++	for (i = 0; i < UB_RESOURCES; i++) {
++		restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
++		restore_one_bc_parm(v->cpt_parms + i * 2 + 1,
++				bc->ub_store + i, 1);
++	}
++
++out:
++	if (!bc->parent)
++		for (i = 0; i < UB_RESOURCES; i++)
++			copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i);
++
++	return 0;
++}
++
++int rst_undump_ubc(struct cpt_context *ctx)
++{
++	loff_t start, end;
++	struct cpt_beancounter_image *v;
++	cpt_object_t *obj;
++	int err;
++
++	err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end);
++	if (err)
++		return err;
++
++	while (start < end) {
++		v = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_UBC, start, v, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++
++		obj = alloc_cpt_object(GFP_KERNEL, ctx);
++		cpt_obj_setpos(obj, start, ctx);
++		intern_cpt_object(CPT_OBJ_UBC, obj, ctx);
++
++		restore_one_bc(v, obj, ctx);
++
++		cpt_release_buf(ctx);
++		start += v->cpt_next;
++	}
++	return 0;
++}
++
++void rst_finish_ubc(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_UBC)
++		put_beancounter(obj->o_obj);
++}
+diff --git a/kernel/cpt/rst_undump.c b/kernel/cpt/rst_undump.c
+new file mode 100644
+index 0000000..1a002d5
+--- /dev/null
++++ b/kernel/cpt/rst_undump.c
+@@ -0,0 +1,1007 @@
++/*
++ *
++ *  kernel/cpt/rst_undump.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/poll.h>
++#include <linux/mnt_namespace.h>
++#include <linux/personality.h>
++#include <linux/binfmts.h>
++#include <linux/smp_lock.h>
++#include <linux/ve_proto.h>
++#include <linux/virtinfo.h>
++#include <linux/virtinfoscp.h>
++#include <linux/compat.h>
++#include <linux/vzcalluser.h>
++#include <bc/beancounter.h>
++#ifdef CONFIG_X86
++#include <asm/desc.h>
++#endif
++#include <asm/unistd.h>
++#include <linux/nsproxy.h>
++#include <linux/pid_namespace.h>
++#include <linux/utsname.h>
++#include <linux/futex.h>
++#include <linux/shm.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_socket.h"
++#include "cpt_net.h"
++#include "cpt_ubc.h"
++#include "cpt_kernel.h"
++
++static int rst_utsname(cpt_context_t *ctx);
++
++
++struct thr_context {
++	struct completion init_complete;
++	struct completion task_done;
++	int error;
++	struct cpt_context *ctx;
++	cpt_object_t	*tobj;
++};
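++
++/* Handshake with a restored task: the freshly cloned thread signals
++ * init_complete before parking itself in hook(); the restorer then
++ * wakes it and sleeps on task_done to collect the per-task result
++ * left in ->error. */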
++
++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx);
++
++static int vps_rst_veinfo(struct cpt_context *ctx)
++{
++	int err;
++	struct cpt_veinfo_image *i;
++	struct ve_struct *ve;
++	struct timespec delta;
++	loff_t start, end;
++	struct ipc_namespace *ns;
++
++	err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end);
++	if (err)
++		goto out;
++
++	i = cpt_get_buf(ctx);
++	memset(i, 0, sizeof(*i));
++	err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx);
++	if (err)
++		goto out_rel;
++
++	ve = get_exec_env();
++	ns = ve->ve_ns->ipc_ns;
++
++	/* Damn. Fatal mistake, these two values are size_t! */
++	ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU;
++	ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU;
++	ns->shm_ctlmni = i->shm_ctl_mni;
++
++	ns->msg_ctlmax = i->msg_ctl_max;
++	ns->msg_ctlmni = i->msg_ctl_mni;
++	ns->msg_ctlmnb = i->msg_ctl_mnb;
++
++	BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
++	ns->sem_ctls[0] = i->sem_ctl_arr[0];
++	ns->sem_ctls[1] = i->sem_ctl_arr[1];
++	ns->sem_ctls[2] = i->sem_ctl_arr[2];
++	ns->sem_ctls[3] = i->sem_ctl_arr[3];
++
++	cpt_timespec_import(&delta, i->start_timespec_delta);
++	_set_normalized_timespec(&ve->start_timespec,
++			ve->start_timespec.tv_sec - delta.tv_sec,
++			ve->start_timespec.tv_nsec - delta.tv_nsec);
++	ve->start_jiffies -= i->start_jiffies_delta;
++	// // FIXME: what???
++	// // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy;
++
++	ctx->last_vpid = i->last_pid;
++
++	err = 0;
++out_rel:
++	cpt_release_buf(ctx);
++out:
++	return err;
++}
++
++static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	int err;
++	struct env_create_param3 param;
++
++	do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
++	do_gettimespec(&ctx->delta_time);
++
++	_set_normalized_timespec(&ctx->delta_time,
++				 ctx->delta_time.tv_sec - ctx->start_time.tv_sec,
++				 ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec);
++	ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec;
++	if (ctx->delta_nsec < 0) {
++		wprintk_ctx("Wall time is behind source by %Ld ns, "
++			    "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec);
++	}
++
++	_set_normalized_timespec(&ctx->cpt_monotonic_time,
++				 ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec,
++				 ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec);
++
++	memset(&param, 0, sizeof(param));
++	param.iptables_mask = ctx->iptables_mask;
++	param.feature_mask = ctx->features;
++
++	/* feature_mask is set as required - pretend we know everything */
++	param.known_features = (ctx->image_version < CPT_VERSION_18) ?
++		VE_FEATURES_OLD : ~(__u64)0;
++
++	err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2,
++			&param, sizeof(param));
++	if (err < 0)
++		eprintk_ctx("real_env_create: %d\n", err);
++
++	get_exec_env()->jiffies_fixup =
++		(ctx->delta_time.tv_sec < 0 ?
++		 0 : timespec_to_jiffies(&ctx->delta_time)) -
++		(unsigned long)(get_jiffies_64() - ctx->virt_jiffies64);
++	dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup,
++		    (long long)ctx->delta_nsec);
++	return err < 0 ? err : 0;
++}
++
++static int hook(void *arg)
++{
++	struct thr_context *thr_ctx = arg;
++	struct cpt_context *ctx;
++	cpt_object_t *tobj;
++	struct cpt_task_image *ti;
++	int err = 0;
++	int exiting = 0;
++
++	__set_current_state(TASK_UNINTERRUPTIBLE);
++	complete(&thr_ctx->init_complete);
++	schedule();
++
++	ctx = thr_ctx->ctx;
++	tobj = thr_ctx->tobj;
++	ti = tobj->o_image;
++
++	current->fs->umask = 0;
++
++	if (ti->cpt_pid == 1) {
++#ifdef CONFIG_BEANCOUNTERS
++		struct user_beancounter *bc;
++#endif
++
++		err = vps_rst_reparent_root(tobj, ctx);
++
++		if (err) {
++			rst_report_error(err, ctx);
++			goto out;
++		}
++
++		memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t));
++
++		if (ctx->statusfile) {
++			fput(ctx->statusfile);
++			ctx->statusfile = NULL;
++		}
++
++		if (ctx->lockfile) {
++			char b;
++			mm_segment_t oldfs;
++			err = -EINVAL;
++
++			oldfs = get_fs();
++			set_fs(KERNEL_DS);
++			if (ctx->lockfile->f_op && ctx->lockfile->f_op->read)
++				err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos);
++			set_fs(oldfs);
++			fput(ctx->lockfile);
++			ctx->lockfile = NULL;
++		}
++
++		if (err) {
++			eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err);
++			goto out;
++		}
++		err = vps_rst_veinfo(ctx);
++		if (err) {
++			eprintk_ctx("rst_veinfo: %d\n", err);
++			goto out;
++		}
++
++		err = rst_utsname(ctx);
++		if (err) {
++			eprintk_ctx("rst_utsname: %d\n", err);
++			goto out;
++		}
++
++		err = rst_root_namespace(ctx);
++		if (err) {
++			eprintk_ctx("rst_namespace: %d\n", err);
++			goto out;
++		}
++
++		if ((err = rst_restore_net(ctx)) != 0) {
++			eprintk_ctx("rst_restore_net: %d\n", err);
++			goto out;
++		}
++
++		err = rst_sockets(ctx);
++		if (err) {
++			eprintk_ctx("rst_sockets: %d\n", err);
++			goto out;
++		}
++		err = rst_sysv_ipc(ctx);
++		if (err) {
++			eprintk_ctx("rst_sysv_ipc: %d\n", err);
++			goto out;
++		}
++#ifdef CONFIG_BEANCOUNTERS
++		bc = get_exec_ub();
++		set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE);
++		set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC);
++		set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE);
++		set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE);
++#endif
++	}
++
++	do {
++		if (current->user->uid != ti->cpt_user) {
++			struct user_struct *u;
++
++			u = alloc_uid(get_exec_env()->ve_ns->user_ns, ti->cpt_user);
++			if (!u) {
++				eprintk_ctx("alloc_uid failed\n");
++			} else {
++				switch_uid(u);
++			}
++		}
++	} while (0);
++
++	if ((err = rst_mm_complete(ti, ctx)) != 0) {
++		eprintk_ctx("rst_mm: %d\n", err);
++		goto out;
++	}
++
++	if ((err = rst_files_complete(ti, ctx)) != 0) {
++		eprintk_ctx("rst_files: %d\n", err);
++		goto out;
++	}
++
++	if ((err = rst_fs_complete(ti, ctx)) != 0) {
++		eprintk_ctx("rst_fs: %d\n", err);
++		goto out;
++	}
++
++	if ((err = rst_semundo_complete(ti, ctx)) != 0) {
++		eprintk_ctx("rst_semundo: %d\n", err);
++		goto out;
++	}
++
++	if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) {
++		eprintk_ctx("rst_signal: %d\n", err);
++		goto out;
++	}
++
++	if (ti->cpt_personality != 0)
++		__set_personality(ti->cpt_personality);
++
++#ifdef CONFIG_X86_64
++	/* 32bit app from 32bit OS, won't have PER_LINUX32 set... :/ */
++	if (!ti->cpt_64bit)
++		__set_personality(PER_LINUX32);
++#endif
++
++	current->set_child_tid = NULL;
++	current->clear_child_tid = NULL;
++	current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV);
++	current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV);
++	current->exit_code = ti->cpt_exit_code;
++	current->pdeath_signal = ti->cpt_pdeath_signal;
++
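++	/* Re-arm an interrupted restartable syscall (nanosleep, poll or
++	 * futex_wait): the expiry time saved in the image is rebased
++	 * against the destination node's clocks computed above. */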
++	if (ti->cpt_restart.fn != CPT_RBL_0) {
++		if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP
++#ifdef CONFIG_COMPAT
++		    || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP
++#endif
++		    ) {
++			struct restart_block *rb;
++			ktime_t e;
++
++			e.tv64 = 0;
++
++			if (ctx->image_version >= CPT_VERSION_20)
++				e = ktime_add_ns(e, ti->cpt_restart.arg2);
++			else if (ctx->image_version >= CPT_VERSION_9)
++				e = ktime_add_ns(e, ti->cpt_restart.arg0);
++			else
++				e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC);
++			if (e.tv64 < 0)
++				e.tv64 = TICK_NSEC;
++			e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++
++			rb = &task_thread_info(current)->restart_block;
++			if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP)
++				rb->fn = hrtimer_nanosleep_restart;
++#ifdef CONFIG_COMPAT
++			else
++				rb->fn = compat_nanosleep_restart;
++#endif
++			if (ctx->image_version >= CPT_VERSION_20) {
++				rb->arg0 = ti->cpt_restart.arg0;
++				rb->arg1 = ti->cpt_restart.arg1;
++				rb->arg2 = e.tv64 & 0xFFFFFFFF;
++				rb->arg3 = e.tv64 >> 32;
++			} else if (ctx->image_version >= CPT_VERSION_9) {
++				rb->arg0 = ti->cpt_restart.arg2;
++				rb->arg1 = ti->cpt_restart.arg3;
++				rb->arg2 = e.tv64 & 0xFFFFFFFF;
++				rb->arg3 = e.tv64 >> 32;
++			} else {
++				rb->arg0 = ti->cpt_restart.arg1;
++				rb->arg1 = CLOCK_MONOTONIC;
++				rb->arg2 = e.tv64 & 0xFFFFFFFF;
++				rb->arg3 = e.tv64 >> 32;
++			}
++		} else if (ti->cpt_restart.fn == CPT_RBL_POLL) {
++			struct restart_block *rb;
++			ktime_t e;
++			struct timespec ts;
++			unsigned long timeout_jiffies;
++
++			e.tv64 = 0;
++			e = ktime_add_ns(e, ti->cpt_restart.arg2);
++			e = ktime_sub(e, timespec_to_ktime(ctx->delta_time));
++			ts = ns_to_timespec(ktime_to_ns(e));
++			timeout_jiffies = timespec_to_jiffies(&ts);
++
++			rb = &task_thread_info(current)->restart_block;
++			rb->fn = do_restart_poll;
++			rb->arg0 = ti->cpt_restart.arg0;
++			rb->arg1 = ti->cpt_restart.arg1;
++			rb->arg2 = timeout_jiffies & 0xFFFFFFFF;
++			rb->arg3 = (u64)timeout_jiffies >> 32;
++		} else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) {
++			struct restart_block *rb;
++			ktime_t e;
++
++			e.tv64 = 0;
++			e = ktime_add_ns(e, ti->cpt_restart.arg2);
++			e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++
++			rb = &task_thread_info(current)->restart_block;
++			rb->fn = futex_wait_restart;
++			rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0;
++			rb->futex.val   = ti->cpt_restart.arg1;
++			rb->futex.time  = e.tv64;
++			rb->futex.flags = ti->cpt_restart.arg3;
++		} else
++			eprintk_ctx("unknown restart block\n");
++	}
++
++	if (thread_group_leader(current)) {
++		current->signal->it_real_incr.tv64 = 0;
++		if (ctx->image_version >= CPT_VERSION_9) {
++			current->signal->it_real_incr =
++			ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr);
++		} else {
++			current->signal->it_real_incr =
++			ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC);
++		}
++		current->signal->it_prof_incr = ti->cpt_it_prof_incr;
++		current->signal->it_virt_incr = ti->cpt_it_virt_incr;
++		current->signal->it_prof_expires = ti->cpt_it_prof_value;
++		current->signal->it_virt_expires = ti->cpt_it_virt_value;
++	}
++
++	err = rst_clone_children(tobj, ctx);
++	if (err) {
++		eprintk_ctx("rst_clone_children\n");
++		goto out;
++	}
++
++	if (exiting)
++		current->signal->flags |= SIGNAL_GROUP_EXIT;
++
++	if (ti->cpt_pid == 1) {
++		if ((err = rst_process_linkage(ctx)) != 0) {
++			eprintk_ctx("rst_process_linkage: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_do_filejobs(ctx)) != 0) {
++			eprintk_ctx("rst_do_filejobs: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_eventpoll(ctx)) != 0) {
++			eprintk_ctx("rst_eventpoll: %d\n", err);
++			goto out;
++		}
++#ifdef CONFIG_INOTIFY_USER
++		if ((err = rst_inotify(ctx)) != 0) {
++			eprintk_ctx("rst_inotify: %d\n", err);
++			goto out;
++		}
++#endif
++		if ((err = rst_sockets_complete(ctx)) != 0) {
++			eprintk_ctx("rst_sockets_complete: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_stray_files(ctx)) != 0) {
++			eprintk_ctx("rst_stray_files: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_posix_locks(ctx)) != 0) {
++			eprintk_ctx("rst_posix_locks: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_tty_jobcontrol(ctx)) != 0) {
++			eprintk_ctx("rst_tty_jobcontrol: %d\n", err);
++			goto out;
++		}
++		if ((err = rst_restore_fs(ctx)) != 0) {
++			eprintk_ctx("rst_restore_fs: %d\n", err);
++			goto out;
++		}
++		if (virtinfo_notifier_call(VITYPE_SCP,
++				VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) {
++			err = -ECHRNG;
++			eprintk_ctx("scp_restore failed\n");
++			goto out;
++		}
++		if (ctx->last_vpid)
++			get_exec_env()->ve_ns->pid_ns->last_pid =
++				ctx->last_vpid;
++	}
++
++out:
++	thr_ctx->error = err;
++	complete(&thr_ctx->task_done);
++
++	if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++		current->flags |= PF_EXIT_RESTART;
++		do_exit(ti->cpt_exit_code);
++	} else {
++		__set_current_state(TASK_UNINTERRUPTIBLE);
++	}
++
++	schedule();
++
++	dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
++
++	module_put(THIS_MODULE);
++	complete_and_exit(NULL, 0);
++	return 0;
++}
++
++#if 0
++static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++	struct task_beancounter *tbc;
++
++	tbc = task_bc(current);
++
++	put_beancounter(tbc->fork_sub);
++	tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx);
++	if (ti->cpt_mm_ub != CPT_NULL) {
++		put_beancounter(tbc->exec_ub);
++		tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx);
++	}
++}
++#endif
++
++static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx,
++		struct thr_context *thr_ctx)
++{
++	struct task_struct *tsk;
++	int pid;
++
++	thr_ctx->ctx = ctx;
++	thr_ctx->error = 0;
++	init_completion(&thr_ctx->init_complete);
++	init_completion(&thr_ctx->task_done);
++#if 0
++	set_task_ubs(obj->o_image, ctx);
++#endif
++
++	pid = local_kernel_thread(hook, thr_ctx, 0, 0);
++	if (pid < 0)
++		return pid;
++	read_lock(&tasklist_lock);
++	tsk = find_task_by_vpid(pid);
++	if (tsk)
++		get_task_struct(tsk);
++	read_unlock(&tasklist_lock);
++	if (tsk == NULL)
++		return -ESRCH;
++	cpt_obj_setobj(obj, tsk, ctx);
++	thr_ctx->tobj = obj;
++	return 0;
++}
++
++static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct task_struct *tsk = obj->o_obj;
++	struct cpt_task_image *ti = obj->o_image;
++
++	memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm));
++	rst_mm_basic(obj, ti, ctx);
++	return 0;
++}
++
++static int make_baby(cpt_object_t *cobj,
++		     struct cpt_task_image *pi,
++		     struct cpt_context *ctx)
++{
++	unsigned long flags;
++	struct cpt_task_image *ci = cobj->o_image;
++	struct thr_context thr_ctx;
++	struct task_struct *tsk;
++	pid_t pid;
++	struct fs_struct *tfs = NULL;
++
++	flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx)
++		| rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx);
++	if (ci->cpt_rppid != pi->cpt_pid) {
++		flags |= CLONE_THREAD|CLONE_PARENT;
++		if (ci->cpt_signal != pi->cpt_signal ||
++		    !(flags&CLONE_SIGHAND) ||
++		    (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) {
++			eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n",
++			       (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid,
++			       (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags
++			       );
++			return -EINVAL;
++		}
++	}
++
++	thr_ctx.ctx = ctx;
++	thr_ctx.error = 0;
++	init_completion(&thr_ctx.init_complete);
++	init_completion(&thr_ctx.task_done);
++	thr_ctx.tobj = cobj;
++
++#if 0
++	set_task_ubs(ci, ctx);
++#endif
++
++	if (current->fs == NULL) {
++		tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs;
++		if (tfs == NULL)
++			return -EINVAL;
++		atomic_inc(&tfs->count);
++		current->fs = tfs;
++	}
++	pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid);
++	if (tfs) {
++		current->fs = NULL;
++		atomic_dec(&tfs->count);
++	}
++	if (pid < 0)
++		return pid;
++
++	read_lock(&tasklist_lock);
++	tsk = find_task_by_vpid(pid);
++	if (tsk)
++		get_task_struct(tsk);
++	read_unlock(&tasklist_lock);
++	if (tsk == NULL)
++		return -ESRCH;
++	cpt_obj_setobj(cobj, tsk, ctx);
++	thr_ctx.tobj = cobj;
++	wait_for_completion(&thr_ctx.init_complete);
++	wait_task_inactive(cobj->o_obj);
++	rst_basic_init_task(cobj, ctx);
++
++	/* clone() increments group_stop_count if it was non-zero and
++	 * CLONE_THREAD was requested. Undo that here.
++	 */
++	if (current->signal->group_stop_count && (flags & CLONE_THREAD)) {
++		BUG_ON(tsk->signal != current->signal);
++		current->signal->group_stop_count--;
++	}
++
++	wake_up_process(tsk);
++	wait_for_completion(&thr_ctx.task_done);
++	wait_task_inactive(tsk);
++
++	return thr_ctx.error;
++}
++
++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	int err = 0;
++	struct cpt_task_image *ti = obj->o_image;
++	cpt_object_t *cobj;
++
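++	/* Fork our direct children (thread group leaders whose recorded
++	 * parent is this task) and the remaining threads of our own
++	 * thread group. */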
++	for_each_object(cobj, CPT_OBJ_TASK) {
++		struct cpt_task_image *ci = cobj->o_image;
++		if (cobj == obj)
++			continue;
++		if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) ||
++		    (ci->cpt_leader == ti->cpt_pid &&
++		     ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) {
++			err = make_baby(cobj, ti, ctx);
++			if (err) {
++				eprintk_ctx("make_baby: %d\n", err);
++				return err;
++			}
++		}
++	}
++	return 0;
++}
++
++static int read_task_images(struct cpt_context *ctx)
++{
++	int err;
++	loff_t start, end;
++
++	err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end);
++	if (err)
++		return err;
++
++	while (start < end) {
++		cpt_object_t *obj;
++		struct cpt_task_image *ti = cpt_get_buf(ctx);
++
++		err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++#if 0
++		if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) {
++			eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid);
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++#endif
++		obj = alloc_cpt_object(GFP_KERNEL, ctx);
++		cpt_obj_setpos(obj, start, ctx);
++		intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
++		obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL);
++		if (obj->o_image == NULL) {
++			cpt_release_buf(ctx);
++			return -ENOMEM;
++		}
++		memcpy(obj->o_image, ti, sizeof(*ti));
++		err = ctx->pread(obj->o_image + sizeof(*ti),
++				 ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti));
++		cpt_release_buf(ctx);
++		if (err)
++			return err;
++		start += ti->cpt_next;
++	}
++	return 0;
++}
++
++
++static int vps_rst_restore_tree(struct cpt_context *ctx)
++{
++	int err;
++	cpt_object_t *obj;
++	struct thr_context thr_ctx_root;
++
++	err = read_task_images(ctx);
++	if (err)
++		return err;
++
++	err = rst_undump_ubc(ctx);
++	if (err)
++		return err;
++
++	if (virtinfo_notifier_call(VITYPE_SCP,
++				VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL)
++		return -ECHRNG;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	err = rst_setup_pagein(ctx);
++	if (err)
++		return err;
++#endif
++	for_each_object(obj, CPT_OBJ_TASK) {
++		err = create_root_task(obj, ctx, &thr_ctx_root);
++		if (err)
++			return err;
++
++		wait_for_completion(&thr_ctx_root.init_complete);
++		wait_task_inactive(obj->o_obj);
++		rst_basic_init_task(obj, ctx);
++
++		wake_up_process(obj->o_obj);
++		wait_for_completion(&thr_ctx_root.task_done);
++		wait_task_inactive(obj->o_obj);
++		err = thr_ctx_root.error;
++		if (err)
++			return err;
++		break;
++	}
++
++	return err;
++}
++
++#ifndef CONFIG_IA64
++int rst_read_vdso(struct cpt_context *ctx)
++{
++	int err;
++	loff_t start, end;
++	struct cpt_page_block *pgb;
++
++	ctx->vdso = NULL;
++	err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end);
++	if (err)
++		return err;
++	if (start == CPT_NULL)
++		return 0;
++	if (end < start + sizeof(*pgb) + PAGE_SIZE)
++		return -EINVAL;
++
++	pgb = cpt_get_buf(ctx);
++	err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx);
++	if (err)
++		goto err_buf;
++	ctx->vdso = (char*)__get_free_page(GFP_KERNEL);
++	if (ctx->vdso == NULL) {
++		err = -ENOMEM;
++		goto err_buf;
++	}
++	err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb));
++	if (err)
++		goto err_page;
++	if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) {
++		free_page((unsigned long)ctx->vdso);
++		ctx->vdso = NULL;
++	}
++
++	cpt_release_buf(ctx);
++	return 0;
++err_page:
++	free_page((unsigned long)ctx->vdso);
++	ctx->vdso = NULL;
++err_buf:
++	cpt_release_buf(ctx);
++	return err;
++}
++#endif
++
++int vps_rst_undump(struct cpt_context *ctx)
++{
++	int err;
++	unsigned long umask;
++
++	err = rst_open_dumpfile(ctx);
++	if (err)
++		return err;
++
++	if (ctx->tasks64) {
++#if defined(CONFIG_IA64)
++		if (ctx->image_arch != CPT_OS_ARCH_IA64)
++#elif defined(CONFIG_X86_64)
++		if (ctx->image_arch != CPT_OS_ARCH_EMT64)
++#else
++		if (1)
++#endif
++		{
++			eprintk_ctx("Cannot restore 64 bit container on this architecture\n");
++			return -EINVAL;
++		}
++	}
++
++	umask = current->fs->umask;
++	current->fs->umask = 0;
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	err = rst_setup_pagein(ctx);
++#endif
++#ifndef CONFIG_IA64
++	if (err == 0)
++		err = rst_read_vdso(ctx);
++#endif
++	if (err == 0)
++		err = vps_rst_restore_tree(ctx);
++
++	if (err == 0)
++		err = rst_restore_process(ctx);
++
++	if (err)
++		virtinfo_notifier_call(VITYPE_SCP,
++				VIRTINFO_SCP_RSTFAIL, ctx);
++
++	current->fs->umask = umask;
++
++	return err;
++}
++
++static int rst_unlock_ve(struct cpt_context *ctx)
++{
++	struct ve_struct *env;
++
++	env = get_ve_by_id(ctx->ve_id);
++	if (!env)
++		return -ESRCH;
++	down_write(&env->op_sem);
++	env->is_locked = 0;
++	up_write(&env->op_sem);
++	put_ve(env);
++	return 0;
++}
++
++int recalc_sigpending_tsk(struct task_struct *t);
++
++int rst_resume(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	int err = 0;
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *bc;
++#endif
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++
++		fput(file);
++	}
++
++#ifdef CONFIG_BEANCOUNTERS
++	bc = get_beancounter_byuid(ctx->ve_id, 0);
++	BUG_ON(!bc);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE);
++	put_beancounter(bc);
++#endif
++
++	rst_resume_network(ctx);
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		struct cpt_task_image *ti = obj->o_image;
++
++		if (!tsk)
++			continue;
++
++		if (ti->cpt_state == TASK_UNINTERRUPTIBLE) {
++			dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++
++			/* Weird... If a signal is sent to a stopped task,
++			 * nobody calls recalc_sigpending(). We have to do
++			 * this by hand after wake_up_process().
++			 * If we did it earlier, a signal could arrive before
++			 * wake_up_process() and the task would stall.
++			 */
++			spin_lock_irq(&tsk->sighand->siglock);
++			if (!signal_pending(tsk))
++				recalc_sigpending_tsk(tsk);
++			spin_unlock_irq(&tsk->sighand->siglock);
++
++			wake_up_process(tsk);
++		} else {
++			if (ti->cpt_state == TASK_STOPPED ||
++			    ti->cpt_state == TASK_TRACED) {
++				set_task_state(tsk, ti->cpt_state);
++			}
++		}
++		put_task_struct(tsk);
++	}
++
++	rst_unlock_ve(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	rst_complete_pagein(ctx, 0);
++#endif
++
++	rst_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++
++	return err;
++}
++
++int rst_kill(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	int err = 0;
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++
++		fput(file);
++	}
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++
++		if (tsk == NULL)
++			continue;
++
++		if (tsk->exit_state == 0) {
++			send_sig(SIGKILL, tsk, 1);
++
++			spin_lock_irq(&tsk->sighand->siglock);
++			sigfillset(&tsk->blocked);
++			sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++			set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++			clear_tsk_thread_flag(tsk, TIF_FREEZE);
++			if (tsk->flags & PF_FROZEN)
++				tsk->flags &= ~PF_FROZEN;
++			spin_unlock_irq(&tsk->sighand->siglock);
++
++			wake_up_process(tsk);
++		}
++
++		put_task_struct(tsk);
++	}
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	rst_complete_pagein(ctx, 1);
++#endif
++
++	rst_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++
++	return err;
++}
++
++static int rst_utsname(cpt_context_t *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_UTSNAME];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_object_hdr o;
++	struct ve_struct *ve;
++	struct uts_namespace *ns;
++	int i;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	ve = get_exec_env();
++	ns = ve->ve_ns->uts_ns;
++
++	i = 0;
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		int len;
++		char *ptr;
++		err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx);
++		if (err)
++			return err;
++		len = o.cpt_next - o.cpt_hdrlen;
++		if (len > __NEW_UTS_LEN + 1)
++			return -ENAMETOOLONG;
++		switch (i) {
++		case 0:
++			ptr = ns->name.nodename; break;
++		case 1:
++			ptr = ns->name.domainname; break;
++		default:
++			return -EINVAL;
++		}
++		err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen);
++		if (err)
++			return err;
++		i++;
++		sec += o.cpt_next;
++	}
++
++	return 0;
++}
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index c77bc3a..1cd334c 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -152,7 +152,7 @@ static inline void check_for_tasks(int cpu)
+ 	struct task_struct *p;
+ 
+ 	write_lock_irq(&tasklist_lock);
+-	for_each_process(p) {
++	for_each_process_all(p) {
+ 		if (task_cpu(p) == cpu &&
+ 		    (!cputime_eq(p->utime, cputime_zero) ||
+ 		     !cputime_eq(p->stime, cputime_zero)))
+diff --git a/kernel/exit.c b/kernel/exit.c
+index 8f6185e..dcc5665 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -22,6 +22,7 @@
+ #include <linux/fdtable.h>
+ #include <linux/binfmts.h>
+ #include <linux/nsproxy.h>
++#include <linux/virtinfo.h>
+ #include <linux/pid_namespace.h>
+ #include <linux/ptrace.h>
+ #include <linux/profile.h>
+@@ -45,13 +46,18 @@
+ #include <linux/resource.h>
+ #include <linux/blkdev.h>
+ #include <linux/task_io_accounting_ops.h>
++#include <linux/ve.h>
++#include <linux/fairsched.h>
++
++#include <bc/misc.h>
++#include <bc/oom_kill.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+ #include <asm/pgtable.h>
+ #include <asm/mmu_context.h>
+ 
+-static void exit_mm(struct task_struct * tsk);
++void exit_mm(struct task_struct * tsk);
+ 
+ static inline int task_detached(struct task_struct *p)
+ {
+@@ -67,6 +73,9 @@ static void __unhash_process(struct task_struct *p)
+ 		detach_pid(p, PIDTYPE_SID);
+ 
+ 		list_del_rcu(&p->tasks);
++#ifdef CONFIG_VE
++		list_del_rcu(&p->ve_task_info.vetask_list);
++#endif
+ 		__get_cpu_var(process_counts)--;
+ 	}
+ 	list_del_rcu(&p->thread_group);
+@@ -162,6 +171,8 @@ repeat:
+ 	ptrace_unlink(p);
+ 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ 	__exit_signal(p);
++	nr_zombie--;
++	atomic_inc(&nr_dead);
+ 
+ 	/*
+ 	 * If we are the last non-leader member of the thread
+@@ -183,9 +194,12 @@ repeat:
+ 		 */
+ 		zap_leader = task_detached(leader);
+ 	}
++	put_task_fairsched_node(p);
+ 
+ 	write_unlock_irq(&tasklist_lock);
+ 	release_thread(p);
++	ub_task_uncharge(p);
++	pput_ve(p->ve_task_info.owner_env);
+ 	call_rcu(&p->rcu, delayed_put_task_struct);
+ 
+ 	p = leader;
+@@ -515,6 +529,7 @@ void put_files_struct(struct files_struct *files)
+ 		free_fdtable(fdt);
+ 	}
+ }
++EXPORT_SYMBOL_GPL(put_files_struct);
+ 
+ void reset_files_struct(struct files_struct *files)
+ {
+@@ -652,13 +667,17 @@ assign_new_owner:
+  * Turn us into a lazy TLB process if we
+  * aren't already..
+  */
+-static void exit_mm(struct task_struct * tsk)
++void exit_mm(struct task_struct * tsk)
+ {
+ 	struct mm_struct *mm = tsk->mm;
+ 
+ 	mm_release(tsk, mm);
+ 	if (!mm)
+ 		return;
++
++	if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
++		mm->oom_killed = 1;
++
+ 	/*
+ 	 * Serialize with any possible pending coredump.
+ 	 * We must hold mmap_sem around checking core_waiters
+@@ -690,6 +709,7 @@ static void exit_mm(struct task_struct * tsk)
+ 	mm_update_next_owner(mm);
+ 	mmput(mm);
+ }
++EXPORT_SYMBOL_GPL(exit_mm);
+ 
+ static void
+ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+@@ -864,6 +884,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
+ 	    !capable(CAP_KILL))
+ 		tsk->exit_signal = SIGCHLD;
+ 
++	if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper)
++		/* We don't want people slaying init. */
++		tsk->exit_signal = SIGCHLD;
++
+ 	/* If something other than our normal parent is ptracing us, then
+ 	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+ 	 * only has special meaning to our real parent.
+@@ -880,6 +904,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
+ 	if (task_detached(tsk) && likely(!tsk->ptrace))
+ 		state = EXIT_DEAD;
+ 	tsk->exit_state = state;
++	nr_zombie++;
+ 
+ 	/* mt-exec, de_thread() is waiting for us */
+ 	if (thread_group_leader(tsk) &&
+@@ -953,6 +978,7 @@ static inline void exit_child_reaper(struct task_struct *tsk)
+ 	 * perform the role of the child_reaper.
+ 	 */
+ 	zap_pid_ns_processes(tsk->nsproxy->pid_ns);
++	(void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL);
+ }
+ 
+ NORET_TYPE void do_exit(long code)
+@@ -1023,12 +1049,14 @@ NORET_TYPE void do_exit(long code)
+ 	}
+ 	acct_collect(code, group_dead);
+ #ifdef CONFIG_FUTEX
+-	if (unlikely(tsk->robust_list))
+-		exit_robust_list(tsk);
++	if (!(tsk->flags & PF_EXIT_RESTART)) {
++		if (unlikely(tsk->robust_list))
++			exit_robust_list(tsk);
+ #ifdef CONFIG_COMPAT
+-	if (unlikely(tsk->compat_robust_list))
+-		compat_exit_robust_list(tsk);
++		if (unlikely(tsk->compat_robust_list))
++			compat_exit_robust_list(tsk);
+ #endif
++	}
+ #endif
+ 	if (group_dead)
+ 		tty_audit_exit();
+@@ -1057,8 +1085,16 @@ NORET_TYPE void do_exit(long code)
+ 	if (tsk->binfmt)
+ 		module_put(tsk->binfmt->module);
+ 
+-	proc_exit_connector(tsk);
+-	exit_notify(tsk, group_dead);
++	if (!(tsk->flags & PF_EXIT_RESTART)) {
++		proc_exit_connector(tsk);
++		exit_notify(tsk, group_dead);
++	} else {
++		write_lock_irq(&tasklist_lock);
++		tsk->exit_state = EXIT_ZOMBIE;
++		nr_zombie++;
++		write_unlock_irq(&tasklist_lock);
++		exit_task_namespaces(tsk);
++	}
+ #ifdef CONFIG_NUMA
+ 	mpol_put(tsk->mempolicy);
+ 	tsk->mempolicy = NULL;
+@@ -1719,6 +1755,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
+ 	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(sys_wait4);
+ 
+ #ifdef __ARCH_WANT_SYS_WAITPID
+ 
+diff --git a/kernel/fairsched.c b/kernel/fairsched.c
+new file mode 100644
+index 0000000..80eebdf
+--- /dev/null
++++ b/kernel/fairsched.c
+@@ -0,0 +1,633 @@
++/*
++ * Fair Scheduler
++ *
++ * Copyright (C) 2000-2008  SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/fairsched.h>
++#include <linux/err.h>
++#include <linux/module.h>
++
++struct fairsched_node fairsched_init_node = {
++	.id		= FAIRSCHED_INIT_NODE_ID,
++	.tg		= &init_task_group,
++#ifdef CONFIG_VE
++	.owner_env	= get_ve0(),
++#endif
++	.weight		= 1,
++};
++
++static DEFINE_MUTEX(fairsched_mutex);
++
++/* list protected with fairsched_mutex */
++static LIST_HEAD(fairsched_node_head);
++static int fairsched_nr_nodes;
++
++void __init fairsched_init_early(void)
++{
++	list_add(&fairsched_init_node.nodelist, &fairsched_node_head);
++	fairsched_nr_nodes++;
++}
++
++#define FSCHWEIGHT_BASE		512000
++
++/******************************************************************************
++ * cfs group shares = FSCHWEIGHT_BASE / fairsched weight
++ *
++ * vzctl cpuunits default 1000
++ * cfs shares default value is 1024 (see init_task_group_load in sched.c)
++ * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024
++ *                              ^--- from vzctl
++ * weight in 1..65535  -->  shares in 7..512000
++ * shares should be >1 (see comment in sched_group_set_shares function)
++ *****************************************************************************/
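++/* Worked example: cpuunits = 2000 gives weight = 500000 / 2000 = 250
++ * and shares = 512000 / 250 = 2048, i.e. twice the default share. */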
++
++static struct fairsched_node *fairsched_find(unsigned int id)
++{
++	struct fairsched_node *p;
++	list_for_each_entry(p, &fairsched_node_head, nodelist) {
++		if (p->id == id)
++			return p;
++	}
++	return NULL;
++}
++
++/******************************************************************************
++ * System calls
++ *
++ * All do_xxx functions are called under fairsched mutex and after
++ * capability check.
++ *
++ * The binary interfaces follow some other Fair Scheduler implementations
++ * (although some system call arguments are not needed for our implementation).
++ *****************************************************************************/
++
++static int do_fairsched_mknod(unsigned int parent, unsigned int weight,
++		unsigned int newid)
++{
++	struct fairsched_node *node;
++	int retval;
++
++	retval = -EINVAL;
++	if (weight < 1 || weight > FSCHWEIGHT_MAX)
++		goto out;
++	if (newid > INT_MAX)
++		goto out;
++
++	retval = -EBUSY;
++	if (fairsched_find(newid) != NULL)
++		goto out;
++
++	retval = -ENOMEM;
++	node = kzalloc(sizeof(*node), GFP_KERNEL);
++	if (node == NULL)
++		goto out;
++
++	node->tg = sched_create_group(&init_task_group);
++	if (IS_ERR(node->tg))
++		goto out_free;
++
++	node->id = newid;
++	node->weight = weight;
++	sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight);
++#ifdef CONFIG_VE
++	node->owner_env = get_exec_env();
++#endif
++	list_add(&node->nodelist, &fairsched_node_head);
++	fairsched_nr_nodes++;
++
++	retval = newid;
++out:
++	return retval;
++
++out_free:
++	kfree(node);
++	return retval;
++}
++
++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
++				    unsigned int newid)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_mknod(parent, weight, newid);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_mknod);
++
++static int do_fairsched_rmnod(unsigned int id)
++{
++	struct fairsched_node *node;
++	int retval;
++
++	retval = -EINVAL;
++	node = fairsched_find(id);
++	if (node == NULL)
++		goto out;
++	if (node == &fairsched_init_node)
++		goto out;
++
++	retval = -EBUSY;
++	if (node->refcnt)
++		goto out;
++
++	list_del(&node->nodelist);
++	fairsched_nr_nodes--;
++
++	sched_destroy_group(node->tg);
++	kfree(node);
++	retval = 0;
++out:
++	return retval;
++}
++
++asmlinkage int sys_fairsched_rmnod(unsigned int id)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_rmnod(id);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_rmnod);
++
++static int do_fairsched_chwt(unsigned int id, unsigned weight)
++{
++	struct fairsched_node *node;
++
++	if (id == 0)
++		return -EINVAL;
++	if (weight < 1 || weight > FSCHWEIGHT_MAX)
++		return -EINVAL;
++
++	node = fairsched_find(id);
++	if (node == NULL)
++		return -ENOENT;
++
++	node->weight = weight;
++	sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight);
++
++	return 0;
++}
++
++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_chwt(id, weight);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++
++static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus)
++{
++	struct fairsched_node *node;
++
++	if (id == 0)
++		return -EINVAL;
++
++	node = fairsched_find(id);
++	if (node == NULL)
++		return -ENOENT;
++
++	return 0;
++}
++
++asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_vcpus(id, vcpus);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_vcpus);
++
++static int do_fairsched_rate(unsigned int id, int op, unsigned rate)
++{
++	struct fairsched_node *node;
++	int retval;
++
++	if (id == 0)
++		return -EINVAL;
++	if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31)))
++		return -EINVAL;
++
++	node = fairsched_find(id);
++	if (node == NULL)
++		return -ENOENT;
++
++	retval = -EINVAL;
++	switch (op) {
++	case FAIRSCHED_SET_RATE:
++		node->rate = rate;
++		node->rate_limited = 1;
++		retval = rate;
++		break;
++	case FAIRSCHED_DROP_RATE:
++		node->rate = 0;
++		node->rate_limited = 0;
++		retval = 0;
++		break;
++	case FAIRSCHED_GET_RATE:
++		if (node->rate_limited)
++			retval = node->rate;
++		else
++			retval = -ENODATA;
++		break;
++	}
++	return retval;
++}
++
++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_rate(id, op, rate);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++
++static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid)
++{
++	struct task_struct *p;
++	struct fairsched_node *node;
++	int retval;
++
++	retval = -ENOENT;
++	node = fairsched_find(nodeid);
++	if (node == NULL)
++		goto out;
++
++	write_lock_irq(&tasklist_lock);
++	retval = -ESRCH;
++	p = find_task_by_pid(pid);
++	if (p == NULL)
++		goto out_unlock;
++
++	get_task_struct(p);
++	put_task_fairsched_node(p);
++	p->fsched_node = node;
++	get_task_fairsched_node(p);
++	write_unlock_irq(&tasklist_lock);
++
++	smp_wmb();
++	sched_move_task(p);
++	put_task_struct(p);
++	return 0;
++
++out_unlock:
++	write_unlock_irq(&tasklist_lock);
++out:
++	return retval;
++}
++
++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid)
++{
++	int retval;
++
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++
++	mutex_lock(&fairsched_mutex);
++	retval = do_fairsched_mvpr(pid, nodeid);
++	mutex_unlock(&fairsched_mutex);
++
++	return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_mvpr);
++
++#ifdef CONFIG_PROC_FS
++
++/*********************************************************************/
++/*
++ * proc interface
++ */
++/*********************************************************************/
++
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/vmalloc.h>
++
++struct fairsched_node_dump {
++	int id;
++	unsigned weight;
++	unsigned rate;
++	int rate_limited;
++	int nr_pcpu;
++	int nr_tasks, nr_runtasks;
++};
++
++struct fairsched_dump {
++	int len;
++	struct fairsched_node_dump nodes[0];
++};
++
++static struct fairsched_dump *fairsched_do_dump(int compat)
++{
++	int nr_nodes;
++	int len;
++	struct fairsched_dump *dump;
++	struct fairsched_node *node;
++	struct fairsched_node_dump *p;
++
++	mutex_lock(&fairsched_mutex);
++	nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1);
++	len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]);
++	dump = ub_vmalloc(len);
++	if (dump == NULL)
++		goto out;
++
++	p = dump->nodes;
++	list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) {
++		if ((char *)p - (char *)dump >= len)
++			break;
++		p->nr_tasks = 0;
++		p->nr_runtasks = 0;
++#ifdef CONFIG_VE
++		if (!ve_accessible(node->owner_env, get_exec_env()))
++			continue;
++		p->nr_tasks = atomic_read(&node->owner_env->pcounter);
++		p->nr_runtasks = nr_running_ve(node->owner_env);
++#endif
++		p->id = node->id;
++		p->weight = node->weight;
++		p->rate = node->rate;
++		p->rate_limited = node->rate_limited;
++		p->nr_pcpu = num_online_cpus();
++		p++;
++	}
++	dump->len = p - dump->nodes;
++out:
++	mutex_unlock(&fairsched_mutex);
++	return dump;
++}
++
++#define FAIRSCHED_PROC_HEADLINES 2
++
++#define FAIRSHED_DEBUG          " debug"
++
++#ifdef CONFIG_VE
++/*
++ * File format is dictated by compatibility reasons.
++ */
++static int fairsched_seq_show(struct seq_file *m, void *v)
++{
++	struct fairsched_dump *dump;
++	struct fairsched_node_dump *p;
++	unsigned vid, nid, pid, r;
++
++	dump = m->private;
++	p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL);
++	if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
++		if (p == dump->nodes)
++			seq_printf(m, "Version: 2.6 debug\n");
++		else if (p == dump->nodes + 1)
++			seq_printf(m,
++				       "      veid "
++				       "        id "
++				       "    parent "
++				       "weight "
++				       " rate "
++				       "tasks "
++				       "  run "
++				       "cpus"
++				       " "
++				       "flg "
++				       "ready "
++				       "           start_tag "
++				       "               value "
++				       "               delay"
++				       "\n");
++	} else {
++		p -= FAIRSCHED_PROC_HEADLINES;
++		vid = nid = pid = 0;
++		r = (unsigned long)v & 3;
++		if (p == dump->nodes) {
++			if (r == 2)
++				nid = p->id;
++		} else {
++			if (!r)
++				nid = p->id;
++			else if (r == 1)
++				vid = pid = p->id;
++			else
++				vid = p->id, nid = 1;
++		}
++		seq_printf(m,
++			       "%10u "
++			       "%10u %10u %6u %5u %5u %5u %4u"
++			       " "
++			       " %c%c %5u %20Lu %20Lu %20Lu"
++			       "\n",
++			       vid,
++			       nid,
++			       pid,
++			       p->weight,
++			       p->rate,
++			       p->nr_tasks,
++			       p->nr_runtasks,
++			       p->nr_pcpu,
++			       p->rate_limited ? 'L' : '.',
++			       '.',
++			       p->nr_runtasks,
++			       0ll, 0ll, 0ll);
++	}
++
++	return 0;
++}
++
++static void *fairsched_seq_start(struct seq_file *m, loff_t *pos)
++{
++	struct fairsched_dump *dump;
++	unsigned long l;
++
++	dump = m->private;
++	if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES)
++		return NULL;
++	if (*pos < FAIRSCHED_PROC_HEADLINES)
++		return dump->nodes + *pos;
++	/* guess why... */
++	l = (unsigned long)(dump->nodes +
++		((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3);
++	l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3;
++	return (void *)l;
++}
++static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	++*pos;
++	return fairsched_seq_start(m, pos);
++}
++#endif /* CONFIG_VE */
++
++static int fairsched2_seq_show(struct seq_file *m, void *v)
++{
++	struct fairsched_dump *dump;
++	struct fairsched_node_dump *p;
++
++	dump = m->private;
++	p = v;
++	if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
++		if (p == dump->nodes)
++			seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n");
++		else if (p == dump->nodes + 1)
++			seq_printf(m,
++				       "        id "
++				       "weight "
++				       " rate "
++				       "  run "
++				       "cpus"
++#ifdef FAIRSHED_DEBUG
++				       " "
++				       "flg "
++				       "ready "
++				       "           start_tag "
++				       "               value "
++				       "               delay"
++#endif
++				       "\n");
++	} else {
++		p -= FAIRSCHED_PROC_HEADLINES;
++		seq_printf(m,
++			       "%10u %6u %5u %5u %4u"
++#ifdef FAIRSHED_DEBUG
++			       " "
++			       " %c%c %5u %20Lu %20Lu %20Lu"
++#endif
++			       "\n",
++			       p->id,
++			       p->weight,
++			       p->rate,
++			       p->nr_runtasks,
++			       p->nr_pcpu
++#ifdef FAIRSHED_DEBUG
++			       ,
++			       p->rate_limited ? 'L' : '.',
++			       '.',
++			       p->nr_runtasks,
++			       0ll, 0ll, 0ll
++#endif
++			       );
++	}
++
++	return 0;
++}
++
++static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos)
++{
++	struct fairsched_dump *dump;
++
++	dump = m->private;
++	if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES)
++		return NULL;
++	return dump->nodes + *pos;
++}
++static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	++*pos;
++	return fairsched2_seq_start(m, pos);
++}
++static void fairsched2_seq_stop(struct seq_file *m, void *v)
++{
++}
++
++#ifdef CONFIG_VE
++static struct seq_operations fairsched_seq_op = {
++	.start		= fairsched_seq_start,
++	.next		= fairsched_seq_next,
++	.stop		= fairsched2_seq_stop,
++	.show		= fairsched_seq_show
++};
++#endif
++static struct seq_operations fairsched2_seq_op = {
++	.start		= fairsched2_seq_start,
++	.next		= fairsched2_seq_next,
++	.stop		= fairsched2_seq_stop,
++	.show		= fairsched2_seq_show
++};
++static int fairsched_seq_open(struct inode *inode, struct file *file)
++{
++	int ret;
++	struct seq_file *m;
++	int compat;
++
++#ifdef CONFIG_VE
++	compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1);
++	ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op);
++#else
++	compat = 0;
++	ret = seq_open(file, &fairsched2_seq_op);
++#endif
++	if (ret)
++		return ret;
++	m = file->private_data;
++	m->private = fairsched_do_dump(compat);
++	if (m->private == NULL) {
++		seq_release(inode, file);
++		ret = -ENOMEM;
++	}
++	return ret;
++}
++static int fairsched_seq_release(struct inode *inode, struct file *file)
++{
++	struct seq_file *m;
++	struct fairsched_dump *dump;
++
++	m = file->private_data;
++	dump = m->private;
++	m->private = NULL;
++	vfree(dump);
++	seq_release(inode, file);
++	return 0;
++}
++static struct file_operations proc_fairsched_operations = {
++	.open		= fairsched_seq_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= fairsched_seq_release
++};
++
++void __init fairsched_init_late(void)
++{
++	proc_create("fairsched", S_IRUGO, &glob_proc_root,
++			&proc_fairsched_operations);
++	proc_create("fairsched2", S_IRUGO, &glob_proc_root,
++			&proc_fairsched_operations);
++}
++
++#else
++
++void __init fairsched_init_late(void) { }
++
++#endif /* CONFIG_PROC_FS */
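
The fairsched start/next iterators above fold a 2-bit sub-row index into
the low bits of the seq_file cursor, so each node can emit up to three
/proc lines without any extra iterator state. A minimal userspace sketch
of that low-bit pointer-tagging trick (struct node, tag() and untag()
are illustrative names, not from the patch):

    #include <assert.h>
    #include <stdio.h>

    struct node { int id; };

    /* Tag a 4-byte-aligned pointer with a 2-bit sub-row index. */
    static void *tag(struct node *p, unsigned sub)
    {
            assert(((unsigned long)p & 3UL) == 0 && sub < 4);
            return (void *)((unsigned long)p | sub);
    }

    static struct node *untag(void *v, unsigned *sub)
    {
            *sub = (unsigned long)v & 3UL;
            return (struct node *)((unsigned long)v & ~3UL);
    }

    int main(void)
    {
            static struct node n = { .id = 7 }; /* static: suitably aligned */
            unsigned sub;
            struct node *p = untag(tag(&n, 2), &sub);

            printf("id=%d sub-row=%u\n", p->id, sub); /* id=7 sub-row=2 */
            return 0;
    }
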
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 19908b2..f366869 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -26,6 +26,7 @@
+ #include <linux/key.h>
+ #include <linux/binfmts.h>
+ #include <linux/mman.h>
++#include <linux/virtinfo.h>
+ #include <linux/fs.h>
+ #include <linux/nsproxy.h>
+ #include <linux/capability.h>
+@@ -54,6 +55,7 @@
+ #include <linux/tty.h>
+ #include <linux/proc_fs.h>
+ #include <linux/blkdev.h>
++#include <linux/ve.h>
+ 
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+@@ -62,17 +64,23 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ 
++#include <bc/vmpages.h>
++#include <bc/misc.h>
++#include <bc/oom_kill.h>
++
+ /*
+  * Protected counters by write_lock_irq(&tasklist_lock)
+  */
+ unsigned long total_forks;	/* Handle normal Linux uptimes. */
+ int nr_threads; 		/* The idle threads do not count.. */
++EXPORT_SYMBOL_GPL(nr_threads);
+ 
+ int max_threads;		/* tunable limit on nr_threads */
+ 
+ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+ 
+ __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
++EXPORT_SYMBOL(tasklist_lock);
+ 
+ int nr_processes(void)
+ {
+@@ -124,14 +132,20 @@ void __put_task_struct(struct task_struct *tsk)
+ 	WARN_ON(atomic_read(&tsk->usage));
+ 	WARN_ON(tsk == current);
+ 
++	ub_task_put(tsk);
+ 	security_task_free(tsk);
+ 	free_uid(tsk->user);
+ 	put_group_info(tsk->group_info);
+ 	delayacct_tsk_free(tsk);
+ 
++#ifdef CONFIG_VE
++	put_ve(VE_TASK_INFO(tsk)->owner_env);
++	atomic_dec(&nr_dead);
++#endif
+ 	if (!profile_handoff_task(tsk))
+ 		free_task(tsk);
+ }
++EXPORT_SYMBOL_GPL(__put_task_struct);
+ 
+ /*
+  * macro override instead of weak attribute alias, to workaround
+@@ -150,7 +164,7 @@ void __init fork_init(unsigned long mempages)
+ 	/* create a slab on which task_structs can be allocated */
+ 	task_struct_cachep =
+ 		kmem_cache_create("task_struct", sizeof(struct task_struct),
+-			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
++			ARCH_MIN_TASKALIGN, SLAB_PANIC|SLAB_UBC, NULL);
+ #endif
+ 
+ 	/* do the arch specific task caches init */
+@@ -270,6 +284,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ 			continue;
+ 		}
+ 		charge = 0;
++		if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
++					mpnt->vm_flags & ~VM_LOCKED,
++					mpnt->vm_file, UB_HARD))
++			goto fail_noch;
+ 		if (mpnt->vm_flags & VM_ACCOUNT) {
+ 			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ 			if (security_vm_enough_memory(len))
+@@ -316,7 +334,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ 		rb_parent = &tmp->vm_rb;
+ 
+ 		mm->map_count++;
+-		retval = copy_page_range(mm, oldmm, mpnt);
++		retval = copy_page_range(mm, oldmm, tmp, mpnt);
+ 
+ 		if (tmp->vm_ops && tmp->vm_ops->open)
+ 			tmp->vm_ops->open(tmp);
+@@ -335,6 +353,9 @@ out:
+ fail_nomem_policy:
+ 	kmem_cache_free(vm_area_cachep, tmp);
+ fail_nomem:
++	ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
++			mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
++fail_noch:
+ 	retval = -ENOMEM;
+ 	vm_unacct_memory(charge);
+ 	goto out;
+@@ -383,12 +404,22 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+ 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+ 	mm->cached_hole_size = ~0UL;
+ 	mm_init_owner(mm, p);
++	/*
++	 * This looks ugly, but when we come from
++	 *      sys_execve -> mm_alloc -> here
++	 * we need to get exec_ub, not task_ub. But when
++	 * we're here like this
++	 *      sys_fork() -> dup_mm -> here
++	 * we need task_ub, not the exec one... xemul
++	 */
++	set_mm_ub(mm, p);
+ 
+ 	if (likely(!mm_alloc_pgd(mm))) {
+ 		mm->def_flags = 0;
+ 		return mm;
+ 	}
+ 
++	put_mm_ub(mm);
+ 	free_mm(mm);
+ 	return NULL;
+ }
+@@ -407,6 +438,7 @@ struct mm_struct * mm_alloc(void)
+ 	}
+ 	return mm;
+ }
++EXPORT_SYMBOL_GPL(mm_alloc);
+ 
+ /*
+  * Called when the last reference to the mm
+@@ -418,6 +450,7 @@ void __mmdrop(struct mm_struct *mm)
+ 	BUG_ON(mm == &init_mm);
+ 	mm_free_pgd(mm);
+ 	destroy_context(mm);
++	put_mm_ub(mm);
+ 	free_mm(mm);
+ }
+ EXPORT_SYMBOL_GPL(__mmdrop);
+@@ -439,6 +472,9 @@ void mmput(struct mm_struct *mm)
+ 			spin_unlock(&mmlist_lock);
+ 		}
+ 		put_swap_token(mm);
++		(void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm);
++		if (mm->oom_killed)
++			ub_oom_task_dead(current);
+ 		mmdrop(mm);
+ 	}
+ }
+@@ -568,6 +604,7 @@ fail_nocontext:
+ 	 * because it calls destroy_context()
+ 	 */
+ 	mm_free_pgd(mm);
++	put_mm_ub(mm);
+ 	free_mm(mm);
+ 	return NULL;
+ }
+@@ -874,14 +911,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 					struct pt_regs *regs,
+ 					unsigned long stack_size,
+ 					int __user *child_tidptr,
+-					struct pid *pid)
++					struct pid *pid, pid_t vpid)
+ {
+ 	int retval;
+ 	struct task_struct *p;
+ 	int cgroup_callbacks_done = 0;
+ 
++#ifdef CONFIG_VE
++	if (clone_flags & CLONE_NAMESPACES_MASK)
++		return ERR_PTR(-EINVAL);
++#else
+ 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ 		return ERR_PTR(-EINVAL);
++#endif
+ 
+ 	/*
+ 	 * Thread groups must share signals as well, and detached threads
+@@ -909,6 +951,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 
+ 	rt_mutex_init_task(p);
+ 
++	if (ub_task_charge(current, p))
++		goto bad_fork_charge;
++
+ #ifdef CONFIG_TRACE_IRQFLAGS
+ 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
+ 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
+@@ -1064,7 +1109,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 
+ 	if (pid != &init_struct_pid) {
+ 		retval = -ENOMEM;
+-		pid = alloc_pid(task_active_pid_ns(p));
++		pid = alloc_pid(task_active_pid_ns(p), vpid);
+ 		if (!pid)
+ 			goto bad_fork_cleanup_io;
+ 
+@@ -1072,6 +1117,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 			retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+ 			if (retval < 0)
+ 				goto bad_fork_free_pid;
++			if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD)
++				task_active_pid_ns(p)->flags |= PID_NS_HIDDEN;
+ 		}
+ 	}
+ 
+@@ -1169,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 	 * thread can't slip out of an OOM kill (or normal SIGKILL).
+  	 */
+ 	recalc_sigpending();
+-	if (signal_pending(current)) {
++	if (signal_pending(current) && !vpid) {
+ 		spin_unlock(&current->sighand->siglock);
+ 		write_unlock_irq(&tasklist_lock);
+ 		retval = -ERESTARTNOINTR;
+@@ -1212,14 +1259,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ 			attach_pid(p, PIDTYPE_SID, task_session(current));
+ 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
++#ifdef CONFIG_VE
++			list_add_tail_rcu(&p->ve_task_info.vetask_list,
++					&p->ve_task_info.owner_env->vetask_lh);
++#endif
+ 			__get_cpu_var(process_counts)++;
+ 		}
+ 		attach_pid(p, PIDTYPE_PID, pid);
+ 		nr_threads++;
+ 	}
++	(void)get_ve(p->ve_task_info.owner_env);
++	pget_ve(p->ve_task_info.owner_env);
+ 
++#ifdef CONFIG_VE
++	seqcount_init(&p->ve_task_info.wakeup_lock);
++#endif
+ 	total_forks++;
+ 	spin_unlock(&current->sighand->siglock);
++	get_task_fairsched_node(p);
+ 	write_unlock_irq(&tasklist_lock);
+ 	proc_fork_connector(p);
+ 	cgroup_post_fork(p);
+@@ -1267,6 +1324,9 @@ bad_fork_cleanup_count:
+ 	atomic_dec(&p->user->processes);
+ 	free_uid(p->user);
+ bad_fork_free:
++	ub_task_uncharge(p);
++	ub_task_put(p);
++bad_fork_charge:
+ 	free_task(p);
+ fork_out:
+ 	return ERR_PTR(retval);
+@@ -1284,7 +1344,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
+ 	struct pt_regs regs;
+ 
+ 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
+-				&init_struct_pid);
++				&init_struct_pid, 0);
+ 	if (!IS_ERR(task))
+ 		init_idle(task, cpu);
+ 
+@@ -1313,12 +1373,13 @@ static int fork_traceflag(unsigned clone_flags)
+  * It copies the process, and if successful kick-starts
+  * it and waits for it to finish using the VM if required.
+  */
+-long do_fork(unsigned long clone_flags,
++long do_fork_pid(unsigned long clone_flags,
+ 	      unsigned long stack_start,
+ 	      struct pt_regs *regs,
+ 	      unsigned long stack_size,
+ 	      int __user *parent_tidptr,
+-	      int __user *child_tidptr)
++	      int __user *child_tidptr,
++	      long vpid)
+ {
+ 	struct task_struct *p;
+ 	int trace = 0;
+@@ -1341,6 +1402,10 @@ long do_fork(unsigned long clone_flags,
+ 		}
+ 	}
+ 
++	nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags);
++	if (nr)
++		return nr;
++
+ 	if (unlikely(current->ptrace)) {
+ 		trace = fork_traceflag (clone_flags);
+ 		if (trace)
+@@ -1348,7 +1413,7 @@ long do_fork(unsigned long clone_flags,
+ 	}
+ 
+ 	p = copy_process(clone_flags, stack_start, regs, stack_size,
+-			child_tidptr, NULL);
++			child_tidptr, NULL, vpid);
+ 	/*
+ 	 * Do this prior waking up the new thread - the thread pointer
+ 	 * might get invalid after that point, if the thread exits quickly.
+@@ -1374,6 +1439,8 @@ long do_fork(unsigned long clone_flags,
+ 			set_tsk_thread_flag(p, TIF_SIGPENDING);
+ 		}
+ 
++		(void)virtinfo_gencall(VIRTINFO_DOFORKRET, p);
++
+ 		if (!(clone_flags & CLONE_STOPPED))
+ 			wake_up_new_task(p, clone_flags);
+ 		else
+@@ -1396,6 +1463,8 @@ long do_fork(unsigned long clone_flags,
+ 	} else {
+ 		nr = PTR_ERR(p);
+ 	}
++
++	(void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr);
+ 	return nr;
+ }
+ 
+@@ -1411,27 +1480,40 @@ static void sighand_ctor(struct kmem_cache *cachep, void *data)
+ 	init_waitqueue_head(&sighand->signalfd_wqh);
+ }
+ 
++EXPORT_SYMBOL(do_fork_pid);
++
++long do_fork(unsigned long clone_flags,
++		unsigned long stack_start,
++		struct pt_regs *regs,
++		unsigned long stack_size,
++		int __user *parent_tidptr,
++		int __user *child_tidptr)
++{
++	return do_fork_pid(clone_flags, stack_start, regs, stack_size,
++			parent_tidptr, child_tidptr, 0);
++}
++
+ void __init proc_caches_init(void)
+ {
+ 	sighand_cachep = kmem_cache_create("sighand_cache",
+ 			sizeof(struct sighand_struct), 0,
+-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
++			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|SLAB_UBC,
+ 			sighand_ctor);
+ 	signal_cachep = kmem_cache_create("signal_cache",
+ 			sizeof(struct signal_struct), 0,
+-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ 	files_cachep = kmem_cache_create("files_cache",
+ 			sizeof(struct files_struct), 0,
+-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ 	fs_cachep = kmem_cache_create("fs_cache",
+ 			sizeof(struct fs_struct), 0,
+-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ 	vm_area_cachep = kmem_cache_create("vm_area_struct",
+ 			sizeof(struct vm_area_struct), 0,
+-			SLAB_PANIC, NULL);
++			SLAB_PANIC|SLAB_UBC, NULL);
+ 	mm_cachep = kmem_cache_create("mm_struct",
+ 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ }
+ 
+ /*
+@@ -1569,6 +1651,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
+ 				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
+ 				CLONE_NEWNET))
+ 		goto bad_unshare_out;
++#ifdef CONFIG_VE
++	if (unshare_flags & CLONE_NAMESPACES_MASK)
++		goto bad_unshare_out;
++#endif
+ 
+ 	/*
+ 	 * CLONE_NEWIPC must also detach from the undolist: after switching
+@@ -1587,9 +1673,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
+ 		goto bad_unshare_cleanup_sigh;
+ 	if ((err = unshare_fd(unshare_flags, &new_fd)))
+ 		goto bad_unshare_cleanup_vm;
++#ifndef CONFIG_VE
+ 	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ 			new_fs)))
+ 		goto bad_unshare_cleanup_fd;
++#endif
+ 
+ 	if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ 		if (do_sysvsem) {
+@@ -1633,7 +1721,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
+ 	if (new_nsproxy)
+ 		put_nsproxy(new_nsproxy);
+ 
++#ifndef CONFIG_VE
+ bad_unshare_cleanup_fd:
++#endif
+ 	if (new_fd)
+ 		put_files_struct(new_fd);
+ 
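
A pattern worth noting in the fork.c hunks above: every beancounter
charge added on the allocation path gets a dedicated unwind label on the
error path (fail_noch, bad_fork_charge), so charges and uncharges stay
paired. A compact userspace sketch of that goto-unwind discipline, where
charge() and uncharge() are hypothetical stand-ins for the ub_* calls:

    #include <stdio.h>
    #include <stdlib.h>

    static int charged;

    static int charge(void)    { charged++; return 0; } /* 0 on success */
    static void uncharge(void) { charged--; }

    static int make_object(void **out)
    {
            void *obj;

            if (charge())
                    goto fail_nocharge;

            obj = malloc(16);
            if (!obj)
                    goto fail_uncharge;

            *out = obj;     /* success: the object keeps its charge */
            return 0;

    fail_uncharge:
            uncharge();     /* pairs with the charge() above */
    fail_nocharge:
            return -1;
    }

    static void free_object(void *obj)
    {
            free(obj);
            uncharge();     /* release the charge taken at allocation */
    }

    int main(void)
    {
            void *o;

            if (!make_object(&o))
                    free_object(o);
            printf("outstanding charges: %d\n", charged); /* expect 0 */
            return 0;
    }
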
+diff --git a/kernel/futex.c b/kernel/futex.c
+index 7d1136e..a02be16 100644
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1198,8 +1198,6 @@ handle_fault:
+  */
+ #define FLAGS_SHARED  1
+ 
+-static long futex_wait_restart(struct restart_block *restart);
+-
+ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ 		      u32 val, ktime_t *abs_time, u32 bitset)
+ {
+@@ -1365,7 +1363,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ }
+ 
+ 
+-static long futex_wait_restart(struct restart_block *restart)
++long futex_wait_restart(struct restart_block *restart)
+ {
+ 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+ 	struct rw_semaphore *fshared = NULL;
+@@ -1378,6 +1376,7 @@ static long futex_wait_restart(struct restart_block *restart)
+ 	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+ 				restart->futex.bitset);
+ }
++EXPORT_SYMBOL_GPL(futex_wait_restart);
+ 
+ 
+ /*
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index ab80515..fd9ae33 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -1524,6 +1524,7 @@ out:
+ 	destroy_hrtimer_on_stack(&t.timer);
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart);
+ 
+ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ 		       const enum hrtimer_mode mode, const clockid_t clockid)
+diff --git a/kernel/kmod.c b/kernel/kmod.c
+index 8df97d3..4c93a6c 100644
+--- a/kernel/kmod.c
++++ b/kernel/kmod.c
+@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...)
+ #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
+ 	static int kmod_loop_msg;
+ 
++	/* Don't allow request_module() inside VE. */
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
+ 	va_start(args, fmt);
+ 	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ 	va_end(args);
+@@ -451,6 +455,9 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
+ 	DECLARE_COMPLETION_ONSTACK(done);
+ 	int retval = 0;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
+ 	helper_lock();
+ 	if (sub_info->path[0] == '\0')
+ 		goto out;
+diff --git a/kernel/kprobes.c b/kernel/kprobes.c
+index 1485ca8..5b4040a 100644
+--- a/kernel/kprobes.c
++++ b/kernel/kprobes.c
+@@ -118,14 +118,14 @@ static int __kprobes check_safety(void)
+ 	ret = freeze_processes();
+ 	if (ret == 0) {
+ 		struct task_struct *p, *q;
+-		do_each_thread(p, q) {
++		do_each_thread_all(p, q) {
+ 			if (p != current && p->state == TASK_RUNNING &&
+ 			    p->pid != 0) {
+ 				printk("Check failed: %s is running\n",p->comm);
+ 				ret = -1;
+ 				goto loop_end;
+ 			}
+-		} while_each_thread(p, q);
++		} while_each_thread_all(p, q);
+ 	}
+ loop_end:
+ 	thaw_processes();
+diff --git a/kernel/lockdep.c b/kernel/lockdep.c
+index 81a4e4a..f59230a 100644
+--- a/kernel/lockdep.c
++++ b/kernel/lockdep.c
+@@ -3182,7 +3182,7 @@ retry:
+ 	if (count != 10)
+ 		printk(" locked it.\n");
+ 
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		/*
+ 		 * It's not reliable to print a task's held locks
+ 		 * if it's not sleeping (or if it's not the current
+@@ -3195,7 +3195,7 @@ retry:
+ 		if (!unlock)
+ 			if (read_trylock(&tasklist_lock))
+ 				unlock = 1;
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 
+ 	printk("\n");
+ 	printk("=============================================\n\n");
+diff --git a/kernel/module.c b/kernel/module.c
+index 5f80478..131c925 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -2463,6 +2463,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
+ static void *m_start(struct seq_file *m, loff_t *pos)
+ {
+ 	mutex_lock(&module_mutex);
++	if (!ve_is_super(get_exec_env()))
++		return NULL;
+ 	return seq_list_start(&modules, *pos);
+ }
+ 
+diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
+index adc7851..1c0848f 100644
+--- a/kernel/nsproxy.c
++++ b/kernel/nsproxy.c
+@@ -27,6 +27,14 @@ static struct kmem_cache *nsproxy_cachep;
+ 
+ struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+ 
++void get_task_namespaces(struct task_struct *tsk)
++{
++	struct nsproxy *ns = tsk->nsproxy;
++	if (ns) {
++		get_nsproxy(ns);
++	}
++}
++
+ /*
+  * creates a copy of "orig" with refcount 1.
+  */
+@@ -134,10 +142,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+ 				CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
+ 		return 0;
+ 
++#ifndef CONFIG_VE
+ 	if (!capable(CAP_SYS_ADMIN)) {
+ 		err = -EPERM;
+ 		goto out;
+ 	}
++#endif
+ 
+ 	/*
+ 	 * CLONE_NEWIPC must detach from the undolist: after switching
+@@ -169,6 +179,7 @@ out:
+ 	put_nsproxy(old_ns);
+ 	return err;
+ }
++EXPORT_SYMBOL(copy_namespaces);
+ 
+ void free_nsproxy(struct nsproxy *ns)
+ {
+@@ -185,6 +196,22 @@ void free_nsproxy(struct nsproxy *ns)
+ 	put_net(ns->net_ns);
+ 	kmem_cache_free(nsproxy_cachep, ns);
+ }
++EXPORT_SYMBOL(free_nsproxy);
++
++struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk)
++{
++	struct mnt_namespace *mnt_ns = NULL;
++
++	task_lock(tsk);
++	if (tsk->nsproxy)
++		mnt_ns = tsk->nsproxy->mnt_ns;
++	if (mnt_ns)
++		get_mnt_ns(mnt_ns);
++	task_unlock(tsk);
++
++	return mnt_ns;
++}
++EXPORT_SYMBOL(get_task_mnt_ns);
+ 
+ /*
+  * Called from unshare. Unshare all the namespaces part of nsproxy.
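
get_task_mnt_ns() above is the canonical take-lock, test, pin, unlock
sequence: the namespace pointer is only dereferenced and its refcount
bumped while task_lock() holds off a concurrent clearing of the pointer.
Roughly the same shape in portable C with a pthread mutex (struct ns and
the global task_ns are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct ns { int refcount; };        /* toy stand-in for mnt_namespace */

    static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct ns *task_ns;          /* may be cleared concurrently */

    static struct ns *get_task_ns(void)
    {
            struct ns *ns;

            pthread_mutex_lock(&task_lock);
            ns = task_ns;               /* read the pointer under the lock... */
            if (ns)
                    ns->refcount++;     /* ...and pin it before unlocking */
            pthread_mutex_unlock(&task_lock);
            return ns;
    }

    int main(void)
    {
            static struct ns the_ns = { .refcount = 1 };

            task_ns = &the_ns;
            struct ns *ns = get_task_ns();
            printf("refcount=%d\n", ns ? ns->refcount : 0);  /* 2 */
            return 0;
    }
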
+diff --git a/kernel/pid.c b/kernel/pid.c
+index 20d59fa..833f5b4 100644
+--- a/kernel/pid.c
++++ b/kernel/pid.c
+@@ -32,6 +32,7 @@
+ #include <linux/init.h>
+ #include <linux/bootmem.h>
+ #include <linux/hash.h>
++#include <bc/kmem.h>
+ #include <linux/pid_namespace.h>
+ #include <linux/init_task.h>
+ #include <linux/syscalls.h>
+@@ -109,7 +110,7 @@ EXPORT_SYMBOL(is_container_init);
+  * For now it is easier to be safe than to prove it can't happen.
+  */
+ 
+-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
++__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+ 
+ static void free_pidmap(struct upid *upid)
+ {
+@@ -120,8 +121,9 @@ static void free_pidmap(struct upid *upid)
+ 	clear_bit(offset, map->page);
+ 	atomic_inc(&map->nr_free);
+ }
++EXPORT_SYMBOL_GPL(free_pidmap);
+ 
+-static int alloc_pidmap(struct pid_namespace *pid_ns)
++int alloc_pidmap(struct pid_namespace *pid_ns)
+ {
+ 	int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ 	struct pidmap *map;
+@@ -181,6 +183,36 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
+ 	return -1;
+ }
+ 
++int set_pidmap(struct pid_namespace *pid_ns, pid_t pid)
++{
++	int offset;
++	struct pidmap *map;
++
++	offset = pid & BITS_PER_PAGE_MASK;
++	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
++	if (unlikely(!map->page)) {
++		void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
++		/*
++		 * Free the page if someone raced with us
++		 * installing it:
++		 */
++		spin_lock_irq(&pidmap_lock);
++		if (map->page)
++			kfree(page);
++		else
++			map->page = page;
++		spin_unlock_irq(&pidmap_lock);
++		if (unlikely(!map->page))
++			return -ENOMEM;
++	}
++
++	if (test_and_set_bit(offset, map->page))
++		return -EBUSY;
++
++	atomic_dec(&map->nr_free);
++	return pid;
++}
++
+ int next_pidmap(struct pid_namespace *pid_ns, int last)
+ {
+ 	int offset;
+@@ -226,25 +258,33 @@ void free_pid(struct pid *pid)
+ 	/* We can be called with write_lock_irq(&tasklist_lock) held */
+ 	int i;
+ 	unsigned long flags;
++	struct upid *upid;
+ 
+ 	spin_lock_irqsave(&pidmap_lock, flags);
+-	for (i = 0; i <= pid->level; i++)
+-		hlist_del_rcu(&pid->numbers[i].pid_chain);
+-	spin_unlock_irqrestore(&pidmap_lock, flags);
++	for (i = 0; i <= pid->level; i++) {
++		upid = &pid->numbers[i];
++		if (!hlist_unhashed(&upid->pid_chain))
++			hlist_del_rcu(&upid->pid_chain);
++	}
++	spin_unlock(&pidmap_lock);
++	ub_kmemsize_uncharge(pid->ub, pid->numbers[pid->level].ns->pid_cachep->objuse);
++	local_irq_restore(flags);
+ 
+ 	for (i = 0; i <= pid->level; i++)
+ 		free_pidmap(pid->numbers + i);
+-
++	put_beancounter(pid->ub);
+ 	call_rcu(&pid->rcu, delayed_put_pid);
+ }
++EXPORT_SYMBOL_GPL(free_pid);
+ 
+-struct pid *alloc_pid(struct pid_namespace *ns)
++struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid)
+ {
+ 	struct pid *pid;
+ 	enum pid_type type;
+ 	int i, nr;
+ 	struct pid_namespace *tmp;
+ 	struct upid *upid;
++	struct user_beancounter *ub;
+ 
+ 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+ 	if (!pid)
+@@ -252,7 +292,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
+ 
+ 	tmp = ns;
+ 	for (i = ns->level; i >= 0; i--) {
+-		nr = alloc_pidmap(tmp);
++		if (vpid != 0 && i == ns->level)
++			nr = set_pidmap(tmp, vpid);
++		else
++			nr = alloc_pidmap(tmp);
+ 		if (nr < 0)
+ 			goto out_free;
+ 
+@@ -267,17 +310,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
+ 	for (type = 0; type < PIDTYPE_MAX; ++type)
+ 		INIT_HLIST_HEAD(&pid->tasks[type]);
+ 
++#ifdef CONFIG_BEANCOUNTERS
++	ub = get_exec_ub();
++	local_irq_disable();
++	if (ub_kmemsize_charge(ub, ns->pid_cachep->objuse, UB_HARD))
++		goto out_enable;
++	pid->ub = get_beancounter(ub);
++	spin_lock(&pidmap_lock);
++#else
+ 	spin_lock_irq(&pidmap_lock);
++#endif
+ 	for (i = ns->level; i >= 0; i--) {
+ 		upid = &pid->numbers[i];
+ 		hlist_add_head_rcu(&upid->pid_chain,
+ 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
++		if (upid->ns->flags & PID_NS_HIDDEN)
++			while (i--)
++				INIT_HLIST_NODE(&pid->numbers[i].pid_chain);
+ 	}
+ 	spin_unlock_irq(&pidmap_lock);
+ 
+ out:
+ 	return pid;
+ 
++out_enable:
++	local_irq_enable();
++	put_pid_ns(ns);
+ out_free:
+ 	while (++i <= ns->level)
+ 		free_pidmap(pid->numbers + i);
+@@ -286,6 +344,7 @@ out_free:
+ 	pid = NULL;
+ 	goto out;
+ }
++EXPORT_SYMBOL_GPL(alloc_pid);
+ 
+ struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
+ {
+@@ -314,6 +373,45 @@ struct pid *find_pid(int nr)
+ }
+ EXPORT_SYMBOL_GPL(find_pid);
+ 
++void reattach_pid(struct task_struct *tsk, enum pid_type type,
++		struct pid *pid)
++{
++	int i;
++	struct pid *old_pid;
++	struct pid_link *link;
++	struct upid *upid;
++
++	link = &tsk->pids[type];
++	old_pid = link->pid;
++
++	hlist_del_rcu(&link->node);
++	link->pid = pid;
++	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
++
++	if (type != PIDTYPE_PID) {
++		for (i = PIDTYPE_MAX; --i >= 0; )
++			if (!hlist_empty(&old_pid->tasks[i]))
++				return;
++
++		for (i = 0; i < pid->level; i++)
++			hlist_del_rcu(&old_pid->numbers[i].pid_chain);
++	} else {
++		for (i = PIDTYPE_MAX; --i >= 0; )
++			if (!hlist_empty(&old_pid->tasks[i]))
++				BUG();
++
++		for (i = 0; i < pid->level; i++)
++			hlist_replace_rcu(&old_pid->numbers[i].pid_chain,
++					&pid->numbers[i].pid_chain);
++
++		upid = &pid->numbers[pid->level];
++		hlist_add_head_rcu(&upid->pid_chain,
++				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
++	}
++
++	call_rcu(&old_pid->rcu, delayed_put_pid);
++}
++
+ /*
+  * attach_pid() must be called with the tasklist_lock write-held.
+  */
+@@ -326,6 +424,7 @@ void attach_pid(struct task_struct *task, enum pid_type type,
+ 	link->pid = pid;
+ 	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
+ }
++EXPORT_SYMBOL_GPL(attach_pid);
+ 
+ static void __change_pid(struct task_struct *task, enum pid_type type,
+ 			struct pid *new)
+@@ -346,6 +445,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
+ 
+ 	free_pid(pid);
+ }
++EXPORT_SYMBOL_GPL(detach_pid);
+ 
+ void detach_pid(struct task_struct *task, enum pid_type type)
+ {
+@@ -498,6 +598,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
+ }
+ EXPORT_SYMBOL_GPL(find_get_pid);
+ 
++pid_t pid_to_vpid(pid_t nr)
++{
++	struct pid *pid;
++
++	pid = find_pid(nr);
++	if (pid)
++		return pid->numbers[pid->level].nr;
++	return -1;
++}
++EXPORT_SYMBOL_GPL(pid_to_vpid);
++
+ /*
+  * The pid hash table is scaled according to the amount of memory in the
+  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
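
set_pidmap() above is the reserve-this-exact-id counterpart to
alloc_pidmap(): the bitmap page is allocated lazily, a racing installer's
spare page is freed under pidmap_lock, and test_and_set_bit() yields
-EBUSY when the slot is already taken. A single-threaded userspace sketch
of the reservation half (the page layout and MAP_BITS are simplified
assumptions):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>

    #define MAP_BITS 4096
    #define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long page[MAP_BITS / WORD_BITS];
    static int nr_free = MAP_BITS;

    /* Reserve exactly 'id'; return id, or -EBUSY if already taken. */
    static int set_pidmap(int id)
    {
            unsigned long bit = 1UL << (id % WORD_BITS);
            unsigned long *word = &page[id / WORD_BITS];

            if (*word & bit)
                    return -EBUSY;
            *word |= bit;
            nr_free--;
            return id;
    }

    int main(void)
    {
            int first = set_pidmap(42);
            int second = set_pidmap(42);

            printf("%d %d free=%d\n", first, second, nr_free);
            /* expect: 42 -16 free=4095 */
            return 0;
    }
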
+diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
+index 98702b4..c478b80 100644
+--- a/kernel/pid_namespace.c
++++ b/kernel/pid_namespace.c
+@@ -13,6 +13,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/err.h>
+ 
++#include <bc/kmem.h>
++
+ #define BITS_PER_PAGE		(PAGE_SIZE*8)
+ 
+ struct pid_cache {
+@@ -87,6 +89,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
+ 	ns->last_pid = 0;
+ 	ns->child_reaper = NULL;
+ 	ns->level = level;
++	ns->flags = 0;
+ 
+ 	set_bit(0, ns->pidmap[0].page);
+ 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+@@ -151,6 +154,160 @@ void free_pid_ns(struct kref *kref)
+ 		put_pid_ns(parent);
+ }
+ 
++/*
++ * this is a dirty ugly hack.
++ */
++
++static int __pid_ns_attach_task(struct pid_namespace *ns,
++		struct task_struct *tsk, pid_t nr)
++{
++	struct pid *pid;
++	enum pid_type type;
++	unsigned long old_size, new_size;
++
++	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
++	if (!pid)
++		goto out;
++
++	if (nr == 0)
++		nr = alloc_pidmap(ns);
++	else
++		nr = set_pidmap(ns, nr);
++
++	if (nr < 0)
++		goto out_free;
++
++	memcpy(pid, task_pid(tsk),
++		sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid));
++	get_pid_ns(ns);
++	pid->level++;
++	BUG_ON(pid->level != ns->level);
++	pid->numbers[pid->level].nr = nr;
++	pid->numbers[pid->level].ns = ns;
++	atomic_set(&pid->count, 1);
++	for (type = 0; type < PIDTYPE_MAX; ++type)
++		INIT_HLIST_HEAD(&pid->tasks[type]);
++
++	old_size = pid->numbers[pid->level - 1].ns->pid_cachep->objuse;
++	new_size = pid->numbers[pid->level].ns->pid_cachep->objuse;
++	local_irq_disable();
++	/*
++	 * Depending on sizeof(struct foo), cache flags (redzoning, etc)
++	 * and actual CPU (cacheline_size() jumps from 64 to 128 bytes after
++	 * CPU detection) new size can very well be smaller than old size.
++	 */
++	if (new_size > old_size) {
++		if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0)
++			goto out_enable;
++	} else
++		ub_kmemsize_uncharge(pid->ub, old_size - new_size);
++
++	write_lock(&tasklist_lock);
++
++	spin_lock(&pidmap_lock);
++	reattach_pid(tsk, PIDTYPE_SID, pid);
++	set_task_session(tsk, pid_nr(pid));
++	reattach_pid(tsk, PIDTYPE_PGID, pid);
++	tsk->signal->__pgrp = pid_nr(pid);
++	current->signal->tty_old_pgrp = NULL;
++
++	reattach_pid(tsk, PIDTYPE_PID, pid);
++	spin_unlock(&pidmap_lock);
++
++	write_unlock_irq(&tasklist_lock);
++
++	return 0;
++
++out_enable:
++	local_irq_enable();
++	put_pid_ns(ns);
++out_free:
++	kmem_cache_free(ns->pid_cachep, pid);
++out:
++	return -ENOMEM;
++}
++
++int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk)
++{
++	return __pid_ns_attach_task(ns, tsk, 0);
++}
++EXPORT_SYMBOL_GPL(pid_ns_attach_task);
++
++int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk)
++{
++	int err;
++
++	err = __pid_ns_attach_task(ns, tsk, 1);
++	if (err < 0)
++		return err;
++
++	ns->child_reaper = tsk;
++	return 0;
++}
++EXPORT_SYMBOL_GPL(pid_ns_attach_init);
++
++#ifdef CONFIG_VE
++static noinline void show_lost_task(struct task_struct *p)
++{
++	char buf[512] = "N/A";
++#ifdef CONFIG_PROC_FS
++	extern char * task_sig(struct task_struct *p, char *buffer);
++
++	task_sig(p, buf);
++#endif
++	printk("Lost task: %d/%s/%p\nSignals:%s\n", p->pid, p->comm, p, buf);
++}
++
++static void zap_ve_processes(struct ve_struct *env)
++{
++	/*
++	 * Here the VE changes its state into "not running".
++	 * op_sem taken for write is a barrier to all VE manipulations from
++	 * ioctl: it waits for operations currently in progress and blocks all
++	 * subsequent operations until is_running is set to 0 and op_sem is
++	 * released.
++	 */
++	down_write(&env->op_sem);
++	env->is_running = 0;
++	up_write(&env->op_sem);
++
++	/* wait for all of init's children to exit */
++	while (atomic_read(&env->pcounter) > 1) {
++		struct task_struct *g, *p;
++		long delay = 1;
++
++		if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0)
++			continue;
++		/* it was ECHILD or no more children somehow */
++		if (atomic_read(&env->pcounter) == 1)
++			break;
++
++		/* clear all signals to avoid wakeups */
++		if (signal_pending(current))
++			flush_signals(current);
++		/* we still have a child that has not been signalled */
++		__set_current_state(TASK_INTERRUPTIBLE);
++		schedule_timeout(delay);
++		delay = (delay < HZ) ? (delay << 1) : HZ;
++		read_lock(&tasklist_lock);
++		do_each_thread_ve(g, p) {
++			if (p != current) {
++				/*
++				 * By that time no processes other than entered
++				 * ones may exist in the VE; if some were missed
++				 * by zap_pid_ns_processes() it is a BUG
++				 */
++				if (!p->did_ve_enter)
++					show_lost_task(p);
++
++				force_sig_specific(SIGKILL, p);
++			}
++		} while_each_thread_ve(g, p);
++		read_unlock(&tasklist_lock);
++	}
++}
++#endif
++
+ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
+ {
+ 	int nr;
+@@ -183,6 +340,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
+ 	} while (rc != -ECHILD);
+ 
+ 
++#ifdef CONFIG_VE
++	zap_ve_processes(get_exec_env());
++#endif
+ 	/* Child reaper for the pid namespace is going away */
+ 	pid_ns->child_reaper = NULL;
+ 	return;
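
The reaping loop in zap_ve_processes() above waits for stragglers with a
doubling sleep capped at HZ, so a quickly emptying VE is polled often
while a stuck one costs at most one wakeup per second. The backoff
schedule in portable C, with done() as a hypothetical stand-in for the
pcounter check:

    #include <stdio.h>
    #include <time.h>

    #define HZ 1000 /* cap; milliseconds here rather than jiffies */

    static int done(void) { static int n; return ++n > 5; } /* toy predicate */

    int main(void)
    {
            long delay = 1; /* start with the shortest sleep... */

            while (!done()) {
                    struct timespec ts = {
                            delay / 1000, (delay % 1000) * 1000000L
                    };
                    nanosleep(&ts, NULL);
                    /* ...and double it up to the HZ cap, as in the patch */
                    delay = (delay < HZ) ? (delay << 1) : HZ;
                    printf("slept, next delay %ld ms\n", delay);
            }
            return 0;
    }
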
+diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
+index dbd8398..f331727 100644
+--- a/kernel/posix-timers.c
++++ b/kernel/posix-timers.c
+@@ -31,6 +31,8 @@
+  * POSIX clocks & timers
+  */
+ #include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/smp_lock.h>
+ #include <linux/interrupt.h>
+ #include <linux/slab.h>
+ #include <linux/time.h>
+@@ -46,6 +48,9 @@
+ #include <linux/wait.h>
+ #include <linux/workqueue.h>
+ #include <linux/module.h>
++#include <linux/pid_namespace.h>
++
++#include <bc/beancounter.h>
+ 
+ /*
+  * Management arrays for POSIX timers.	 Timers are kept in slab memory
+@@ -240,8 +245,8 @@ static __init int init_posix_timers(void)
+ 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ 
+ 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
+-					sizeof (struct k_itimer), 0, SLAB_PANIC,
+-					NULL);
++					sizeof (struct k_itimer), 0,
++					SLAB_PANIC|SLAB_UBC, NULL);
+ 	idr_init(&posix_timers_id);
+ 	return 0;
+ }
+@@ -298,6 +303,13 @@ void do_schedule_next_timer(struct siginfo *info)
+ 
+ int posix_timer_event(struct k_itimer *timr,int si_private)
+ {
++	int ret;
++	struct ve_struct *ve;
++	struct user_beancounter *ub;
++
++	ve = set_exec_env(timr->it_process->ve_task_info.owner_env);
++	ub = set_exec_ub(timr->it_process->task_bc.task_ub);
++
+ 	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+ 	timr->sigq->info.si_sys_private = si_private;
+ 	/* Send signal to the process that owns this timer.*/
+@@ -310,10 +322,10 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
+ 
+ 	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+ 		struct task_struct *leader;
+-		int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
++		ret = send_sigqueue(timr->sigq, timr->it_process, 0);
+ 
+ 		if (likely(ret >= 0))
+-			return ret;
++			goto out;
+ 
+ 		timr->it_sigev_notify = SIGEV_SIGNAL;
+ 		leader = timr->it_process->group_leader;
+@@ -321,7 +333,11 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
+ 		timr->it_process = leader;
+ 	}
+ 
+-	return send_sigqueue(timr->sigq, timr->it_process, 1);
++	ret = send_sigqueue(timr->sigq, timr->it_process, 1);
++out:
++	(void)set_exec_ub(ub);
++	(void)set_exec_env(ve);
++	return ret;
+ }
+ EXPORT_SYMBOL_GPL(posix_timer_event);
+ 
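
posix_timer_event() above briefly switches the execution environment and
beancounter to the timer owner's, delivers the signal, and restores both
in reverse order on the way out. The save-swap-restore idiom in a
userspace sketch, where set_ctx() is a hypothetical stand-in for
set_exec_env()/set_exec_ub():

    #include <stdio.h>

    struct ctx { const char *name; };

    static struct ctx *current_ctx;

    /* Install a new context and hand back the old one for restore. */
    static struct ctx *set_ctx(struct ctx *new_ctx)
    {
            struct ctx *old = current_ctx;
            current_ctx = new_ctx;
            return old;
    }

    static void do_work(void)
    {
            printf("working as %s\n", current_ctx->name);
    }

    int main(void)
    {
            struct ctx host = { "host" }, owner = { "timer-owner" };
            struct ctx *saved;

            current_ctx = &host;
            saved = set_ctx(&owner);    /* switch to the owner's context */
            do_work();
            (void)set_ctx(saved);       /* restore, mirroring the patch */
            printf("back as %s\n", current_ctx->name);
            return 0;
    }
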
+diff --git a/kernel/power/process.c b/kernel/power/process.c
+index f1d0b34..6bfb79f 100644
+--- a/kernel/power/process.c
++++ b/kernel/power/process.c
+@@ -14,6 +14,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/freezer.h>
+ 
++static atomic_t global_suspend = ATOMIC_INIT(0);
++
+ /* 
+  * Timeout for stopping processes
+  */
+@@ -26,7 +28,9 @@ static inline int freezeable(struct task_struct * p)
+ {
+ 	if ((p == current) ||
+ 	    (p->flags & PF_NOFREEZE) ||
+-	    (p->exit_state != 0))
++	    (p->exit_state != 0) ||
++	    (p->state == TASK_STOPPED) ||
++	    (p->state == TASK_TRACED))
+ 		return 0;
+ 	return 1;
+ }
+@@ -50,6 +54,28 @@ void refrigerator(void)
+ 	   processes around? */
+ 	long save;
+ 
++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE)
++	save = current->state;
++	current->state = TASK_UNINTERRUPTIBLE;
++
++	spin_lock_irq(&current->sighand->siglock);
++	if (test_and_clear_thread_flag(TIF_FREEZE)) {
++		recalc_sigpending(); /* We sent fake signal, clean it up */
++		if (atomic_read(&global_suspend) ||
++				atomic_read(&get_exec_env()->suspend))
++			current->flags |= PF_FROZEN;
++		else
++			current->state = save;
++	} else {
++		/* Freeze request could be canceled before we entered
++		 * refrigerator(). In this case we do nothing. */
++		current->state = save;
++	}
++	spin_unlock_irq(&current->sighand->siglock);
++
++	while (current->flags & PF_FROZEN)
++		schedule();
++#else
+ 	task_lock(current);
+ 	if (freezing(current)) {
+ 		frozen_process();
+@@ -71,6 +97,7 @@ void refrigerator(void)
+ 			break;
+ 		schedule();
+ 	}
++#endif
+ 	pr_debug("%s left refrigerator\n", current->comm);
+ 	__set_current_state(save);
+ }
+@@ -171,7 +198,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
+ 	do {
+ 		todo = 0;
+ 		read_lock(&tasklist_lock);
+-		do_each_thread(g, p) {
++		do_each_thread_all(g, p) {
+ 			if (frozen(p) || !freezeable(p))
+ 				continue;
+ 
+@@ -187,7 +214,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
+ 			if (!task_is_stopped_or_traced(p) &&
+ 			    !freezer_should_skip(p))
+ 				todo++;
+-		} while_each_thread(g, p);
++		} while_each_thread_all(g, p);
+ 		read_unlock(&tasklist_lock);
+ 		yield();			/* Yield is okay here */
+ 		if (time_after(jiffies, end_time))
+@@ -211,13 +238,13 @@ static int try_to_freeze_tasks(int freeze_user_space)
+ 				elapsed_csecs / 100, elapsed_csecs % 100, todo);
+ 		show_state();
+ 		read_lock(&tasklist_lock);
+-		do_each_thread(g, p) {
++		do_each_thread_all(g, p) {
+ 			task_lock(p);
+ 			if (freezing(p) && !freezer_should_skip(p))
+ 				printk(KERN_ERR " %s\n", p->comm);
+ 			cancel_freezing(p);
+ 			task_unlock(p);
+-		} while_each_thread(g, p);
++		} while_each_thread_all(g, p);
+ 		read_unlock(&tasklist_lock);
+ 	} else {
+ 		printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
+@@ -234,6 +261,7 @@ int freeze_processes(void)
+ {
+ 	int error;
+ 
++	atomic_inc(&global_suspend);
+ 	printk("Freezing user space processes ... ");
+ 	error = try_to_freeze_tasks(FREEZER_USER_SPACE);
+ 	if (error)
+@@ -248,6 +276,7 @@ int freeze_processes(void)
+  Exit:
+ 	BUG_ON(in_atomic());
+ 	printk("\n");
++	atomic_dec(&global_suspend);
+ 	return error;
+ }
+ 
+@@ -256,15 +285,17 @@ static void thaw_tasks(int thaw_user_space)
+ 	struct task_struct *g, *p;
+ 
+ 	read_lock(&tasklist_lock);
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		if (!freezeable(p))
+ 			continue;
+ 
+ 		if (!p->mm == thaw_user_space)
+ 			continue;
+ 
+-		thaw_process(p);
+-	} while_each_thread(g, p);
++		if (!thaw_process(p))
++			printk(KERN_WARNING " Strange, %s not stopped\n",
++				p->comm );
++				p->comm);
+ 	read_unlock(&tasklist_lock);
+ }
+ 
+diff --git a/kernel/printk.c b/kernel/printk.c
+index e2129e8..f472b1a 100644
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -31,7 +31,9 @@
+ #include <linux/smp.h>
+ #include <linux/security.h>
+ #include <linux/bootmem.h>
++#include <linux/vzratelimit.h>
+ #include <linux/syscalls.h>
++#include <linux/veprintk.h>
+ 
+ #include <asm/uaccess.h>
+ 
+@@ -90,7 +92,7 @@ static int console_locked, console_suspended;
+  * It is also used in interesting ways to provide interlocking in
+  * release_console_sem().
+  */
+-static DEFINE_SPINLOCK(logbuf_lock);
++DEFINE_SPINLOCK(logbuf_lock);
+ 
+ #define LOG_BUF_MASK (log_buf_len-1)
+ #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+@@ -124,6 +126,7 @@ static int preferred_console = -1;
+ 
+ /* Flag: console code may call schedule() */
+ static int console_may_schedule;
++int console_silence_loglevel;
+ 
+ #ifdef CONFIG_PRINTK
+ 
+@@ -132,6 +135,19 @@ static char *log_buf = __log_buf;
+ static int log_buf_len = __LOG_BUF_LEN;
+ static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+ 
++static int __init setup_console_silencelevel(char *str)
++{
++	int level;
++
++	if (get_option(&str, &level) != 1)
++		return 0;
++
++	console_silence_loglevel = level;
++	return 1;
++}
++
++__setup("silencelevel=", setup_console_silencelevel);
++
+ static int __init log_buf_len_setup(char *str)
+ {
+ 	unsigned size = memparse(str, &str);
+@@ -302,6 +318,9 @@ int do_syslog(int type, char __user *buf, int len)
+ 	char c;
+ 	int error = 0;
+ 
++	if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7))
++		goto out;
++
+ 	error = security_syslog(type);
+ 	if (error)
+ 		return error;
+@@ -322,15 +341,15 @@ int do_syslog(int type, char __user *buf, int len)
+ 			error = -EFAULT;
+ 			goto out;
+ 		}
+-		error = wait_event_interruptible(log_wait,
+-							(log_start - log_end));
++		error = wait_event_interruptible(ve_log_wait,
++						(ve_log_start - ve_log_end));
+ 		if (error)
+ 			goto out;
+ 		i = 0;
+ 		spin_lock_irq(&logbuf_lock);
+-		while (!error && (log_start != log_end) && i < len) {
+-			c = LOG_BUF(log_start);
+-			log_start++;
++		while (!error && (ve_log_start != ve_log_end) && i < len) {
++			c = VE_LOG_BUF(ve_log_start);
++			ve_log_start++;
+ 			spin_unlock_irq(&logbuf_lock);
+ 			error = __put_user(c,buf);
+ 			buf++;
+@@ -356,15 +375,17 @@ int do_syslog(int type, char __user *buf, int len)
+ 			error = -EFAULT;
+ 			goto out;
+ 		}
++		if (ve_log_buf == NULL)
++			goto out;
+ 		count = len;
+-		if (count > log_buf_len)
+-			count = log_buf_len;
++		if (count > ve_log_buf_len)
++			count = ve_log_buf_len;
+ 		spin_lock_irq(&logbuf_lock);
+-		if (count > logged_chars)
+-			count = logged_chars;
++		if (count > ve_logged_chars)
++			count = ve_logged_chars;
+ 		if (do_clear)
+-			logged_chars = 0;
+-		limit = log_end;
++			ve_logged_chars = 0;
++		limit = ve_log_end;
+ 		/*
+ 		 * __put_user() could sleep, and while we sleep
+ 		 * printk() could overwrite the messages
+@@ -373,9 +394,9 @@ int do_syslog(int type, char __user *buf, int len)
+ 		 */
+ 		for (i = 0; i < count && !error; i++) {
+ 			j = limit-1-i;
+-			if (j + log_buf_len < log_end)
++			if (j + ve_log_buf_len < ve_log_end)
+ 				break;
+-			c = LOG_BUF(j);
++			c = VE_LOG_BUF(j);
+ 			spin_unlock_irq(&logbuf_lock);
+ 			error = __put_user(c,&buf[count-1-i]);
+ 			cond_resched();
+@@ -399,7 +420,7 @@ int do_syslog(int type, char __user *buf, int len)
+ 		}
+ 		break;
+ 	case 5:		/* Clear ring buffer */
+-		logged_chars = 0;
++		ve_logged_chars = 0;
+ 		break;
+ 	case 6:		/* Disable logging to console */
+ 		console_loglevel = minimum_console_loglevel;
+@@ -411,16 +432,19 @@ int do_syslog(int type, char __user *buf, int len)
+ 		error = -EINVAL;
+ 		if (len < 1 || len > 8)
+ 			goto out;
++		error = 0;
++		/* VE has no console, so return success */
++		if (!ve_is_super(get_exec_env()))
++			goto out;
+ 		if (len < minimum_console_loglevel)
+ 			len = minimum_console_loglevel;
+ 		console_loglevel = len;
+-		error = 0;
+ 		break;
+ 	case 9:		/* Number of chars in the log buffer */
+-		error = log_end - log_start;
++		error = ve_log_end - ve_log_start;
+ 		break;
+ 	case 10:	/* Size of the log buffer */
+-		error = log_buf_len;
++		error = ve_log_buf_len;
+ 		break;
+ 	default:
+ 		error = -EINVAL;
+@@ -531,14 +555,14 @@ static void call_console_drivers(unsigned start, unsigned end)
+ 
+ static void emit_log_char(char c)
+ {
+-	LOG_BUF(log_end) = c;
+-	log_end++;
+-	if (log_end - log_start > log_buf_len)
+-		log_start = log_end - log_buf_len;
+-	if (log_end - con_start > log_buf_len)
+-		con_start = log_end - log_buf_len;
+-	if (logged_chars < log_buf_len)
+-		logged_chars++;
++	VE_LOG_BUF(ve_log_end) = c;
++	ve_log_end++;
++	if (ve_log_end - ve_log_start > ve_log_buf_len)
++		ve_log_start = ve_log_end - ve_log_buf_len;
++	if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len)
++		con_start = ve_log_end - ve_log_buf_len;
++	if (ve_logged_chars < ve_log_buf_len)
++		ve_logged_chars++;
+ }
+ 
+ /*
+@@ -604,6 +628,30 @@ static int have_callable_console(void)
+  * printf(3)
+  */
+ 
++static inline int ve_log_init(void)
++{
++#ifdef CONFIG_VE
++	if (ve_log_buf != NULL)
++		return 0;
++
++	if (ve_is_super(get_exec_env())) {
++		ve0._log_wait = &log_wait;
++		ve0._log_start = &log_start;
++		ve0._log_end = &log_end;
++		ve0._logged_chars = &logged_chars;
++		ve0.log_buf = log_buf;
++		return 0;
++	}
++
++	ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC);
++	if (!ve_log_buf)
++		return -ENOMEM;
++
++	memset(ve_log_buf, 0, ve_log_buf_len);
++#endif
++	return 0;
++}
++
+ asmlinkage int printk(const char *fmt, ...)
+ {
+ 	va_list args;
+@@ -670,7 +718,7 @@ static const char printk_recursion_bug_msg [] =
+ 			KERN_CRIT "BUG: recent printk recursion!\n";
+ static int printk_recursion_bug;
+ 
+-asmlinkage int vprintk(const char *fmt, va_list args)
++asmlinkage int __vprintk(const char *fmt, va_list args)
+ {
+ 	static int log_level_unknown = 1;
+ 	static char printk_buf[1024];
+@@ -679,6 +727,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
+ 	int printed_len = 0;
+ 	int this_cpu;
+ 	char *p;
++	int err, need_wake;
+ 
+ 	boot_delay_msec();
+ 
+@@ -709,6 +758,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
+ 	spin_lock(&logbuf_lock);
+ 	printk_cpu = this_cpu;
+ 
++	err = ve_log_init();
++	if (err) {
++		spin_unlock_irqrestore(&logbuf_lock, flags);
++		return err;
++	}
++
+ 	if (printk_recursion_bug) {
+ 		printk_recursion_bug = 0;
+ 		strcpy(printk_buf, printk_recursion_bug_msg);
+@@ -785,7 +840,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
+ 	 * will release 'logbuf_lock' regardless of whether it
+ 	 * actually gets the semaphore or not.
+ 	 */
+-	if (acquire_console_semaphore_for_printk(this_cpu))
++	if (!ve_is_super(get_exec_env())) {
++		need_wake = (ve_log_start != ve_log_end);
++		spin_unlock_irqrestore(&logbuf_lock, flags);
++		if (!oops_in_progress && need_wake)
++			wake_up_interruptible(&ve_log_wait);
++	} else if (acquire_console_semaphore_for_printk(this_cpu))
+ 		release_console_sem();
+ 
+ 	lockdep_on();
+@@ -798,6 +858,41 @@ out_restore_irqs:
+ EXPORT_SYMBOL(printk);
+ EXPORT_SYMBOL(vprintk);
+ 
++asmlinkage int vprintk(const char *fmt, va_list args)
++{
++	int i;
++	struct ve_struct *env;
++
++	env = set_exec_env(get_ve0());
++	i = __vprintk(fmt, args);
++	(void)set_exec_env(env);
++	return i;
++}
++
++asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
++{
++	int printed_len;
++
++	printed_len = 0;
++	if (ve_is_super(get_exec_env()) || (dst & VE0_LOG))
++		printed_len = vprintk(fmt, args);
++	if (!ve_is_super(get_exec_env()) && (dst & VE_LOG))
++		printed_len = __vprintk(fmt, args);
++	return printed_len;
++}
++
++asmlinkage int ve_printk(int dst, const char *fmt, ...)
++{
++	va_list args;
++	int printed_len;
++
++	va_start(args, fmt);
++	printed_len = ve_vprintk(dst, fmt, args);
++	va_end(args);
++	return printed_len;
++}
++EXPORT_SYMBOL(ve_printk);
++
+ #else
+ 
+ asmlinkage long sys_syslog(int type, char __user *buf, int len)
+@@ -1346,6 +1441,36 @@ int printk_ratelimit(void)
+ }
+ EXPORT_SYMBOL(printk_ratelimit);
+ 
++/*
++ *	Rate limiting stuff.
++ */
++int vz_ratelimit(struct vz_rate_info *p)
++{
++	unsigned long cjif, djif;
++	unsigned long flags;
++	static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
++	long new_bucket;
++
++	spin_lock_irqsave(&ratelimit_lock, flags);
++	cjif = jiffies;
++	djif = cjif - p->last;
++	if (djif < p->interval) {
++		if (p->bucket >= p->burst) {
++			spin_unlock_irqrestore(&ratelimit_lock, flags);
++			return 0;
++		}
++		p->bucket++;
++	} else {
++		new_bucket = p->bucket - (djif / (unsigned)p->interval);
++		if (new_bucket < 0)
++			new_bucket = 0;
++		p->bucket = new_bucket + 1;
++	}
++	p->last = cjif;
++	spin_unlock_irqrestore(&ratelimit_lock, flags);
++	return 1;
++}
++
+ /**
+  * printk_timed_ratelimit - caller-controlled printk ratelimiting
+  * @caller_jiffies: pointer to caller's state
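
vz_ratelimit() above is a token bucket: events inside one interval each
cost a token up to burst, and a longer gap drains the bucket by one token
per elapsed interval before charging the new event. The same arithmetic
in a userspace sketch (milliseconds stand in for jiffies; names are
illustrative):

    #include <stdio.h>

    struct rate_info { long interval, burst, bucket, last; };

    /* Return 1 if the event may proceed, 0 if it is rate-limited. */
    static int ratelimit(struct rate_info *p, long now)
    {
            long elapsed = now - p->last;

            if (elapsed < p->interval) {
                    if (p->bucket >= p->burst)
                            return 0;   /* burst exhausted; 'last' unchanged */
                    p->bucket++;
            } else {
                    long nb = p->bucket - elapsed / p->interval;
                    p->bucket = (nb < 0 ? 0 : nb) + 1; /* drain, then charge */
            }
            p->last = now;
            return 1;
    }

    int main(void)
    {
            struct rate_info r = { .interval = 100, .burst = 2 };
            long t[] = { 0, 10, 20, 30, 500 };

            for (int i = 0; i < 5; i++)
                    printf("t=%ld -> %d\n", t[i], ratelimit(&r, t[i]));
            /* expect: 1 1 0 0 1 */
            return 0;
    }
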
+diff --git a/kernel/ptrace.c b/kernel/ptrace.c
+index 6c19e94..df8f075 100644
+--- a/kernel/ptrace.c
++++ b/kernel/ptrace.c
+@@ -132,6 +132,8 @@ int __ptrace_may_attach(struct task_struct *task)
+ 	 * or halting the specified task is impossible.
+ 	 */
+ 	int dumpable = 0;
++	int vps_dumpable = 0;
++
+ 	/* Don't let security modules deny introspection */
+ 	if (task == current)
+ 		return 0;
+@@ -143,11 +145,17 @@ int __ptrace_may_attach(struct task_struct *task)
+ 	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+ 		return -EPERM;
+ 	smp_rmb();
+-	if (task->mm)
++	if (task->mm) {
+ 		dumpable = get_dumpable(task->mm);
++		vps_dumpable = (task->mm->vps_dumpable == 1);
++	}
++
+ 	if (!dumpable && !capable(CAP_SYS_PTRACE))
+ 		return -EPERM;
+-
++	if (!vps_dumpable && !ve_is_super(get_exec_env()))
++		return -EPERM;
++	if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env()))
++		return -EPERM;
+ 	return security_ptrace(current, task);
+ }
+ 
+@@ -198,6 +206,8 @@ repeat:
+ 	retval = __ptrace_may_attach(task);
+ 	if (retval)
+ 		goto bad;
++	if (task->mm->vps_dumpable == 2)
++		goto bad;
+ 
+ 	/* Go */
+ 	task->ptrace |= PT_PTRACED;
+@@ -291,6 +301,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
+ 	}
+ 	return copied;
+ }
++EXPORT_SYMBOL_GPL(access_process_vm);
+ 
+ static int ptrace_setoptions(struct task_struct *child, long data)
+ {
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 4e2f603..57a7d99 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -70,6 +70,7 @@
+ #include <linux/bootmem.h>
+ #include <linux/debugfs.h>
+ #include <linux/ctype.h>
++#include <linux/fairsched.h>
+ 
+ #include <asm/tlb.h>
+ #include <asm/irq_regs.h>
+@@ -340,6 +341,8 @@ static inline struct task_group *task_group(struct task_struct *p)
+ #elif defined(CONFIG_CGROUP_SCHED)
+ 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
+ 				struct task_group, css);
++#elif defined(CONFIG_VZ_FAIRSCHED)
++	tg = p->fsched_node->tg;
+ #else
+ 	tg = &init_task_group;
+ #endif
+@@ -509,6 +512,9 @@ struct rq {
+ 	 */
+ 	unsigned long nr_uninterruptible;
+ 
++	unsigned long nr_sleeping;
++	unsigned long nr_stopped;
++
+ 	struct task_struct *curr, *idle;
+ 	unsigned long next_balance;
+ 	struct mm_struct *prev_mm;
+@@ -578,6 +584,11 @@ static inline int cpu_of(struct rq *rq)
+ #endif
+ }
+ 
++struct kernel_stat_glob kstat_glob;
++DEFINE_SPINLOCK(kstat_glb_lock);
++EXPORT_SYMBOL(kstat_glob);
++EXPORT_SYMBOL(kstat_glb_lock);
++
+ /*
+  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+  * See detach_destroy_domains: synchronize_sched for details.
+@@ -981,6 +992,217 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+ 	spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+ 
++#ifdef CONFIG_VE
++static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu)
++{
++	VE_CPU_STATS(ve, cpu)->nr_iowait++;
++}
++
++static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu)
++{
++	VE_CPU_STATS(ve, cpu)->nr_iowait--;
++}
++
++static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu)
++{
++	VE_CPU_STATS(ve, cpu)->nr_unint++;
++}
++
++static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu)
++{
++	VE_CPU_STATS(ve, cpu)->nr_unint--;
++}
++
++#define cycles_after(a, b)	((long long)(b) - (long long)(a) < 0)
++
++cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu)
++{
++	struct ve_cpu_stats *ve_stat;
++	unsigned v;
++	cycles_t strt, ret, cycles;
++
++	ve_stat = VE_CPU_STATS(ve, cpu);
++	do {
++		v = read_seqcount_begin(&ve_stat->stat_lock);
++		ret = ve_stat->idle_time;
++		strt = ve_stat->strt_idle_time;
++		if (strt && nr_uninterruptible_ve(ve) == 0) {
++			cycles = get_cycles();
++			if (cycles_after(cycles, strt))
++				ret += cycles - strt;
++		}
++	} while (read_seqcount_retry(&ve_stat->stat_lock, v));
++	return ret;
++}
++EXPORT_SYMBOL(ve_sched_get_idle_time);
++
++cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu)
++{
++	struct ve_cpu_stats *ve_stat;
++	unsigned v;
++	cycles_t strt, ret, cycles;
++
++	ve_stat = VE_CPU_STATS(ve, cpu);
++	do {
++		v = read_seqcount_begin(&ve_stat->stat_lock);
++		ret = ve_stat->iowait_time;
++		strt = ve_stat->strt_idle_time;
++		if (strt && nr_iowait_ve(ve) > 0) {
++			cycles = get_cycles();
++			if (cycles_after(cycles, strt))
++				ret += cycles - strt;
++		}
++	} while (read_seqcount_retry(&ve_stat->stat_lock, v));
++	return ret;
++}
++EXPORT_SYMBOL(ve_sched_get_iowait_time);
++
++static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles)
++{
++	struct ve_cpu_stats *ve_stat;
++
++	ve_stat = VE_CPU_STATS(ve, cpu);
++
++	write_seqcount_begin(&ve_stat->stat_lock);
++	if (ve_stat->strt_idle_time) {
++		if (cycles_after(cycles, ve_stat->strt_idle_time)) {
++			if (nr_iowait_ve(ve) == 0)
++				ve_stat->idle_time +=
++					cycles - ve_stat->strt_idle_time;
++			else
++				ve_stat->iowait_time +=
++					cycles - ve_stat->strt_idle_time;
++		}
++		ve_stat->strt_idle_time = 0;
++	}
++	write_seqcount_end(&ve_stat->stat_lock);
++}
++
++static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles)
++{
++	struct ve_cpu_stats *ve_stat;
++
++	ve_stat = VE_CPU_STATS(ve, cpu);
++
++	write_seqcount_begin(&ve_stat->stat_lock);
++	ve_stat->strt_idle_time = cycles;
++	write_seqcount_end(&ve_stat->stat_lock);
++}
++
++static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles)
++{
++	if (++VE_CPU_STATS(ve, cpu)->nr_running == 1)
++		ve_stop_idle(ve, cpu, cycles);
++}
++
++static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles)
++{
++	if (--VE_CPU_STATS(ve, cpu)->nr_running == 0)
++		ve_strt_idle(ve, cpu, cycles);
++}
++
++void ve_sched_attach(struct ve_struct *target_ve)
++{
++	struct task_struct *tsk;
++	unsigned int cpu;
++	cycles_t cycles;
++
++	tsk = current;
++	preempt_disable();
++	cycles = get_cycles();
++	cpu = task_cpu(tsk);
++	ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles);
++	ve_nr_running_inc(target_ve, cpu, cycles);
++	preempt_enable();
++}
++EXPORT_SYMBOL(ve_sched_attach);
++
++static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc)
++{
++	struct ve_task_info *ti;
++
++	ti = VE_TASK_INFO(p);
++	write_seqcount_begin(&ti->wakeup_lock);
++	ti->wakeup_stamp = cyc;
++	write_seqcount_end(&ti->wakeup_lock);
++}
++
++static inline void update_sched_lat(struct task_struct *t, cycles_t cycles)
++{
++	int cpu;
++	cycles_t ve_wstamp;
++
++	/* safe due to runqueue lock */
++	cpu = smp_processor_id();
++	ve_wstamp = t->ve_task_info.wakeup_stamp;
++
++	if (ve_wstamp && cycles > ve_wstamp) {
++		KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
++				cpu, cycles - ve_wstamp);
++		KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve,
++				cpu, cycles - ve_wstamp);
++	}
++}
++
++static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles)
++{
++#ifdef CONFIG_FAIRSCHED
++	if (prev != this_pcpu()->idle) {
++#else
++	if (prev != this_rq()->idle) {
++#endif
++		VE_CPU_STATS(prev->ve_task_info.owner_env,
++				smp_processor_id())->used_time +=
++			cycles - prev->ve_task_info.sched_time;
++
++		prev->ve_task_info.sched_time = cycles;
++	}
++}
++#else
++static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles)
++{
++}
++
++static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles)
++{
++}
++
++static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu)
++{
++}
++
++static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu)
++{
++}
++
++static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu)
++{
++}
++
++static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu)
++{
++}
++
++static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles)
++{
++}
++#endif
++
++struct task_nrs_struct {
++	long nr_running;
++	long nr_unint;
++	long nr_stopped;
++	long nr_sleeping;
++	long nr_iowait;
++	long long nr_switches;
++} ____cacheline_aligned_in_smp;
++
++unsigned long nr_zombie = 0;	/* protected by tasklist_lock */
++EXPORT_SYMBOL(nr_zombie);
++
++atomic_t nr_dead = ATOMIC_INIT(0);
++EXPORT_SYMBOL(nr_dead);
++
+ /*
+  * this_rq_lock - lock this runqueue and disable interrupts.
+  */
+@@ -1608,11 +1830,21 @@ static int effective_prio(struct task_struct *p)
+  */
+ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+ {
+-	if (task_contributes_to_load(p))
++	cycles_t cycles;
++
++#ifdef CONFIG_VE
++	cycles = get_cycles();
++	write_wakeup_stamp(p, cycles);
++	p->ve_task_info.sleep_time += cycles;
++#endif
++	if (task_contributes_to_load(p)) {
+ 		rq->nr_uninterruptible--;
++		ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p));
++	}
+ 
+ 	enqueue_task(rq, p, wakeup);
+ 	inc_nr_running(p, rq);
++	ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles);
+ }
+ 
+ /*
+@@ -1620,6 +1852,30 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+  */
+ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+ {
++	cycles_t cycles;
++#ifdef CONFIG_VE
++	unsigned int cpu, pcpu;
++	struct ve_struct *ve;
++
++	cycles = get_cycles();
++	cpu = task_cpu(p);
++	pcpu = smp_processor_id();
++	ve = p->ve_task_info.owner_env;
++
++	p->ve_task_info.sleep_time -= cycles;
++#endif
++	if (p->state == TASK_UNINTERRUPTIBLE) {
++		ve_nr_unint_inc(ve, cpu);
++	}
++	if (p->state == TASK_INTERRUPTIBLE) {
++		rq->nr_sleeping++;
++	}
++	if (p->state == TASK_STOPPED) {
++		rq->nr_stopped++;
++	}
++
++	ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles);
++
+ 	if (task_contributes_to_load(p))
+ 		rq->nr_uninterruptible++;
+ 
+@@ -1843,6 +2099,7 @@ void wait_task_inactive(struct task_struct *p)
+ 		break;
+ 	}
+ }
++EXPORT_SYMBOL_GPL(wait_task_inactive);
+ 
+ /***
+  * kick_process - kick a running thread to enter/exit the kernel
+@@ -2248,6 +2505,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
+ 	/* Want to start with kernel preemption disabled. */
+ 	task_thread_info(p)->preempt_count = 1;
+ #endif
++#ifdef CONFIG_VE
++	/* cosmetic: sleep till wakeup below */
++	p->ve_task_info.sleep_time -= get_cycles();
++#endif
+ 	put_cpu();
+ }
+ 
+@@ -2278,6 +2539,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+ 		 */
+ 		p->sched_class->task_new(rq, p);
+ 		inc_nr_running(p, rq);
++		ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p),
++				  get_cycles());
+ 	}
+ 	check_preempt_curr(rq, p);
+ #ifdef CONFIG_SMP
+@@ -2439,6 +2702,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
+ 	if (current->set_child_tid)
+ 		put_user(task_pid_vnr(current), current->set_child_tid);
+ }
++EXPORT_SYMBOL_GPL(schedule_tail);
+ 
+ /*
+  * context_switch - switch to the new MM and the new
+@@ -2509,6 +2773,7 @@ unsigned long nr_running(void)
+ 
+ 	return sum;
+ }
++EXPORT_SYMBOL_GPL(nr_running);
+ 
+ unsigned long nr_uninterruptible(void)
+ {
+@@ -2526,6 +2791,7 @@ unsigned long nr_uninterruptible(void)
+ 
+ 	return sum;
+ }
++EXPORT_SYMBOL_GPL(nr_uninterruptible);
+ 
+ unsigned long long nr_context_switches(void)
+ {
+@@ -2563,6 +2829,72 @@ unsigned long nr_active(void)
+ 	return running + uninterruptible;
+ }
+ 
++unsigned long nr_stopped(void)
++{
++	unsigned long i, sum = 0;
++
++	for_each_online_cpu(i)
++		sum += cpu_rq(i)->nr_stopped;
++	if (unlikely((long)sum < 0))
++		sum = 0;
++	return sum;
++}
++EXPORT_SYMBOL(nr_stopped);
++
++unsigned long nr_sleeping(void)
++{
++	unsigned long i, sum = 0;
++
++	for_each_online_cpu(i)
++		sum += cpu_rq(i)->nr_sleeping;
++	if (unlikely((long)sum < 0))
++		sum = 0;
++	return sum;
++}
++EXPORT_SYMBOL(nr_sleeping);
++
++#ifdef CONFIG_VE
++unsigned long nr_running_ve(struct ve_struct *ve)
++{
++	int i;
++	long sum = 0;
++	cpumask_t ve_cpus;
++
++	ve_cpu_online_map(ve, &ve_cpus);
++	for_each_cpu_mask(i, ve_cpus)
++		sum += VE_CPU_STATS(ve, i)->nr_running;
++	return (unsigned long)(sum < 0 ? 0 : sum);
++}
++EXPORT_SYMBOL(nr_running_ve);
++
++unsigned long nr_uninterruptible_ve(struct ve_struct *ve)
++{
++	int i;
++	long sum = 0;
++	cpumask_t ve_cpus;
++
++	sum = 0;
++	ve_cpu_online_map(ve, &ve_cpus);
++	for_each_cpu_mask(i, ve_cpus)
++		sum += VE_CPU_STATS(ve, i)->nr_unint;
++	return (unsigned long)(sum < 0 ? 0 : sum);
++}
++EXPORT_SYMBOL(nr_uninterruptible_ve);
++
++unsigned long nr_iowait_ve(struct ve_struct *ve)
++{
++	int i;
++	long sum = 0;
++	cpumask_t ve_cpus;
++
++	ve_cpu_online_map(ve, &ve_cpus);
++	for_each_cpu_mask(i, ve_cpus)
++		sum += VE_CPU_STATS(ve, i)->nr_iowait;
++	return (unsigned long)(sum < 0 ? 0 : sum);
++}
++EXPORT_SYMBOL(nr_iowait_ve);
++#endif
++
+ /*
+  * Update rq->cpu_load[] statistics. This function is usually called every
+  * scheduler tick (TICK_NSEC).
+@@ -2593,6 +2925,16 @@ static void update_cpu_load(struct rq *this_rq)
+ 	}
+ }
+ 
++#ifdef CONFIG_VE
++#define update_ve_cpu_time(p, time, tick)			\
++	do {							\
++		VE_CPU_STATS((p)->ve_task_info.owner_env,	\
++				task_cpu(p))->time += tick;	\
++	} while (0)
++#else
++#define update_ve_cpu_time(p, time, tick)      do { } while (0)
++#endif
++
+ #ifdef CONFIG_SMP
+ 
+ /*
+@@ -2720,8 +3062,15 @@ void sched_exec(void)
+ static void pull_task(struct rq *src_rq, struct task_struct *p,
+ 		      struct rq *this_rq, int this_cpu)
+ {
++	struct ve_struct *ve;
++	cycles_t cycles = get_cycles();
++
++	ve = VE_TASK_INFO(p)->owner_env;
++
+ 	deactivate_task(src_rq, p, 0);
++	ve_nr_running_dec(ve, task_cpu(p), cycles);
+ 	set_task_cpu(p, this_cpu);
++	ve_nr_running_inc(ve, task_cpu(p), cycles);
+ 	activate_task(this_rq, p, 0);
+ 	/*
+ 	 * Note that idle threads have a prio of MAX_PRIO, for this test
+@@ -3891,10 +4240,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
+ 
+ 	/* Add user time to cpustat. */
+ 	tmp = cputime_to_cputime64(cputime);
+-	if (TASK_NICE(p) > 0)
++	if (TASK_NICE(p) > 0) {
+ 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+-	else
++		update_ve_cpu_time(p, nice, tmp);
++	} else {
+ 		cpustat->user = cputime64_add(cpustat->user, tmp);
++		update_ve_cpu_time(p, user, tmp);
++	}
+ }
+ 
+ /*
+@@ -3948,6 +4300,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
+ 
+ 	/* Add system time to cpustat. */
+ 	tmp = cputime_to_cputime64(cputime);
++	update_ve_cpu_time(p, system, tmp);
+ 	if (hardirq_count() - hardirq_offset)
+ 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ 	else if (softirq_count())
+@@ -4188,12 +4541,30 @@ need_resched_nonpreemptible:
+ 	next = pick_next_task(rq, prev);
+ 
+ 	if (likely(prev != next)) {
++		cycles_t cycles = get_cycles();
++
+ 		sched_info_switch(prev, next);
+ 
+ 		rq->nr_switches++;
+ 		rq->curr = next;
+ 		++*switch_count;
+ 
++#ifdef CONFIG_VE
++		prev->ve_task_info.sleep_stamp = cycles;
++		if (prev->state == TASK_RUNNING && prev != this_rq()->idle)
++			write_wakeup_stamp(prev, cycles);
++		update_sched_lat(next, cycles);
++
++		/* because next & prev are protected by the
++		 * runqueue lock we need not worry about
++		 * wakeup_stamp and sched_time protection
++		 * (same thing in 'else' branch below)
++		 */
++		update_ve_task_info(prev, cycles);
++		next->ve_task_info.sched_time = cycles;
++		write_wakeup_stamp(next, 0);
++#endif
++
+ 		context_switch(rq, prev, next); /* unlocks the rq */
+ 		/*
+ 		 * the context switch might have flipped the stack from under
+@@ -4201,8 +4572,10 @@ need_resched_nonpreemptible:
+ 		 */
+ 		cpu = smp_processor_id();
+ 		rq = cpu_rq(cpu);
+-	} else
++	} else {
++		update_ve_task_info(prev, get_cycles());
+ 		spin_unlock_irq(&rq->lock);
++	}
+ 
+ 	hrtick_set(rq);
+ 
+@@ -4785,7 +5158,7 @@ recheck:
+ 	/*
+ 	 * Allow unprivileged RT tasks to decrease priority:
+ 	 */
+-	if (!capable(CAP_SYS_NICE)) {
++	if (!capable(CAP_SYS_ADMIN)) {
+ 		if (rt_policy(policy)) {
+ 			unsigned long rlim_rtprio;
+ 
+@@ -5257,10 +5630,15 @@ EXPORT_SYMBOL(yield);
+ void __sched io_schedule(void)
+ {
+ 	struct rq *rq = &__raw_get_cpu_var(runqueues);
++#ifdef CONFIG_VE
++	struct ve_struct *ve = current->ve_task_info.owner_env;
++#endif
+ 
+ 	delayacct_blkio_start();
+ 	atomic_inc(&rq->nr_iowait);
++	ve_nr_iowait_inc(ve, task_cpu(current));
+ 	schedule();
++	ve_nr_iowait_dec(ve, task_cpu(current));
+ 	atomic_dec(&rq->nr_iowait);
+ 	delayacct_blkio_end();
+ }
+@@ -5270,10 +5648,15 @@ long __sched io_schedule_timeout(long timeout)
+ {
+ 	struct rq *rq = &__raw_get_cpu_var(runqueues);
+ 	long ret;
++#ifdef CONFIG_VE
++	struct ve_struct *ve = current->ve_task_info.owner_env;
++#endif
+ 
+ 	delayacct_blkio_start();
+ 	atomic_inc(&rq->nr_iowait);
++	ve_nr_iowait_inc(ve, task_cpu(current));
+ 	ret = schedule_timeout(timeout);
++	ve_nr_iowait_dec(ve, task_cpu(current));
+ 	atomic_dec(&rq->nr_iowait);
+ 	delayacct_blkio_end();
+ 	return ret;
+@@ -5394,17 +5777,7 @@ void sched_show_task(struct task_struct *p)
+ 	state = p->state ? __ffs(p->state) + 1 : 0;
+ 	printk(KERN_INFO "%-13.13s %c", p->comm,
+ 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+-#if BITS_PER_LONG == 32
+-	if (state == TASK_RUNNING)
+-		printk(KERN_CONT " running  ");
+-	else
+-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+-#else
+-	if (state == TASK_RUNNING)
+-		printk(KERN_CONT "  running task    ");
+-	else
+-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+-#endif
++	printk(KERN_CONT " %p ", p);
+ #ifdef CONFIG_DEBUG_STACK_USAGE
+ 	{
+ 		unsigned long *n = end_of_stack(p);
+@@ -5425,13 +5798,13 @@ void show_state_filter(unsigned long state_filter)
+ 
+ #if BITS_PER_LONG == 32
+ 	printk(KERN_INFO
+-		"  task                PC stack   pid father\n");
++		"  task          taskaddr stack   pid father\n");
+ #else
+ 	printk(KERN_INFO
+-		"  task                        PC stack   pid father\n");
++		"  task                  taskaddr stack   pid father\n");
+ #endif
+ 	read_lock(&tasklist_lock);
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		/*
+ 		 * reset the NMI-timeout, listing all files on a slow
+ 		 * console might take alot of time:
+@@ -5439,7 +5812,7 @@ void show_state_filter(unsigned long state_filter)
+ 		touch_nmi_watchdog();
+ 		if (!state_filter || (p->state & state_filter))
+ 			sched_show_task(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 
+ 	touch_all_softlockup_watchdogs();
+ 
+@@ -5795,13 +6168,13 @@ static void migrate_live_tasks(int src_cpu)
+ 
+ 	read_lock(&tasklist_lock);
+ 
+-	do_each_thread(t, p) {
++	do_each_thread_all(t, p) {
+ 		if (p == current)
+ 			continue;
+ 
+ 		if (task_cpu(p) == src_cpu)
+ 			move_task_off_dead_cpu(src_cpu, p);
+-	} while_each_thread(t, p);
++	} while_each_thread_all(t, p);
+ 
+ 	read_unlock(&tasklist_lock);
+ }
+@@ -7753,7 +8126,7 @@ void __init sched_init(void)
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ 		init_task_group.shares = init_task_group_load;
+ 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+-#ifdef CONFIG_CGROUP_SCHED
++#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED)
+ 		/*
+ 		 * How much cpu bandwidth does init_task_group get?
+ 		 *
+@@ -7799,7 +8172,7 @@ void __init sched_init(void)
+ 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+ #ifdef CONFIG_RT_GROUP_SCHED
+ 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+-#ifdef CONFIG_CGROUP_SCHED
++#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED)
+ 		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
+ #elif defined CONFIG_USER_SCHED
+ 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
+@@ -7858,6 +8231,7 @@ void __init sched_init(void)
+ 	 * During early bootup we pretend to be a normal task:
+ 	 */
+ 	current->sched_class = &fair_sched_class;
++	fairsched_init_early();
+ 
+ 	scheduler_running = 1;
+ }
+@@ -7910,7 +8284,7 @@ void normalize_rt_tasks(void)
+ 	struct rq *rq;
+ 
+ 	read_lock_irqsave(&tasklist_lock, flags);
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		/*
+ 		 * Only normalize user tasks:
+ 		 */
+@@ -7941,7 +8315,7 @@ void normalize_rt_tasks(void)
+ 
+ 		__task_rq_unlock(rq);
+ 		spin_unlock(&p->pi_lock);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 
+ 	read_unlock_irqrestore(&tasklist_lock, flags);
+ }
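
The per-VE counters summed by nr_running_ve(), nr_uninterruptible_ve() and
nr_iowait_ve() above are plain per-CPU longs, and the matching increment and
decrement can land on different CPUs, so a single CPU's counter may go
transiently negative; only the sum over the VE's CPU mask is meaningful,
hence the clamp to zero. A minimal stand-alone sketch of that convention
(hypothetical user-space code, not the kernel API):

    #include <stdio.h>

    #define NCPU 4

    /* one counter per CPU, as in VE_CPU_STATS(ve, cpu)->nr_running */
    static long nr_running[NCPU];

    static unsigned long sum_clamped(void)
    {
            long sum = 0;
            int i;

            for (i = 0; i < NCPU; i++)
                    sum += nr_running[i];
            /* per-CPU skew can make the raw sum negative for a moment */
            return (unsigned long)(sum < 0 ? 0 : sum);
    }

    int main(void)
    {
            nr_running[0]--;        /* task dequeued on CPU0 ...       */
            nr_running[1]++;        /* ... after being counted on CPU1 */
            printf("%lu\n", sum_clamped());  /* prints 0, never wraps */
            return 0;
    }
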
+diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
+index 8bb7130..e891f48 100644
+--- a/kernel/sched_debug.c
++++ b/kernel/sched_debug.c
+@@ -101,12 +101,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+ 
+ 	read_lock_irqsave(&tasklist_lock, flags);
+ 
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ 			continue;
+ 
+ 		print_task(m, rq, p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 
+ 	read_unlock_irqrestore(&tasklist_lock, flags);
+ }
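
The do_each_thread_all()/while_each_thread_all() pairs substituted throughout
are the host-wide walk used by debugging, watchdog and migration paths, while
the _ve variants (see the sys_setpriority() hunk further below) restrict
iteration to tasks whose owner_env matches the caller. Conceptually, with
hypothetical minimal types (the real macros walk the kernel thread lists):

    struct task { struct task *next; int ve_id; };

    /* host-wide walk, the do_each_thread_all() case */
    static void walk_all(struct task *head, void (*fn)(struct task *))
    {
            struct task *t;

            for (t = head; t != NULL; t = t->next)
                    fn(t);
    }

    /* container-local walk, the do_each_thread_ve() case */
    static void walk_ve(struct task *head, int ve_id,
                        void (*fn)(struct task *))
    {
            struct task *t;

            for (t = head; t != NULL; t = t->next)
                    if (t->ve_id == ve_id)
                            fn(t);
    }
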
+diff --git a/kernel/signal.c b/kernel/signal.c
+index 6c0958e..fd916a1 100644
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -31,13 +31,32 @@
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+ #include <asm/siginfo.h>
++#include <bc/misc.h>
+ #include "audit.h"	/* audit_signal_info() */
+ 
+ /*
+  * SLAB caches for signal bits.
+  */
+ 
+-static struct kmem_cache *sigqueue_cachep;
++struct kmem_cache *sigqueue_cachep;
++EXPORT_SYMBOL(sigqueue_cachep);
++
++static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t)
++{
++	struct ve_struct *ve;
++
++	/* always allow signals from the kernel */
++	if (info == SEND_SIG_FORCED ||
++		       (!is_si_special(info) && SI_FROMKERNEL(info)))
++		return 0;
++
++	ve = current->ve_task_info.owner_env;
++	if (ve->ve_ns->pid_ns->child_reaper != t)
++		return 0;
++	if (ve_is_super(get_exec_env()))
++		return 0;
++	return !sig_user_defined(t, sig) || sig_kernel_only(sig);
++}
+ 
+ static int __sig_ignored(struct task_struct *t, int sig)
+ {
+@@ -101,7 +120,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+ 
+ #define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
+ 
+-static int recalc_sigpending_tsk(struct task_struct *t)
++int recalc_sigpending_tsk(struct task_struct *t)
+ {
+ 	if (t->signal->group_stop_count > 0 ||
+ 	    PENDING(&t->pending, &t->blocked) ||
+@@ -126,6 +145,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
+ 	if (recalc_sigpending_tsk(t))
+ 		signal_wake_up(t, 0);
+ }
++EXPORT_SYMBOL_GPL(recalc_sigpending_tsk);
+ 
+ void recalc_sigpending(void)
+ {
+@@ -184,8 +204,13 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+ 	atomic_inc(&user->sigpending);
+ 	if (override_rlimit ||
+ 	    atomic_read(&user->sigpending) <=
+-			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
++			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
+ 		q = kmem_cache_alloc(sigqueue_cachep, flags);
++		if (q && ub_siginfo_charge(q, get_task_ub(t))) {
++			kmem_cache_free(sigqueue_cachep, q);
++			q = NULL;
++		}
++	}
+ 	if (unlikely(q == NULL)) {
+ 		atomic_dec(&user->sigpending);
+ 	} else {
+@@ -202,6 +227,7 @@ static void __sigqueue_free(struct sigqueue *q)
+ 		return;
+ 	atomic_dec(&q->user->sigpending);
+ 	free_uid(q->user);
++	ub_siginfo_uncharge(q);
+ 	kmem_cache_free(sigqueue_cachep, q);
+ }
+ 
+@@ -384,7 +410,18 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
+ 			siginfo_t *info)
+ {
+-	int sig = next_signal(pending, mask);
++	int sig = 0;
++
++	/* SIGKILL must have priority, otherwise it is quite easy
++	 * to create an unkillable process by sending a sig < SIGKILL
++	 * to itself */
++	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
++		if (!sigismember(mask, SIGKILL))
++			sig = SIGKILL;
++	}
++
++	if (likely(!sig))
++		sig = next_signal(pending, mask);
+ 
+ 	if (sig) {
+ 		if (current->notifier) {
+@@ -509,6 +546,7 @@ void signal_wake_up(struct task_struct *t, int resume)
+ 	if (!wake_up_state(t, mask))
+ 		kick_process(t);
+ }
++EXPORT_SYMBOL_GPL(signal_wake_up);
+ 
+ /*
+  * Remove signals in mask from the pending set and queue.
+@@ -630,7 +668,7 @@ static int prepare_signal(int sig, struct task_struct *p)
+ 		t = p;
+ 		do {
+ 			rm_from_queue(sigmask(SIGCONT), &t->pending);
+-		} while_each_thread(p, t);
++		} while_each_thread_all(p, t);
+ 	} else if (sig == SIGCONT) {
+ 		unsigned int why;
+ 		/*
+@@ -662,7 +700,7 @@ static int prepare_signal(int sig, struct task_struct *p)
+ 				state |= TASK_INTERRUPTIBLE;
+ 			}
+ 			wake_up_state(t, state);
+-		} while_each_thread(p, t);
++		} while_each_thread_all(p, t);
+ 
+ 		/*
+ 		 * Notify the parent with CLD_CONTINUED if we were stopped.
+@@ -783,7 +821,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
+ 			do {
+ 				sigaddset(&t->pending.signal, SIGKILL);
+ 				signal_wake_up(t, 1);
+-			} while_each_thread(p, t);
++			} while_each_thread_all(p, t);
+ 			return;
+ 		}
+ 	}
+@@ -1019,7 +1057,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+ 	if (!ret && sig) {
+ 		ret = -ESRCH;
+ 		if (lock_task_sighand(p, &flags)) {
+-			ret = __group_send_sig_info(sig, info, p);
++			ret = sig_ve_ignored(sig, info, p) ? 0 :
++				__group_send_sig_info(sig, info, p);
+ 			unlock_task_sighand(p, &flags);
+ 		}
+ 	}
+@@ -1144,7 +1183,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
+ 		int retval = 0, count = 0;
+ 		struct task_struct * p;
+ 
+-		for_each_process(p) {
++		for_each_process_ve(p) {
+ 			if (p->pid > 1 && !same_thread_group(p, current)) {
+ 				int err = group_send_sig_info(sig, info, p);
+ 				++count;
+@@ -1359,6 +1398,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
+ 	BUG_ON(!tsk->ptrace &&
+ 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ 
++#ifdef CONFIG_VE
++	/* Allow to send only SIGCHLD from VE */
++	if (sig != SIGCHLD &&
++			tsk->ve_task_info.owner_env != 
++			tsk->parent->ve_task_info.owner_env)
++		sig = SIGCHLD;
++#endif
++
+ 	info.si_signo = sig;
+ 	info.si_errno = 0;
+ 	/*
+@@ -1630,7 +1677,9 @@ finish_stop(int stop_count)
+ 	}
+ 
+ 	do {
++		set_stop_state(current);
+ 		schedule();
++		clear_stop_state(current);
+ 	} while (try_to_freeze());
+ 	/*
+ 	 * Now we don't run again until continued.
+@@ -1683,6 +1732,7 @@ static int do_signal_stop(int signr)
+ 		sig->group_stop_count = stop_count;
+ 	}
+ 
++	clear_pn_state(current);
+ 	if (stop_count == 0)
+ 		sig->flags = SIGNAL_STOP_STOPPED;
+ 	current->exit_code = sig->group_exit_code;
+@@ -1746,8 +1796,6 @@ relock:
+ 	 * Now that we woke up, it's crucial if we're supposed to be
+ 	 * frozen that we freeze now before running anything substantial.
+ 	 */
+-	try_to_freeze();
+-
+ 	spin_lock_irq(&sighand->siglock);
+ 	/*
+ 	 * Every stopped thread goes here after wakeup. Check to see if
+@@ -2236,7 +2284,8 @@ static int do_tkill(int tgid, int pid, int sig)
+ 		 * signal is private anyway.
+ 		 */
+ 		if (!error && sig && lock_task_sighand(p, &flags)) {
+-			error = specific_send_sig_info(sig, &info, p);
++			if (!sig_ve_ignored(sig, &info, p))
++				error = specific_send_sig_info(sig, &info, p);
+ 			unlock_task_sighand(p, &flags);
+ 		}
+ 	}
+@@ -2592,5 +2641,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+ 
+ void __init signals_init(void)
+ {
+-	sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
++	sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC);
+ }
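
The __dequeue_signal() hunk above makes dequeueing a two-step lookup: take
SIGKILL first whenever it is pending and unblocked, and only then fall back
to the normal lowest-signal-first order. The same rule over a plain bitmask,
as a simplified stand-alone sketch (the real sigset_t is wider than one word):

    #include <stdio.h>

    #define SIGKILL 9

    /* lowest pending unblocked signal, 0 if none (signal n <-> bit n-1) */
    static int next_signal(unsigned long pending, unsigned long blocked)
    {
            unsigned long avail = pending & ~blocked;
            return avail ? __builtin_ctzl(avail) + 1 : 0;
    }

    static int dequeue(unsigned long pending, unsigned long blocked)
    {
            unsigned long kill = 1UL << (SIGKILL - 1);

            /* SIGKILL jumps the queue, so sig < SIGKILL cannot starve it */
            if ((pending & kill) && !(blocked & kill))
                    return SIGKILL;
            return next_signal(pending, blocked);
    }

    int main(void)
    {
            unsigned long pending = (1UL << 0) | (1UL << (SIGKILL - 1));

            printf("%d\n", dequeue(pending, 0));    /* 9, not 1 */
            return 0;
    }
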
+diff --git a/kernel/softirq.c b/kernel/softirq.c
+index 36e0617..a74d919 100644
+--- a/kernel/softirq.c
++++ b/kernel/softirq.c
+@@ -22,6 +22,8 @@
+ #include <linux/smp.h>
+ #include <linux/tick.h>
+ 
++#include <bc/beancounter.h>
++
+ #include <asm/irq.h>
+ /*
+    - No shared variables, all the data are CPU local.
+@@ -209,10 +211,14 @@ EXPORT_SYMBOL(local_bh_enable_ip);
+ 
+ asmlinkage void __do_softirq(void)
+ {
++	struct user_beancounter *ub;
+ 	struct softirq_action *h;
+ 	__u32 pending;
+ 	int max_restart = MAX_SOFTIRQ_RESTART;
+ 	int cpu;
++	struct ve_struct *envid;
++
++	envid = set_exec_env(get_ve0());
+ 
+ 	pending = local_softirq_pending();
+ 	account_system_vtime(current);
+@@ -229,6 +235,7 @@ restart:
+ 
+ 	h = softirq_vec;
+ 
++	ub = set_exec_ub(get_ub0());
+ 	do {
+ 		if (pending & 1) {
+ 			h->action(h);
+@@ -237,6 +244,7 @@ restart:
+ 		h++;
+ 		pending >>= 1;
+ 	} while (pending);
++	(void)set_exec_ub(ub);
+ 
+ 	local_irq_disable();
+ 
+@@ -250,6 +258,7 @@ restart:
+ 	trace_softirq_exit();
+ 
+ 	account_system_vtime(current);
++	(void)set_exec_env(envid);
+ 	_local_bh_enable();
+ }
+ 
+@@ -305,6 +314,7 @@ void irq_exit(void)
+ {
+ 	account_system_vtime(current);
+ 	trace_hardirq_exit();
++	restore_context();
+ 	sub_preempt_count(IRQ_EXIT_OFFSET);
+ 	if (!in_interrupt() && local_softirq_pending())
+ 		invoke_softirq();
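
__do_softirq() above follows a save/override/restore discipline:
set_exec_env() and set_exec_ub() return the previous context, the handlers
run against ve0/ub0, and the saved values are put back before returning.
The pattern in miniature, as a hypothetical single-threaded sketch (the
kernel keeps this state per-CPU):

    struct ctx { int id; };

    static struct ctx *cur;                 /* per-CPU in the kernel */

    static struct ctx *set_ctx(struct ctx *new_ctx)
    {
            struct ctx *old = cur;          /* caller keeps this to restore */

            cur = new_ctx;
            return old;
    }

    static void run_in_root(struct ctx *root, void (*handler)(void))
    {
            struct ctx *saved = set_ctx(root);

            handler();                  /* always executes against root */
            (void)set_ctx(saved);       /* restore, as __do_softirq() does */
    }
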
+diff --git a/kernel/softlockup.c b/kernel/softlockup.c
+index a272d78..5332252 100644
+--- a/kernel/softlockup.c
++++ b/kernel/softlockup.c
+@@ -199,12 +199,12 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
+ 		return;
+ 
+ 	read_lock(&tasklist_lock);
+-	do_each_thread(g, t) {
++	do_each_thread_all(g, t) {
+ 		if (!--max_count)
+ 			goto unlock;
+ 		if (t->state & TASK_UNINTERRUPTIBLE)
+ 			check_hung_task(t, now);
+-	} while_each_thread(g, t);
++	} while_each_thread_all(g, t);
+  unlock:
+ 	read_unlock(&tasklist_lock);
+ }
+diff --git a/kernel/sys.c b/kernel/sys.c
+index 14e9728..34a0c70 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -10,6 +10,7 @@
+ #include <linux/mman.h>
+ #include <linux/smp_lock.h>
+ #include <linux/notifier.h>
++#include <linux/virtinfo.h>
+ #include <linux/reboot.h>
+ #include <linux/prctl.h>
+ #include <linux/highuid.h>
+@@ -33,6 +34,7 @@
+ #include <linux/task_io_accounting_ops.h>
+ #include <linux/seccomp.h>
+ #include <linux/cpu.h>
++#include <linux/pid_namespace.h>
+ 
+ #include <linux/compat.h>
+ #include <linux/syscalls.h>
+@@ -112,6 +114,102 @@ EXPORT_SYMBOL(cad_pid);
+ 
+ void (*pm_power_off_prepare)(void);
+ 
++DECLARE_MUTEX(virtinfo_sem);
++EXPORT_SYMBOL(virtinfo_sem);
++static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
++
++void __virtinfo_notifier_register(int type, struct vnotifier_block *nb)
++{
++	struct vnotifier_block **p;
++
++	for (p = &virtinfo_chain[type];
++	     *p != NULL && nb->priority < (*p)->priority;
++	     p = &(*p)->next);
++	nb->next = *p;
++	smp_wmb();
++	*p = nb;
++}
++
++EXPORT_SYMBOL(__virtinfo_notifier_register);
++
++void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
++{
++	down(&virtinfo_sem);
++	__virtinfo_notifier_register(type, nb);
++	up(&virtinfo_sem);
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_register);
++
++struct virtinfo_cnt_struct {
++	volatile unsigned long exit[NR_CPUS];
++	volatile unsigned long entry;
++};
++static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
++
++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
++{
++	struct vnotifier_block **p;
++	int entry_cpu, exit_cpu;
++	unsigned long cnt, ent;
++
++	down(&virtinfo_sem);
++	for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
++	*p = nb->next;
++	smp_mb();
++
++	for_each_cpu_mask(entry_cpu, cpu_possible_map) {
++		while (1) {
++			cnt = 0;
++			for_each_cpu_mask(exit_cpu, cpu_possible_map)
++				cnt +=
++				    per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
++			smp_rmb();
++			ent = per_cpu(virtcnt, entry_cpu).entry;
++			if (cnt == ent)
++				break;
++			__set_current_state(TASK_UNINTERRUPTIBLE);
++			schedule_timeout(HZ / 100);
++		}
++	}
++	up(&virtinfo_sem);
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_unregister);
++
++int virtinfo_notifier_call(int type, unsigned long n, void *data)
++{
++	int ret;
++	int entry_cpu, exit_cpu;
++	struct vnotifier_block *nb;
++
++	entry_cpu = get_cpu();
++	per_cpu(virtcnt, entry_cpu).entry++;
++	smp_wmb();
++	put_cpu();
++
++	nb = virtinfo_chain[type];
++	ret = NOTIFY_DONE;
++	while (nb)
++	{
++		ret = nb->notifier_call(nb, n, data, ret);
++		if (ret & NOTIFY_STOP_MASK) {
++			ret &= ~NOTIFY_STOP_MASK;
++			break;
++		}
++		nb = nb->next;
++	}
++
++	exit_cpu = get_cpu();
++	smp_wmb();
++	per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
++	put_cpu();
++
++	return ret;
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_call);
++
+ static int set_one_prio(struct task_struct *p, int niceval, int error)
+ {
+ 	int no_nice;
+@@ -181,10 +279,10 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
+ 				if ((who != current->uid) && !(user = find_user(who)))
+ 					goto out_unlock;	/* No processes for this user */
+ 
+-			do_each_thread(g, p)
++			do_each_thread_ve(g, p)
+ 				if (p->uid == who)
+ 					error = set_one_prio(p, niceval, error);
+-			while_each_thread(g, p);
++			while_each_thread_ve(g, p);
+ 			if (who != current->uid)
+ 				free_uid(user);		/* For find_user() */
+ 			break;
+@@ -243,13 +341,13 @@ asmlinkage long sys_getpriority(int which, int who)
+ 				if ((who != current->uid) && !(user = find_user(who)))
+ 					goto out_unlock;	/* No processes for this user */
+ 
+-			do_each_thread(g, p)
++			do_each_thread_ve(g, p)
+ 				if (p->uid == who) {
+ 					niceval = 20 - task_nice(p);
+ 					if (niceval > retval)
+ 						retval = niceval;
+ 				}
+-			while_each_thread(g, p);
++			while_each_thread_ve(g, p);
+ 			if (who != current->uid)
+ 				free_uid(user);		/* for find_user() */
+ 			break;
+@@ -383,6 +481,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
+ 	                magic2 != LINUX_REBOOT_MAGIC2C))
+ 		return -EINVAL;
+ 
++#ifdef CONFIG_VE
++	if (!ve_is_super(get_exec_env()))
++		switch (cmd) {
++		case LINUX_REBOOT_CMD_RESTART:
++		case LINUX_REBOOT_CMD_HALT:
++		case LINUX_REBOOT_CMD_POWER_OFF:
++		case LINUX_REBOOT_CMD_RESTART2:
++			force_sig(SIGKILL,
++				get_exec_env()->ve_ns->pid_ns->child_reaper);
++
++		case LINUX_REBOOT_CMD_CAD_ON:
++		case LINUX_REBOOT_CMD_CAD_OFF:
++			return 0;
++
++		default:
++			return -EINVAL;
++		}
++#endif
++
+ 	/* Instead of trying to make the power_off code look like
+ 	 * halt when pm_power_off is not set do it the easy way.
+ 	 */
+@@ -564,7 +681,7 @@ asmlinkage long sys_setgid(gid_t gid)
+ 	return 0;
+ }
+   
+-static int set_user(uid_t new_ruid, int dumpclear)
++int set_user(uid_t new_ruid, int dumpclear)
+ {
+ 	struct user_struct *new_user;
+ 
+@@ -868,8 +985,27 @@ asmlinkage long sys_setfsgid(gid_t gid)
+ 	return old_fsgid;
+ }
+ 
++#ifdef CONFIG_VE
++unsigned long long ve_relative_clock(struct timespec * ts)
++{
++	unsigned long long offset = 0;
++
++	if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec ||
++	    (ts->tv_sec == get_exec_env()->start_timespec.tv_sec &&
++	     ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec))
++		offset = (unsigned long long)(ts->tv_sec -
++			get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC
++			+ ts->tv_nsec -	get_exec_env()->start_timespec.tv_nsec;
++	return nsec_to_clock_t(offset);
++}
++#endif
++
+ asmlinkage long sys_times(struct tms __user * tbuf)
+ {
++#ifdef CONFIG_VE
++	struct timespec now;
++#endif
++
+ 	/*
+ 	 *	In the SMP world we might just be unlucky and have one of
+ 	 *	the times increment as we use it. Since the value is an
+@@ -903,7 +1039,13 @@ asmlinkage long sys_times(struct tms __user * tbuf)
+ 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+ 			return -EFAULT;
+ 	}
++#ifndef CONFIG_VE
+ 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
++#else
++	/* Compare to calculation in fs/proc/array.c */
++	do_posix_clock_monotonic_gettime(&now);
++	return ve_relative_clock(&now);
++#endif
+ }
+ 
+ /*
+@@ -1077,6 +1219,7 @@ asmlinkage long sys_setsid(void)
+ 
+ 	spin_lock(&group_leader->sighand->siglock);
+ 	group_leader->signal->tty = NULL;
++	group_leader->signal->tty_old_pgrp = NULL;
+ 	spin_unlock(&group_leader->sighand->siglock);
+ 
+ 	err = session;
+@@ -1361,7 +1504,7 @@ asmlinkage long sys_sethostname(char __user *name, int len)
+ 	int errno;
+ 	char tmp[__NEW_UTS_LEN];
+ 
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	if (len < 0 || len > __NEW_UTS_LEN)
+ 		return -EINVAL;
+@@ -1406,7 +1549,7 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
+ 	int errno;
+ 	char tmp[__NEW_UTS_LEN];
+ 
+-	if (!capable(CAP_SYS_ADMIN))
++	if (!capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	if (len < 0 || len > __NEW_UTS_LEN)
+ 		return -EINVAL;
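
The virtinfo chain above keeps blocks sorted by descending priority, threads
the previous return value into each callback, and stops early on
NOTIFY_STOP_MASK; unregistration then spins until the entry/exit counters
agree, so no CPU is still inside the chain. A reduced sketch of the calling
convention only (hypothetical names, no locking and none of the per-CPU
accounting):

    #define NOTIFY_DONE         0
    #define NOTIFY_STOP_MASK    0x8000

    struct vblock {
            struct vblock *next;
            int priority;
            int (*call)(struct vblock *nb, unsigned long n,
                        void *data, int old_ret);
    };

    static struct vblock *chain;

    static void vregister(struct vblock *nb)
    {
            struct vblock **p;

            /* insert keeping descending priority order */
            for (p = &chain; *p && nb->priority < (*p)->priority;
                 p = &(*p)->next)
                    ;
            nb->next = *p;
            *p = nb;
    }

    static int vcall(unsigned long n, void *data)
    {
            struct vblock *nb;
            int ret = NOTIFY_DONE;

            for (nb = chain; nb; nb = nb->next) {
                    ret = nb->call(nb, n, data, ret); /* prev ret threads in */
                    if (ret & NOTIFY_STOP_MASK)
                            return ret & ~NOTIFY_STOP_MASK;
            }
            return ret;
    }
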
+diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
+index 5b9b467..7717f4d 100644
+--- a/kernel/sys_ni.c
++++ b/kernel/sys_ni.c
+@@ -161,3 +161,15 @@ cond_syscall(sys_timerfd_gettime);
+ cond_syscall(compat_sys_timerfd_settime);
+ cond_syscall(compat_sys_timerfd_gettime);
+ cond_syscall(sys_eventfd);
++cond_syscall(sys_getluid);
++cond_syscall(sys_setluid);
++cond_syscall(sys_setublimit);
++cond_syscall(sys_ubstat);
++
++/* fairsched compat */
++cond_syscall(sys_fairsched_mknod);
++cond_syscall(sys_fairsched_rmnod);
++cond_syscall(sys_fairsched_mvpr);
++cond_syscall(sys_fairsched_vcpus);
++cond_syscall(sys_fairsched_chwt);
++cond_syscall(sys_fairsched_rate);
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index 2911665..3692411 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -80,6 +80,7 @@ extern int percpu_pagelist_fraction;
+ extern int compat_log;
+ extern int maps_protect;
+ extern int sysctl_stat_interval;
++extern int ve_area_access_check; /* fs/namei.c */
+ extern int latencytop_enabled;
+ extern int sysctl_nr_open_min, sysctl_nr_open_max;
+ 
+@@ -106,6 +107,13 @@ static int min_percpu_pagelist_fract = 8;
+ 
+ static int ngroups_max = NGROUPS_MAX;
+ 
++int ve_allow_kthreads = 1;
++EXPORT_SYMBOL(ve_allow_kthreads);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++extern int sysrq_key_scancode;
++#endif
++
+ #ifdef CONFIG_KMOD
+ extern char modprobe_path[];
+ #endif
+@@ -119,6 +127,8 @@ extern int stop_a_enabled;
+ extern int scons_pwroff;
+ #endif
+ 
++extern int alloc_fail_warn;
++
+ #ifdef __hppa__
+ extern int pwrsw_enabled;
+ extern int unaligned_enabled;
+@@ -133,6 +143,7 @@ extern int spin_retry;
+ #endif
+ 
+ extern int sysctl_hz_timer;
++int decode_call_traces = 1;
+ 
+ #ifdef CONFIG_BSD_PROCESS_ACCT
+ extern int acct_parm[];
+@@ -141,6 +152,10 @@ extern int acct_parm[];
+ #ifdef CONFIG_IA64
+ extern int no_unaligned_warning;
+ #endif
++#ifdef CONFIG_VE
++int glob_ve_meminfo = 0;
++EXPORT_SYMBOL(glob_ve_meminfo);
++#endif
+ 
+ #ifdef CONFIG_RT_MUTEXES
+ extern int max_lock_depth;
+@@ -160,9 +175,59 @@ static struct ctl_table_header root_table_header = {
+ 	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+ 	.root = &sysctl_table_root,
+ };
++
++#ifdef CONFIG_VE
++static LIST_HEAD(empty_list);
++
++static struct list_head *sysctl_default_lookup(struct ctl_table_root *r,
++		struct nsproxy *namespaces)
++{
++	if (ve_is_super(get_exec_env()))
++		return &r->header_list;
++
++	BUG_ON(!list_empty(&empty_list));
++	return &empty_list;
++}
++
++/*
++ * default root:
++ * all new tables go to this one by default
++ * visible rw in ve0 only
++ */
++static struct ctl_table_root sysctl_default_root = {
++	.header_list	= LIST_HEAD_INIT(sysctl_default_root.header_list),
++	.lookup		= sysctl_default_lookup,
++};
++
++/*
++ * virtual root:
++ * visible rw everywhere (glob 1)
++ */
++static struct ctl_table_root sysctl_virt_root = {
++	.header_list	= LIST_HEAD_INIT(sysctl_virt_root.header_list),
++};
++
++static int sysctl_root_perms(struct ctl_table_root *root,
++			struct nsproxy *namespaces, struct ctl_table *table)
++{
++	if (ve_is_super(get_exec_env()))
++		return table->mode;
++	else
++		return table->mode & ~0222;
++}
++#else
++#define sysctl_default_root	sysctl_table_root
++#define sysctl_root_perms	NULL
++#endif
++
++/*
++ * classical root:
++ * visible ro in ve and rw in ve0 (glob 0 && root_table_header)
++ */
+ static struct ctl_table_root sysctl_table_root = {
+ 	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+ 	.header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
++	.permissions = sysctl_root_perms,
+ };
+ 
+ static struct ctl_table kern_table[];
+@@ -429,6 +494,20 @@ static struct ctl_table kern_table[] = {
+ 		.proc_handler	= &proc_dointvec,
+ 	},
+ #endif
++	{
++		.procname	= "silence-level",
++		.data		= &console_silence_loglevel,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{
++		.procname	= "alloc_fail_warn",
++		.data		= &alloc_fail_warn,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
+ #ifdef __hppa__
+ 	{
+ 		.ctl_name	= KERN_HPPA_PWRSW,
+@@ -593,6 +672,24 @@ static struct ctl_table kern_table[] = {
+ 		.extra1		= &pid_max_min,
+ 		.extra2		= &pid_max_max,
+ 	},
++#ifdef CONFIG_VE
++	{
++		.procname	= "ve_meminfo",
++		.data		= &glob_ve_meminfo,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= &proc_dointvec,
++	},
++#endif
++#ifdef CONFIG_MAGIC_SYSRQ
++	{
++		.procname	= "sysrq-key",
++		.data		= &sysrq_key_scancode,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++#endif
+ 	{
+ 		.ctl_name	= KERN_PANIC_ON_OOPS,
+ 		.procname	= "panic_on_oops",
+@@ -1140,6 +1237,21 @@ static struct ctl_table vm_table[] = {
+ 		.extra2		= &one,
+ 	},
+ #endif
++	{
++		.procname	= "vsyscall",
++		.data		= &sysctl_at_vsyscall,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= &proc_dointvec,
++	},
++	{
++		.ctl_name	= CTL_UNNUMBERED,
++		.procname	= "odirect_enable",
++		.data		= &odirect_enable,
++		.maxlen		= sizeof(int),
++		.mode           = 0644,
++		.proc_handler   = proc_dointvec,
++	},
+ /*
+  * NOTE: do not add new entries to this table unless you have read
+  * Documentation/sysctl/ctl_unnumbered.txt
+@@ -1302,6 +1414,13 @@ static struct ctl_table fs_table[] = {
+ };
+ 
+ static struct ctl_table debug_table[] = {
++	{
++		.procname	= "decode_call_traces",
++		.data		= &decode_call_traces,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
+ #if defined(CONFIG_X86) || defined(CONFIG_PPC)
+ 	{
+ 		.ctl_name	= CTL_UNNUMBERED,
+@@ -1598,6 +1717,10 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
+ 
+ static __init int sysctl_init(void)
+ {
++#ifdef CONFIG_VE
++	register_sysctl_root(&sysctl_default_root);
++	register_sysctl_root(&sysctl_virt_root);
++#endif
+ 	sysctl_set_parent(NULL, root_table);
+ #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+ 	{
+@@ -1758,10 +1881,18 @@ struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ 						struct ctl_table *table)
+ {
+-	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
++	return __register_sysctl_paths(&sysctl_default_root, current->nsproxy,
+ 					path, table);
+ }
+ 
++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
++		struct ctl_table *table, int virtual_handler)
++{
++	return __register_sysctl_paths(virtual_handler ? 
++			&sysctl_virt_root : &sysctl_table_root,
++			current->nsproxy, path, table);
++}
++
+ /**
+  * register_sysctl_table - register a sysctl table hierarchy
+  * @table: the top-level table structure
+@@ -1778,6 +1909,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+ 	return register_sysctl_paths(null_path, table);
+ }
+ 
++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
++		int virtual_handler)
++{
++	static const struct ctl_path null_path[] = { {} };
++
++	return register_sysctl_glob_paths(null_path, table, virtual_handler);
++}
++
+ /**
+  * unregister_sysctl_table - unregister a sysctl table hierarchy
+  * @header: the header returned from register_sysctl_table
+@@ -1810,6 +1949,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ 	return NULL;
+ }
+ 
++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
++		int vh)
++{
++	return NULL;
++}
++
++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
++						struct ctl_table *table, int vh)
++{
++	return NULL;
++}
++
+ void unregister_sysctl_table(struct ctl_table_header * table)
+ {
+ }
+@@ -2829,6 +2980,57 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
+ 	return 0;
+ }
+ 
++#ifdef CONFIG_PID_NS
++#include <linux/pid_namespace.h>
++
++static int proc_pid_ns_hide_child(struct ctl_table *table, int write,
++		struct file *filp, void __user *buffer,
++		size_t *lenp, loff_t *ppos)
++{
++	int tmp, res;
++
++	tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 1 : 0;
++
++	res = __do_proc_dointvec(&tmp, table, write, filp, buffer,
++			       lenp, ppos, NULL, NULL);
++	if (res || !write)
++		return res;
++
++	if (tmp)
++		current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD;
++	else
++		current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD;
++	return 0;
++}
++
++static struct ctl_table pid_ns_kern_table[] = {
++	{
++		.procname	= "pid_ns_hide_child",
++		.maxlen		= sizeof(int),
++		.mode		= 0600,
++		.proc_handler	= proc_pid_ns_hide_child,
++	},
++	{}
++};
++
++static struct ctl_table pid_ns_root_table[] = {
++	{
++		.ctl_name	= CTL_KERN,
++		.procname	= "kernel",
++		.mode		= 0555,
++		.child		= pid_ns_kern_table,
++	},
++	{}
++};
++
++static __init int pid_ns_sysctl_init(void)
++{
++	register_sysctl_table(pid_ns_root_table);
++	return 0;
++}
++postcore_initcall(pid_ns_sysctl_init);
++#endif /* CONFIG_PID_NS */
++
+ /*
+  * No sense putting this after each symbol definition, twice,
+  * exception granted :-)
+@@ -2842,7 +3044,9 @@ EXPORT_SYMBOL(proc_dostring);
+ EXPORT_SYMBOL(proc_doulongvec_minmax);
+ EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+ EXPORT_SYMBOL(register_sysctl_table);
++EXPORT_SYMBOL(register_sysctl_glob_table);
+ EXPORT_SYMBOL(register_sysctl_paths);
++EXPORT_SYMBOL(register_sysctl_glob_paths);
+ EXPORT_SYMBOL(sysctl_intvec);
+ EXPORT_SYMBOL(sysctl_jiffies);
+ EXPORT_SYMBOL(sysctl_ms_jiffies);
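
With the three roots above, visibility is chosen by registration API:
register_sysctl_table()/register_sysctl_paths() now land in the default root
(rw in ve0 only, invisible elsewhere), while register_sysctl_glob_table(table, 1)
lands in the virtual root and is rw inside every VE. A hedged usage sketch
for a hypothetical module knob (assumed names marked in comments):

    #include <linux/sysctl.h>
    #include <linux/errno.h>
    #include <linux/init.h>

    static int my_knob;                     /* hypothetical tunable */

    static struct ctl_table my_table[] = {
            {
                    .procname       = "my_knob",
                    .data           = &my_knob,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &proc_dointvec,
            },
            {}
    };

    static struct ctl_table_header *my_hdr;

    static int __init my_sysctl_init(void)
    {
            /* 1 selects the virtual root: rw inside every VE */
            my_hdr = register_sysctl_glob_table(my_table, 1);
            return my_hdr ? 0 : -ENOMEM;
    }
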
+diff --git a/kernel/taskstats.c b/kernel/taskstats.c
+index 4a23517..590d37c 100644
+--- a/kernel/taskstats.c
++++ b/kernel/taskstats.c
+@@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
+ 
+ 		stats->nvcsw += tsk->nvcsw;
+ 		stats->nivcsw += tsk->nivcsw;
+-	} while_each_thread(first, tsk);
++	} while_each_thread_all(first, tsk);
+ 
+ 	unlock_task_sighand(first, &flags);
+ 	rc = 0;
+diff --git a/kernel/time.c b/kernel/time.c
+index 6a08660..c986346 100644
+--- a/kernel/time.c
++++ b/kernel/time.c
+@@ -601,10 +601,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t);
+ unsigned long clock_t_to_jiffies(unsigned long x)
+ {
+ #if (HZ % USER_HZ)==0
++	WARN_ON((long)x < 0);
+ 	if (x >= ~0UL / (HZ / USER_HZ))
+ 		return ~0UL;
+ 	return x * (HZ / USER_HZ);
+ #else
++	WARN_ON((long)x < 0);
+ 	/* Don't worry about loss of precision here .. */
+ 	if (x >= ~0UL / HZ * USER_HZ)
+ 		return ~0UL;
+@@ -617,6 +619,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
+ 
+ u64 jiffies_64_to_clock_t(u64 x)
+ {
++	WARN_ON((s64)x < 0);
+ #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+ # if HZ < USER_HZ
+ 	x = div_u64(x * USER_HZ, HZ);
+@@ -639,6 +642,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
+ 
+ u64 nsec_to_clock_t(u64 x)
+ {
++	WARN_ON((s64)x < 0);
+ #if (NSEC_PER_SEC % USER_HZ) == 0
+ 	return div_u64(x, NSEC_PER_SEC / USER_HZ);
+ #elif (USER_HZ % 512) == 0
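
The added WARN_ON((s64)x < 0) checks catch callers that underflow an unsigned
duration before converting it, e.g. a start stamp later than "now" in uptime
math such as ve_relative_clock() earlier in this patch. For the common
NSEC_PER_SEC % USER_HZ == 0 case the conversion itself is a single divide;
a worked stand-alone example with USER_HZ = 100:

    #include <stdio.h>

    #define NSEC_PER_SEC    1000000000ULL
    #define USER_HZ         100ULL          /* clock_t ticks per second */

    static unsigned long long nsec_to_clock_t(unsigned long long x)
    {
            return x / (NSEC_PER_SEC / USER_HZ);    /* one tick = 10^7 ns */
    }

    int main(void)
    {
            /* 2.5 s of container uptime -> 250 ticks */
            printf("%llu\n", nsec_to_clock_t(2500000000ULL));
            return 0;
    }
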
+diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
+index e91c29f..3db0c59 100644
+--- a/kernel/time/timekeeping.c
++++ b/kernel/time/timekeeping.c
+@@ -43,6 +43,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+  * used instead.
+  */
+ struct timespec xtime __attribute__ ((aligned (16)));
++EXPORT_SYMBOL_GPL(xtime);
+ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+ static unsigned long total_sleep_time;		/* seconds */
+ 
+diff --git a/kernel/timer.c b/kernel/timer.c
+index ceacc66..83d6963 100644
+--- a/kernel/timer.c
++++ b/kernel/timer.c
+@@ -37,6 +37,8 @@
+ #include <linux/delay.h>
+ #include <linux/tick.h>
+ #include <linux/kallsyms.h>
++#include <linux/virtinfo.h>
++#include <linux/ve_proto.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -795,7 +797,11 @@ static inline void __run_timers(struct tvec_base *base)
+ 			spin_unlock_irq(&base->lock);
+ 			{
+ 				int preempt_count = preempt_count();
++				struct ve_struct *ve;
++
++				ve = set_exec_env(get_ve0());
+ 				fn(data);
++				(void)set_exec_env(ve);
+ 				if (preempt_count != preempt_count()) {
+ 					printk(KERN_ERR "huh, entered %p "
+ 					       "with preempt_count %08x, exited"
+@@ -1014,6 +1020,37 @@ EXPORT_SYMBOL(avenrun);
+  * calc_load - given tick count, update the avenrun load estimates.
+  * This is called while holding a write_lock on xtime_lock.
+  */
++
++
++#ifdef CONFIG_VE
++static void calc_load_ve(void)
++{
++	unsigned long flags, nr_unint, nr_active;
++	struct ve_struct *ve;
++
++	read_lock(&ve_list_lock);
++	for_each_ve(ve) {
++		nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve);
++		nr_active *= FIXED_1;
++
++		CALC_LOAD(ve->avenrun[0], EXP_1, nr_active);
++		CALC_LOAD(ve->avenrun[1], EXP_5, nr_active);
++		CALC_LOAD(ve->avenrun[2], EXP_15, nr_active);
++	}
++	read_unlock(&ve_list_lock);
++
++	nr_unint = nr_uninterruptible() * FIXED_1;
++	spin_lock_irqsave(&kstat_glb_lock, flags);
++	CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
++	CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
++	CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
++	spin_unlock_irqrestore(&kstat_glb_lock, flags);
++
++}
++#else
++#define calc_load_ve()	do { } while (0)
++#endif
++
+ static inline void calc_load(unsigned long ticks)
+ {
+ 	unsigned long active_tasks; /* fixed-point */
+@@ -1026,6 +1063,7 @@ static inline void calc_load(unsigned long ticks)
+ 			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ 			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ 			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
++			calc_load_ve();
+ 			count += LOAD_FREQ;
+ 		} while (count < 0);
+ 	}
+@@ -1275,11 +1313,12 @@ int do_sysinfo(struct sysinfo *info)
+ 	unsigned long mem_total, sav_total;
+ 	unsigned int mem_unit, bitcount;
+ 	unsigned long seq;
++	unsigned long *__avenrun;
++	struct timespec tp;
+ 
+ 	memset(info, 0, sizeof(struct sysinfo));
+ 
+ 	do {
+-		struct timespec tp;
+ 		seq = read_seqbegin(&xtime_lock);
+ 
+ 		/*
+@@ -1297,18 +1336,34 @@ int do_sysinfo(struct sysinfo *info)
+ 			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
+ 			tp.tv_sec++;
+ 		}
+-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+-
+-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
++	} while (read_seqretry(&xtime_lock, seq));
+ 
++	if (ve_is_super(get_exec_env())) {
++		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
++		__avenrun = &avenrun[0];
+ 		info->procs = nr_threads;
+-	} while (read_seqretry(&xtime_lock, seq));
++	}
++#ifdef CONFIG_VE
++	else {
++		struct ve_struct *ve;
++		ve = get_exec_env();
++		__avenrun = &ve->avenrun[0];
++		info->procs = atomic_read(&ve->pcounter);
++		info->uptime = tp.tv_sec - ve->start_timespec.tv_sec;
++	}
++#endif
++	info->loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
++	info->loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
++	info->loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ 
+ 	si_meminfo(info);
+ 	si_swapinfo(info);
+ 
++#ifdef CONFIG_BEANCOUNTERS
++	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info)
++			& NOTIFY_FAIL)
++		return -ENOMSG;
++#endif
+ 	/*
+ 	 * If the sum of all the available memory (i.e. ram + swap)
+ 	 * is less than can be stored in a 32 bit unsigned long then
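
calc_load_ve() above reuses the stock CALC_LOAD fixed-point recurrence,
load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT, feeding
nr_running_ve() + nr_uninterruptible_ve() as the active count every
LOAD_FREQ (5 s). A stand-alone worked example with the kernel's constants
(EXP_1 = 1884 is the 1-minute decay factor):

    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fixed-point precision */
    #define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point */
    #define EXP_1   1884UL                  /* 1/exp(5s/1min) in fixed point */

    #define CALC_LOAD(load, exp, n) \
            (load) = ((load) * (exp) + (n) * (FIXED_1 - (exp))) >> FSHIFT

    int main(void)
    {
            unsigned long avenrun = 0;
            unsigned long nr_active = 3 * FIXED_1;  /* 3 runnable tasks */
            int i;

            for (i = 0; i < 24; i++)        /* 24 x 5 s = 2 min at load 3 */
                    CALC_LOAD(avenrun, EXP_1, nr_active);

            /* prints roughly "load1 ~ 2.59", converging toward 3.00 */
            printf("load1 ~ %lu.%02lu\n", avenrun >> FSHIFT,
                   ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }
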
+diff --git a/kernel/user.c b/kernel/user.c
+index 865ecf5..b1139b3 100644
+--- a/kernel/user.c
++++ b/kernel/user.c
+@@ -383,6 +383,7 @@ void free_uid(struct user_struct *up)
+ 	else
+ 		local_irq_restore(flags);
+ }
++EXPORT_SYMBOL_GPL(free_uid);
+ 
+ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
+ {
+@@ -447,6 +449,7 @@ out_unlock:
+ 	uids_mutex_unlock();
+ 	return NULL;
+ }
++EXPORT_SYMBOL_GPL(alloc_uid);
+ 
+ void switch_uid(struct user_struct *new_user)
+ {
+@@ -477,6 +480,7 @@ void switch_uid(struct user_struct *new_user)
+ 	free_uid(old_user);
+ 	suid_keys(current);
+ }
++EXPORT_SYMBOL_GPL(switch_uid);
+ 
+ #ifdef CONFIG_USER_NS
+ void release_uids(struct user_namespace *ns)
+@@ -510,7 +514,7 @@ static int __init uid_cache_init(void)
+ 	int n;
+ 
+ 	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ 
+ 	for(n = 0; n < UIDHASH_SZ; ++n)
+ 		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
+diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
+index fe3a56c..22d14c2 100644
+--- a/kernel/utsname_sysctl.c
++++ b/kernel/utsname_sysctl.c
+@@ -27,6 +27,10 @@ static void *get_uts(ctl_table *table, int write)
+ 		down_read(&uts_sem);
+ 	else
+ 		down_write(&uts_sem);
++
++	if (strcmp(table->procname, "virt_osrelease") == 0)
++		return virt_utsname.release;
++
+ 	return which;
+ }
+ 
+@@ -128,19 +132,27 @@ static struct ctl_table uts_kern_table[] = {
+ 	{}
+ };
+ 
+-static struct ctl_table uts_root_table[] = {
++static struct ctl_table uts_virt_osrelease_table[] = {
+ 	{
+-		.ctl_name	= CTL_KERN,
+-		.procname	= "kernel",
+-		.mode		= 0555,
+-		.child		= uts_kern_table,
++		.procname	= "virt_osrelease",
++		.data		= virt_utsname.release,
++		.maxlen		= sizeof(virt_utsname.release),
++		.mode		= 0644,
++		.proc_handler	= &proc_do_uts_string,
++		.strategy	= sysctl_uts_string,
+ 	},
+ 	{}
+ };
+ 
++static struct ctl_path uts_path[] = {
++	{ .ctl_name = CTL_KERN, .procname = "kernel", },
++	{ }
++};
++
+ static int __init utsname_sysctl_init(void)
+ {
+-	register_sysctl_table(uts_root_table);
++	register_sysctl_glob_paths(uts_path, uts_kern_table, 1);
++	register_sysctl_paths(uts_path, uts_virt_osrelease_table);
+ 	return 0;
+ }
+ 
+diff --git a/kernel/ve/Makefile b/kernel/ve/Makefile
+new file mode 100644
+index 0000000..9d60161
+--- /dev/null
++++ b/kernel/ve/Makefile
+@@ -0,0 +1,16 @@
++#
++#
++#  kernel/ve/Makefile
++#
++#  Copyright (C) 2000-2005  SWsoft
++#  All rights reserved.
++#
++#  Licensing governed by "linux/COPYING.SWsoft" file.
++
++obj-$(CONFIG_VE) = ve.o veowner.o hooks.o
++obj-$(CONFIG_VZ_WDOG) += vzwdog.o
++obj-$(CONFIG_VE_CALLS) += vzmon.o
++
++vzmon-objs = vecalls.o
++
++obj-$(CONFIG_VZ_DEV) += vzdev.o
+diff --git a/kernel/ve/hooks.c b/kernel/ve/hooks.c
+new file mode 100644
+index 0000000..1b82c35
+--- /dev/null
++++ b/kernel/ve/hooks.c
+@@ -0,0 +1,114 @@
++/*
++ *  linux/kernel/ve/hooks.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/ve.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/ve_proto.h>
++#include <linux/module.h>
++
++static struct list_head ve_hooks[VE_MAX_CHAINS];
++static DECLARE_RWSEM(ve_hook_sem);
++
++void ve_hook_register(int chain, struct ve_hook *vh)
++{
++	struct list_head *lh;
++	struct ve_hook *tmp;
++
++	BUG_ON(chain > VE_MAX_CHAINS);
++
++	down_write(&ve_hook_sem);
++	list_for_each(lh, &ve_hooks[chain]) {
++		tmp = list_entry(lh, struct ve_hook, list);
++		if (vh->priority < tmp->priority)
++			break;
++	}
++
++	list_add_tail(&vh->list, lh);
++	up_write(&ve_hook_sem);
++}
++
++EXPORT_SYMBOL(ve_hook_register);
++
++void ve_hook_unregister(struct ve_hook *vh)
++{
++	down_write(&ve_hook_sem);
++	list_del(&vh->list);
++	up_write(&ve_hook_sem);
++}
++
++EXPORT_SYMBOL(ve_hook_unregister);
++
++static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve)
++{
++	int err;
++
++	err = 0;
++	if (try_module_get(vh->owner)) {
++		err = vh->init(ve);
++		module_put(vh->owner);
++	}
++	return err;
++}
++
++static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve)
++{
++	if (vh->fini != NULL && try_module_get(vh->owner)) {
++		vh->fini(ve);
++		module_put(vh->owner);
++	}
++}
++
++int ve_hook_iterate_init(int chain, void *ve)
++{
++	struct ve_hook *vh;
++	int err;
++
++	err = 0;
++
++	down_read(&ve_hook_sem);
++	list_for_each_entry(vh, &ve_hooks[chain], list)
++		if ((err = ve_hook_init(vh, ve)) < 0)
++			break;
++
++	if (err)
++		list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list)
++			ve_hook_fini(vh, ve);
++
++	up_read(&ve_hook_sem);
++	return err;
++}
++
++EXPORT_SYMBOL(ve_hook_iterate_init);
++
++void ve_hook_iterate_fini(int chain, void *ve)
++{
++	struct ve_hook *vh;
++
++	down_read(&ve_hook_sem);
++	list_for_each_entry_reverse(vh, &ve_hooks[chain], list)
++		ve_hook_fini(vh, ve);
++	up_read(&ve_hook_sem);
++}
++
++EXPORT_SYMBOL(ve_hook_iterate_fini);
++
++static int __init ve_hooks_init(void)
++{
++	int i;
++
++	for (i = 0; i < VE_MAX_CHAINS; i++)
++		INIT_LIST_HEAD(&ve_hooks[i]);
++	return 0;
++}
++
++core_initcall(ve_hooks_init);
++
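
A subsystem hooks the container lifecycle by registering a struct ve_hook:
init() callbacks run in ascending priority order on VE start, fini() in
reverse on stop, and ve_hook_iterate_init() unwinds already-initialized
hooks when a later init() fails. A hedged registration sketch; the chain id
below is hypothetical, the real chain constants live in the VE headers:

    #include <linux/module.h>
    #include <linux/ve_proto.h>

    static int my_ve_init(struct ve_struct *ve)
    {
            /* set up this subsystem's per-VE state; nonzero aborts start */
            return 0;
    }

    static void my_ve_fini(struct ve_struct *ve)
    {
            /* tear down per-VE state; runs in reverse registration order */
    }

    static struct ve_hook my_hook = {
            .init           = my_ve_init,
            .fini           = my_ve_fini,
            .owner          = THIS_MODULE,
            .priority       = 0,    /* lower priority runs init first */
    };

    static int __init my_module_init(void)
    {
            ve_hook_register(0 /* hypothetical chain id */, &my_hook);
            return 0;
    }
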
+diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
+new file mode 100644
+index 0000000..d4ba7b3
+--- /dev/null
++++ b/kernel/ve/ve.c
+@@ -0,0 +1,150 @@
++/*
++ *  linux/kernel/ve/ve.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * 've.c' helper file performing VE sub-system initialization
++ */
++
++#include <linux/sched.h>
++#include <linux/delay.h>
++#include <linux/capability.h>
++#include <linux/ve.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++
++#include <linux/errno.h>
++#include <linux/unistd.h>
++#include <linux/slab.h>
++#include <linux/sys.h>
++#include <linux/kdev_t.h>
++#include <linux/termios.h>
++#include <linux/tty_driver.h>
++#include <linux/netdevice.h>
++#include <linux/utsname.h>
++#include <linux/proc_fs.h>
++#include <linux/kernel_stat.h>
++#include <linux/module.h>
++#include <linux/rcupdate.h>
++#include <linux/ve_proto.h>
++#include <linux/devpts_fs.h>
++
++#include <linux/nfcalls.h>
++
++unsigned long vz_rstamp = 0x37e0f59d;
++
++#ifdef CONFIG_MODULES
++struct module no_module = { .state = MODULE_STATE_GOING };
++EXPORT_SYMBOL(no_module);
++#endif
++
++INIT_KSYM_MODULE(ip_tables);
++INIT_KSYM_MODULE(ip6_tables);
++INIT_KSYM_MODULE(iptable_filter);
++INIT_KSYM_MODULE(ip6table_filter);
++INIT_KSYM_MODULE(iptable_mangle);
++INIT_KSYM_MODULE(ip6table_mangle);
++INIT_KSYM_MODULE(ip_conntrack);
++INIT_KSYM_MODULE(nf_conntrack);
++INIT_KSYM_MODULE(nf_conntrack_ipv4);
++INIT_KSYM_MODULE(nf_conntrack_ipv6);
++INIT_KSYM_MODULE(ip_nat);
++INIT_KSYM_MODULE(nf_nat);
++INIT_KSYM_MODULE(iptable_nat);
++
++INIT_KSYM_CALL(int, init_iptable_conntrack, (void));
++INIT_KSYM_CALL(int, nf_conntrack_init_ve, (void));
++INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void));
++INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void));
++INIT_KSYM_CALL(int, nf_nat_init, (void));
++INIT_KSYM_CALL(int, init_iptable_nat, (void));
++INIT_KSYM_CALL(void, fini_iptable_nat, (void));
++INIT_KSYM_CALL(int, init_nftable_nat, (void));
++INIT_KSYM_CALL(void, fini_nftable_nat, (void));
++INIT_KSYM_CALL(void, nf_nat_cleanup, (void));
++INIT_KSYM_CALL(void, fini_iptable_conntrack, (void));
++INIT_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void));
++INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void));
++INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void));
++
++#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS)
++INIT_KSYM_MODULE(vzmon);
++INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env));
++
++void do_env_free(struct ve_struct *env)
++{
++	KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env));
++}
++EXPORT_SYMBOL(do_env_free);
++#endif
++
++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE)
++INIT_KSYM_MODULE(vzethdev);
++INIT_KSYM_CALL(int, veth_open, (struct net_device *dev));
++#endif
++
++struct ve_struct ve0 = {
++	.counter		= ATOMIC_INIT(1),
++	.pcounter		= ATOMIC_INIT(1),
++	.ve_list		= LIST_HEAD_INIT(ve0.ve_list),
++	.vetask_lh		= LIST_HEAD_INIT(ve0.vetask_lh),
++	.start_jiffies		= INITIAL_JIFFIES,
++#ifdef CONFIG_UNIX98_PTYS
++	.devpts_config		= &devpts_config,
++#endif
++	.ve_ns			= &init_nsproxy,
++	.ve_netns		= &init_net,
++	.is_running		= 1,
++	.op_sem			= __RWSEM_INITIALIZER(ve0.op_sem),
++#ifdef CONFIG_VE_IPTABLES
++	.ipt_mask 		= ~0ULL,
++#endif
++};
++
++EXPORT_SYMBOL(ve0);
++
++#ifdef CONFIG_SMP
++static struct {
++	void *ptrs[NR_CPUS];
++} ve0_cpu_stats;
++#endif
++static struct ve_cpu_stats ve0_cpu_stats_data[NR_CPUS];
++
++LIST_HEAD(ve_list_head);
++rwlock_t ve_list_lock = RW_LOCK_UNLOCKED;
++
++LIST_HEAD(ve_cleanup_list);
++DEFINE_SPINLOCK(ve_cleanup_lock);
++struct task_struct *ve_cleanup_thread;
++
++EXPORT_SYMBOL(ve_list_lock);
++EXPORT_SYMBOL(ve_list_head);
++EXPORT_SYMBOL(ve_cleanup_lock);
++EXPORT_SYMBOL(ve_cleanup_list);
++EXPORT_SYMBOL(ve_cleanup_thread);
++
++void init_ve0(void)
++{
++	struct ve_struct *ve;
++
++	ve = get_ve0();
++	ve->cpu_stats = static_percpu_ptr(&ve0_cpu_stats, ve0_cpu_stats_data);
++	list_add(&ve->ve_list, &ve_list_head);
++}
++
++void ve_cleanup_schedule(struct ve_struct *ve)
++{
++	BUG_ON(ve_cleanup_thread == NULL);
++
++	spin_lock(&ve_cleanup_lock);
++	list_add_tail(&ve->cleanup_list, &ve_cleanup_list);
++	spin_unlock(&ve_cleanup_lock);
++
++	wake_up_process(ve_cleanup_thread);
++}
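
The do_env_free() wrapper above illustrates the KSYMSAFECALL pattern: core
code calls into vzmon only while that module is present, taking a reference
for the duration, so the module stays unloadable. Conceptually it is a
presence-checked indirect call; a plain-C sketch of the idea (the real
macros resolve the symbol and pin the module):

    struct modref {
            int live;                               /* module loaded? */
            void (*real_do_env_free)(void *env);    /* set when loaded */
    };

    static struct modref vzmon_ref;

    /* mirrors KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)) */
    static void do_env_free_sketch(void *env)
    {
            if (vzmon_ref.live && vzmon_ref.real_do_env_free)
                    vzmon_ref.real_do_env_free(env);
            /* otherwise skip quietly: nothing to clean up without vzmon */
    }
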
+diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
+new file mode 100644
+index 0000000..5aab66c
+--- /dev/null
++++ b/kernel/ve/vecalls.c
+@@ -0,0 +1,2422 @@
++/*
++ *  linux/kernel/ve/vecalls.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ */
++
++/*
++ * 'vecalls.c' is the file with basic VE support. It provides basic
++ * primitives along with the initialization script.
++ */
++
++#include <linux/sched.h>
++#include <linux/delay.h>
++#include <linux/capability.h>
++#include <linux/ve.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/errno.h>
++#include <linux/unistd.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/sys.h>
++#include <linux/fs.h>
++#include <linux/mnt_namespace.h>
++#include <linux/termios.h>
++#include <linux/tty_driver.h>
++#include <linux/netdevice.h>
++#include <linux/wait.h>
++#include <linux/inetdevice.h>
++#include <net/addrconf.h>
++#include <linux/utsname.h>
++#include <linux/sysctl.h>
++#include <linux/proc_fs.h>
++#include <linux/devpts_fs.h>
++#include <linux/shmem_fs.h>
++#include <linux/sysfs.h>
++#include <linux/seq_file.h>
++#include <linux/kernel_stat.h>
++#include <linux/module.h>
++#include <linux/suspend.h>
++#include <linux/rcupdate.h>
++#include <linux/in.h>
++#include <linux/idr.h>
++#include <linux/inetdevice.h>
++#include <linux/pid.h>
++#include <net/pkt_sched.h>
++#include <bc/beancounter.h>
++#include <linux/nsproxy.h>
++#include <linux/kobject.h>
++#include <linux/freezer.h>
++#include <linux/pid_namespace.h>
++#include <linux/tty.h>
++
++#include <net/route.h>
++#include <net/ip_fib.h>
++#include <net/ip6_route.h>
++#include <net/arp.h>
++#include <net/ipv6.h>
++
++#include <linux/ve_proto.h>
++#include <linux/venet.h>
++#include <linux/vzctl.h>
++#include <linux/vzcalluser.h>
++#ifdef CONFIG_VZ_FAIRSCHED
++#include <linux/fairsched.h>
++#endif
++
++#include <linux/nfcalls.h>
++#include <linux/virtinfo.h>
++#include <linux/utsrelease.h>
++#include <linux/major.h>
++
++int nr_ve = 1;	/* One VE always exists. Compatibility with vestat */
++EXPORT_SYMBOL(nr_ve);
++
++static int	do_env_enter(struct ve_struct *ve, unsigned int flags);
++static int	alloc_ve_tty_drivers(struct ve_struct* ve);
++static void	free_ve_tty_drivers(struct ve_struct* ve);
++static int	register_ve_tty_drivers(struct ve_struct* ve);
++static void	unregister_ve_tty_drivers(struct ve_struct* ve);
++static int	init_ve_tty_drivers(struct ve_struct *);
++static void	fini_ve_tty_drivers(struct ve_struct *);
++static void	clear_termios(struct tty_driver* driver );
++
++static void vecalls_exit(void);
++
++struct ve_struct *__find_ve_by_id(envid_t veid)
++{
++	struct ve_struct *ve;
++
++	for_each_ve(ve) {
++		if (ve->veid == veid)
++			return ve;
++	}
++	return NULL;
++}
++EXPORT_SYMBOL(__find_ve_by_id);
++
++struct ve_struct *get_ve_by_id(envid_t veid)
++{
++	struct ve_struct *ve;
++	read_lock(&ve_list_lock);
++	ve = __find_ve_by_id(veid);
++	get_ve(ve);
++	read_unlock(&ve_list_lock);
++	return ve;
++}
++EXPORT_SYMBOL(get_ve_by_id);
++
++/*
++ * real_put_ve() MUST be used instead of put_ve() inside vecalls.
++ */
++void real_do_env_free(struct ve_struct *ve);
++static inline void real_put_ve(struct ve_struct *ve)
++{
++	if (ve && atomic_dec_and_test(&ve->counter)) {
++		BUG_ON(atomic_read(&ve->pcounter) > 0);
++		BUG_ON(ve->is_running);
++		real_do_env_free(ve);
++	}
++}
++
++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
++{
++	struct ve_struct *ve;
++	struct vz_cpu_stat *vstat;
++	int retval;
++	int i, cpu;
++	unsigned long tmp;
++
++	if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
++		return -EPERM;
++	if (veid == 0)
++		return -ESRCH;
++
++	vstat = kzalloc(sizeof(*vstat), GFP_KERNEL);
++	if (!vstat)
++		return -ENOMEM;
++	
++	retval = -ESRCH;
++	read_lock(&ve_list_lock);
++	ve = __find_ve_by_id(veid);
++	if (ve == NULL)
++		goto out_unlock;
++	for_each_online_cpu(cpu) {
++		struct ve_cpu_stats *st;
++
++		st = VE_CPU_STATS(ve, cpu);
++		vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user);
++		vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice);
++		vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system);
++		vstat->idle_clk += ve_sched_get_idle_time(ve, cpu);
++	}
++	vstat->uptime_clk = get_cycles() - ve->start_cycles;
++	vstat->uptime_jif = (unsigned long)cputime64_to_clock_t(
++				get_jiffies_64() - ve->start_jiffies);
++	for (i = 0; i < 3; i++) {
++		tmp = ve->avenrun[i] + (FIXED_1/200);
++		vstat->avenrun[i].val_int = LOAD_INT(tmp);
++		vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
++	}
++	read_unlock(&ve_list_lock);
++
++	retval = 0;
++	if (copy_to_user(buf, vstat, sizeof(*vstat)))
++		retval = -EFAULT;
++out_free:
++	kfree(vstat);
++	return retval;
++
++out_unlock:
++	read_unlock(&ve_list_lock);
++	goto out_free;
++}
++
++static int real_setdevperms(envid_t veid, unsigned type,
++		dev_t dev, unsigned mask)
++{
++	struct ve_struct *ve;
++	int err;
++
++	if (!capable(CAP_SETVEID) || veid == 0)
++		return -EPERM;
++
++	if ((ve = get_ve_by_id(veid)) == NULL)
++		return -ESRCH;
++
++	down_read(&ve->op_sem);
++	err = -ESRCH;
++	if (ve->is_running)
++		err = set_device_perms_ve(ve, type, dev, mask);
++	up_read(&ve->op_sem);
++	real_put_ve(ve);
++	return err;
++}
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE start: subsystems
++ *
++ **********************************************************************
++ **********************************************************************/
++
++#ifdef CONFIG_INET
++#include <net/ip.h>
++#include <net/tcp.h>
++#include <net/udp.h>
++#include <net/icmp.h>
++
++static int init_fini_ve_mibs(struct ve_struct *ve, int fini)
++{
++	if (fini)
++		goto fini;
++	if (init_ipv4_mibs())
++		goto err_ipv4;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	if (init_ipv6_mibs())
++		goto err_ipv6;
++#endif
++	return 0;
++
++fini:
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++	cleanup_ipv6_mibs();
++err_ipv6:
++#endif
++	cleanup_ipv4_mibs();
++err_ipv4:
++	return -ENOMEM;
++}
++
++static inline int init_ve_mibs(struct ve_struct *ve)
++{
++	return init_fini_ve_mibs(ve, 0);
++}
++
++static inline void fini_ve_mibs(struct ve_struct *ve)
++{
++	(void)init_fini_ve_mibs(ve, 1);
++}
++#else
++#define init_ve_mibs(ve)	(0)
++#define fini_ve_mibs(ve)	do { } while (0)
++#endif
++
++static int prepare_proc_root(struct ve_struct *ve)
++{
++	struct proc_dir_entry *de;
++
++	de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL);
++	if (de == NULL)
++		return -ENOMEM;
++
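++	/* The entry name lives in the same allocation, right after the
++	 * struct: 6 bytes hold "/proc" plus its NUL terminator, while
++	 * namelen counts only the 5 visible characters. */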
++	memcpy(de + 1, "/proc", 6);
++	de->name = (char *)(de + 1);
++	de->namelen = 5;
++	de->mode = S_IFDIR | S_IRUGO | S_IXUGO;
++	de->nlink = 2;
++	atomic_set(&de->count, 1);
++
++	ve->proc_root = de;
++	return 0;
++}
++
++#ifdef CONFIG_PROC_FS
++static int init_ve_proc(struct ve_struct *ve)
++{
++	int err;
++
++	err = prepare_proc_root(ve);
++	if (err)
++		goto out_root;
++
++	err = register_ve_fs_type(ve, &proc_fs_type,
++			&ve->proc_fstype, &ve->proc_mnt);
++	if (err)
++		goto out_reg;
++
++#ifdef CONFIG_PRINTK
++	proc_create("kmsg", S_IRUSR, ve->proc_root, &proc_kmsg_operations);
++#endif
++	proc_mkdir("vz", ve->proc_root);
++
++	ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt);
++	return 0;
++
++out_reg:
++	/* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */
++	;
++out_root:
++	return err;
++}
++
++static void fini_ve_proc(struct ve_struct *ve)
++{
++	remove_proc_entry("vz", ve->proc_root);
++	remove_proc_entry("kmsg", ve->proc_root);
++	unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt);
++	ve->proc_mnt = NULL;
++}
++
++static void free_ve_proc(struct ve_struct *ve)
++{
++	/* proc filesystem frees proc_dir_entries on remove_proc_entry() only,
++	   so we check that everything was removed and not lost */
++	if (ve->proc_root && ve->proc_root->subdir) {
++		struct proc_dir_entry *p = ve->proc_root;
++		printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid);
++		while ((p = p->subdir) != NULL)
++			printk("/%s", p->name);
++		printk(" is not removed!\n");
++	}
++
++	kfree(ve->proc_root);
++	kfree(ve->proc_fstype);
++
++	ve->proc_fstype = NULL;
++	ve->proc_root = NULL;
++}
++#else
++#define init_ve_proc(ve)	(0)
++#define fini_ve_proc(ve)	do { } while (0)
++#define free_ve_proc(ve)	do { } while (0)
++#endif
++
++#ifdef CONFIG_UNIX98_PTYS
++#include <linux/devpts_fs.h>
++
++/*
++ * DEVPTS needs virtualization: each environment should see its own list
++ * of pseudo-terminals.
++ * To implement this we need a separate devpts superblock for each VE,
++ * and each VE should mount its own one.
++ * Thus, separate vfsmount structures are required.
++ * To minimize intrusion into the vfsmount lookup code, separate
++ * file_system_type structures are created.
++ *
++ * In addition, a patch to the character device itself is required, as
++ * the file system itself is used only for MINOR/MAJOR lookup.
++ */
++
++static int init_ve_devpts(struct ve_struct *ve)
++{
++	int err;
++
++	err = -ENOMEM;
++	ve->devpts_config = kzalloc(sizeof(struct devpts_config), GFP_KERNEL);
++	if (ve->devpts_config == NULL)
++		goto out;
++
++	ve->devpts_config->mode = 0600;
++	err = register_ve_fs_type(ve, &devpts_fs_type,
++			&ve->devpts_fstype, &ve->devpts_mnt);
++	if (err) {
++		kfree(ve->devpts_config);
++		ve->devpts_config = NULL;
++	}
++out:
++	return err;
++}
++
++static void fini_ve_devpts(struct ve_struct *ve)
++{
++	unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt);
++	/* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */
++	ve->devpts_mnt = NULL;
++	kfree(ve->devpts_config);
++	ve->devpts_config = NULL;
++}
++#else
++#define init_ve_devpts(ve)	(0)
++#define fini_ve_devpts(ve)	do { } while (0)
++#endif
++
++static int init_ve_shmem(struct ve_struct *ve)
++{
++	return register_ve_fs_type(ve,
++				   &tmpfs_fs_type,
++				   &ve->shmem_fstype,
++				   &ve->shmem_mnt);
++}
++
++static void fini_ve_shmem(struct ve_struct *ve)
++{
++	unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt);
++	/* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */
++	ve->shmem_mnt = NULL;
++}
++
++#ifdef CONFIG_SYSFS
++static int init_ve_sysfs_root(struct ve_struct *ve)
++{
++	struct sysfs_dirent *sysfs_root;
++
++	sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL);
++	if (sysfs_root == NULL)
++		return -ENOMEM;
++	sysfs_root->s_name = "";
++	atomic_set(&sysfs_root->s_count, 1);
++	sysfs_root->s_flags = SYSFS_DIR;
++	sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
++	sysfs_root->s_ino = 1;
++
++	ve->_sysfs_root = sysfs_root;
++	return 0;
++}
++#endif
++
++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
++extern struct device_attribute ve_net_class_attributes[];
++static inline int init_ve_netclass(void)
++{
++	struct class *nc;
++	int err;
++
++	nc = kzalloc(sizeof(*nc), GFP_KERNEL);
++	if (!nc)
++		return -ENOMEM;
++
++	nc->name = net_class.name;
++	nc->dev_release = net_class.dev_release;
++	nc->dev_uevent = net_class.dev_uevent;
++	nc->dev_attrs = ve_net_class_attributes;
++
++	err = class_register(nc);
++	if (!err) {
++		get_exec_env()->net_class = nc;
++		return 0;
++	}
++	kfree(nc);	
++	return err;
++}
++
++static inline void fini_ve_netclass(void)
++{
++	struct ve_struct *ve = get_exec_env();
++
++	class_unregister(ve->net_class);
++	kfree(ve->net_class);
++	ve->net_class = NULL;
++}
++#else
++static inline int init_ve_netclass(void) { return 0; }
++static inline void fini_ve_netclass(void) { ; }
++#endif
++
++extern struct kset devices_subsys;
++
++static const struct {
++	unsigned	minor;
++	char		*name;
++} mem_class_devices [] = {
++	{3, "null"},
++	{5, "zero"},
++	{7, "full"},
++	{8, "random"},
++	{9, "urandom"},
++	{0, NULL},
++};
++
++static int init_ve_mem_class(void)
++{
++	int i;
++	struct class *ve_mem_class;
++
++	ve_mem_class = class_create(THIS_MODULE, "mem");
++	if (IS_ERR(ve_mem_class))
++		return PTR_ERR(ve_mem_class);
++
++	for (i = 0; mem_class_devices[i].name; i++)
++		device_create(ve_mem_class, NULL,
++				MKDEV(MEM_MAJOR, mem_class_devices[i].minor),
++				mem_class_devices[i].name);
++
++	get_exec_env()->mem_class = ve_mem_class;
++	return 0;
++}
++
++
++void fini_ve_mem_class(void)
++{
++	int i;
++	struct class *ve_mem_class = get_exec_env()->mem_class;
++
++	for (i = 0; mem_class_devices[i].name; i++)
++		device_destroy(ve_mem_class,
++				MKDEV(MEM_MAJOR, mem_class_devices[i].minor));
++	class_destroy(ve_mem_class);
++}
++
++static int init_ve_sysfs(struct ve_struct *ve)
++{
++	int err;
++
++#ifdef CONFIG_SYSFS
++	err = 0;
++	if (ve->features & VE_FEATURE_SYSFS) {
++		err = init_ve_sysfs_root(ve);
++		if (err != 0)
++			goto out;
++		err = register_ve_fs_type(ve,
++				   &sysfs_fs_type,
++				   &ve->sysfs_fstype,
++				   &ve->sysfs_mnt);
++		if (err != 0)
++			goto out_fs_type;
++	}
++#endif
++
++	err = classes_init();
++	if (err != 0)
++		goto err_classes;
++
++	err = devices_init();
++	if (err != 0)
++		goto err_devices;
++
++	err = init_ve_netclass();
++	if (err != 0)
++		goto err_net;
++
++	err = init_ve_tty_class();
++	if (err != 0)
++		goto err_tty;
++
++	err = init_ve_mem_class();
++	if (err != 0)
++		goto err_mem;
++
++	return 0;
++
++err_mem:
++	fini_ve_tty_class();
++err_tty:
++	fini_ve_netclass();
++err_net:
++	devices_fini();
++err_devices:
++	classes_fini();
++err_classes:
++#ifdef CONFIG_SYSFS
++	unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
++	/* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
++out_fs_type:
++	kfree(ve->_sysfs_root);
++	ve->_sysfs_root = NULL;
++out:
++#endif
++	return err;
++}
++
++static void fini_ve_sysfs(struct ve_struct *ve)
++{
++	fini_ve_mem_class();
++	fini_ve_tty_class();
++	fini_ve_netclass();
++	devices_fini();
++	classes_fini();
++#ifdef CONFIG_SYSFS
++	unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
++	ve->sysfs_mnt = NULL;
++	kfree(ve->_sysfs_root);
++	ve->_sysfs_root = NULL;
++	/* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
++#endif
++}
++
++static void free_ve_filesystems(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSFS
++	kfree(ve->sysfs_fstype);
++	ve->sysfs_fstype = NULL;
++#endif
++	kfree(ve->shmem_fstype);
++	ve->shmem_fstype = NULL;
++
++	kfree(ve->devpts_fstype);
++	ve->devpts_fstype = NULL;
++
++	free_ve_proc(ve);
++}
++
++static int init_printk(struct ve_struct *ve)
++{
++	struct ve_prep_printk {
++		wait_queue_head_t       log_wait;
++		unsigned		log_start;
++		unsigned		log_end;
++		unsigned		logged_chars;
++	} *tmp;
++
++	tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL);
++	if (!tmp)
++		return -ENOMEM;
++
++	init_waitqueue_head(&tmp->log_wait);
++	ve->_log_wait = &tmp->log_wait;
++	ve->_log_start = &tmp->log_start;
++	ve->_log_end = &tmp->log_end;
++	ve->_logged_chars = &tmp->logged_chars;
++	/* ve->log_buf will be initialized later by ve_log_init() */
++	return 0;
++}
++
++static void fini_printk(struct ve_struct *ve)
++{
++	/*
++	 * There is no spinlock protection here because nobody can use
++	 * log_buf at the moment this code is called.
++	 */
++	kfree(ve->log_buf);
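++	/* log_wait is the first member of the ve_prep_printk buffer
++	 * allocated in init_printk(), so this kfree releases the whole
++	 * structure. */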
++	kfree(ve->_log_wait);
++}
++
++static void fini_venet(struct ve_struct *ve)
++{
++#ifdef CONFIG_INET
++	tcp_v4_kill_ve_sockets(ve);
++	synchronize_net();
++#endif
++}
++
++static int init_ve_sched(struct ve_struct *ve)
++{
++#ifdef CONFIG_VZ_FAIRSCHED
++	int err;
++
++	/*
++	 * We refuse to switch to an already existing node since nodes
++	 * keep a pointer to their ve_struct...
++	 */
++	err = sys_fairsched_mknod(0, 1, ve->veid);
++	if (err < 0) {
++		printk(KERN_WARNING "Can't create fairsched node %d\n",
++				ve->veid);
++		return err;
++	}
++	err = sys_fairsched_mvpr(current->pid, ve->veid);
++	if (err) {
++		printk(KERN_WARNING "Can't switch to fairsched node %d\n",
++				ve->veid);
++		if (sys_fairsched_rmnod(ve->veid))
++			printk(KERN_ERR "Can't clean fairsched node %d\n",
++					ve->veid);
++		return err;
++	}
++#endif
++	ve_sched_attach(ve);
++	return 0;
++}
++
++static void fini_ve_sched(struct ve_struct *ve)
++{
++#ifdef CONFIG_VZ_FAIRSCHED
++	if (task_fairsched_node_id(current) == ve->veid)
++		if (sys_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID))
++			printk(KERN_WARNING "Can't leave fairsched node %d\n",
++					ve->veid);
++	if (sys_fairsched_rmnod(ve->veid))
++		printk(KERN_ERR "Can't remove fairsched node %d\n",
++				ve->veid);
++#endif
++}
++
++/*
++ * Namespaces
++ */
++
++static inline int init_ve_namespaces(struct ve_struct *ve,
++		struct nsproxy **old)
++{
++	int err;
++	struct task_struct *tsk;
++	struct nsproxy *cur;
++
++	tsk = current;
++	cur = tsk->nsproxy;
++
++	err = copy_namespaces(CLONE_NAMESPACES_MASK & ~CLONE_NEWNET, tsk);
++	if (err < 0)
++		return err;
++
++	ve->ve_ns = get_nsproxy(tsk->nsproxy);
++	memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release,
++			sizeof(virt_utsname.release));
++
++	if (cur->pid_ns->flags & PID_NS_HIDE_CHILD)
++		ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN;
++
++	*old = cur;
++	return 0;
++}
++
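++/*
++ * fini_ve_namespaces() serves two callers: with a non-NULL old nsproxy
++ * it rolls the task and the VE back to the pre-create namespaces (the
++ * error paths); with NULL it simply drops the VE's reference during
++ * final cleanup.
++ */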
++static inline void fini_ve_namespaces(struct ve_struct *ve,
++		struct nsproxy *old)
++{
++	struct task_struct *tsk = current;
++	struct nsproxy *tmp;
++
++	if (old) {
++		tmp = tsk->nsproxy;
++		tsk->nsproxy = get_nsproxy(old);
++		put_nsproxy(tmp);
++		tmp = ve->ve_ns;
++		ve->ve_ns = get_nsproxy(old);
++		put_nsproxy(tmp);
++	} else {
++		put_nsproxy(ve->ve_ns);
++		ve->ve_ns = NULL;
++	}
++}
++
++static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old)
++{
++	int err;
++	struct task_struct *tsk;
++	struct nsproxy *cur;
++
++	tsk = current;
++	cur = tsk->nsproxy;
++
++	err = copy_namespaces(CLONE_NEWNET, tsk);
++	if (err < 0)
++		return err;
++
++	put_nsproxy(ve->ve_ns);
++	ve->ve_ns = get_nsproxy(tsk->nsproxy);
++	ve->ve_netns = get_net(ve->ve_ns->net_ns);
++	*old = cur;
++	return 0;
++}
++
++static inline void switch_ve_namespaces(struct ve_struct *ve,
++		struct task_struct *tsk)
++{
++	struct nsproxy *old_ns;
++	struct nsproxy *new_ns;
++
++	BUG_ON(tsk != current);
++	old_ns = tsk->nsproxy;
++	new_ns = ve->ve_ns;
++
++	if (old_ns != new_ns) {
++		tsk->nsproxy = get_nsproxy(new_ns);
++		put_nsproxy(old_ns);
++	}
++}
++
++static __u64 get_ve_features(env_create_param_t *data, int datalen)
++{
++	__u64 known_features;
++
++	if (datalen < sizeof(struct env_create_param3))
++		/* this version of vzctl is aware of VE_FEATURES_OLD only */
++		known_features = VE_FEATURES_OLD;
++	else
++		known_features = data->known_features;
++
++	/*
++	 * Features the caller knows about are set as requested;
++	 * features unknown to it fall back to VE_FEATURES_DEF.
++	 */
++	return (data->feature_mask & known_features) |
++		(VE_FEATURES_DEF & ~known_features);
++}
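++
++/*
++ * Example for get_ve_features() (hypothetical masks): with
++ * known_features = 0x3 and feature_mask = 0x1, feature bit 0 is
++ * enabled and bit 1 disabled as requested, while all bits above
++ * bit 1 follow VE_FEATURES_DEF.
++ */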
++
++static int init_ve_struct(struct ve_struct *ve, envid_t veid,
++		u32 class_id, env_create_param_t *data, int datalen)
++{
++	(void)get_ve(ve);
++	ve->veid = veid;
++	ve->class_id = class_id;
++	ve->features = get_ve_features(data, datalen);
++	INIT_LIST_HEAD(&ve->vetask_lh);
++	init_rwsem(&ve->op_sem);
++
++	ve->start_timespec = current->start_time;
++	/* The value is wrong, but it is never compared to process
++	 * start times */
++	ve->start_jiffies = get_jiffies_64();
++	ve->start_cycles = get_cycles();
++
++	return 0;
++}
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * /proc/meminfo virtualization
++ *
++ **********************************************************************
++ **********************************************************************/
++static int ve_set_meminfo(envid_t veid, unsigned long val)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	struct ve_struct *ve;
++
++	ve = get_ve_by_id(veid);
++	if (!ve)
++		return -EINVAL;
++
++	ve->meminfo_val = val;
++	real_put_ve(ve);
++	return 0;
++#else
++	return -ENOTTY;
++#endif
++}
++
++static int init_ve_meminfo(struct ve_struct *ve)
++{
++	ve->meminfo_val = 0;
++	return 0;
++}
++
++static inline void fini_ve_meminfo(struct ve_struct *ve)
++{
++}
++
++static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk)
++{
++	read_lock(&tsk->fs->lock);
++	ve->root_path = tsk->fs->root;
++	read_unlock(&tsk->fs->lock);
++	mark_tree_virtual(&ve->root_path);
++}
++
++static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk)
++{
++	/* required for real_setdevperms from register_ve_<fs> above */
++	memcpy(&ve->ve_cap_bset, &tsk->cap_effective, sizeof(kernel_cap_t));
++	cap_lower(ve->ve_cap_bset, CAP_SETVEID);
++}
++
++static int ve_list_add(struct ve_struct *ve)
++{
++	write_lock_irq(&ve_list_lock);
++	if (__find_ve_by_id(ve->veid) != NULL)
++		goto err_exists;
++
++	list_add(&ve->ve_list, &ve_list_head);
++	nr_ve++;
++	write_unlock_irq(&ve_list_lock);
++	return 0;
++
++err_exists:
++	write_unlock_irq(&ve_list_lock);
++	return -EEXIST;
++}
++
++static void ve_list_del(struct ve_struct *ve)
++{
++	write_lock_irq(&ve_list_lock);
++	list_del(&ve->ve_list);
++	nr_ve--;
++	write_unlock_irq(&ve_list_lock);
++}
++
++static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve)
++{
++	kernel_cap_t bset;
++
++	spin_lock(&task_capability_lock);
++	bset = ve->ve_cap_bset;
++	tsk->cap_effective = cap_intersect(tsk->cap_effective, bset);
++	tsk->cap_inheritable = cap_intersect(tsk->cap_inheritable, bset);
++	tsk->cap_permitted = cap_intersect(tsk->cap_permitted, bset);
++	spin_unlock(&task_capability_lock);
++}
++
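++/*
++ * Move the (single-threaded) current task onto the new VE's task list.
++ * The entry is removed from the old list and re-added only after
++ * synchronize_rcu(), so concurrent RCU walkers of the old list are
++ * done with it before it is reused.
++ */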
++void ve_move_task(struct task_struct *tsk, struct ve_struct *new)
++{
++	struct ve_struct *old;
++
++	might_sleep();
++	BUG_ON(tsk != current);
++	BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk)));
++
++	/* this prohibits ptracing of a task that entered a VE from the host system */
++	tsk->mm->vps_dumpable = 0;
++	/* setup capabilities before enter */
++	set_task_ve_caps(tsk, new);
++
++	old = tsk->ve_task_info.owner_env;
++	tsk->ve_task_info.owner_env = new;
++	tsk->ve_task_info.exec_env = new;
++
++	write_lock_irq(&tasklist_lock);
++	list_del_rcu(&tsk->ve_task_info.vetask_list);
++	write_unlock_irq(&tasklist_lock);
++
++	synchronize_rcu();
++
++	write_lock_irq(&tasklist_lock);
++	list_add_tail_rcu(&tsk->ve_task_info.vetask_list,
++			&new->vetask_lh);
++	write_unlock_irq(&tasklist_lock);
++
++	atomic_dec(&old->pcounter);
++	real_put_ve(old);
++
++	atomic_inc(&new->pcounter);
++	get_ve(new);
++
++	tsk->cgroups = new->ve_css_set;
++}
++
++EXPORT_SYMBOL(ve_move_task);
++
++#ifdef CONFIG_VE_IPTABLES
++
++#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args)	\
++({								\
++	int ret = 0;						\
++	if (VE_IPT_CMP(mask, full_mask) &&			\
++		VE_IPT_CMP((ve)->_iptables_modules, 		\
++			full_mask & ~(full_mask##_MOD))) {	\
++		ret = KSYMERRCALL(1, mod, name, args);		\
++		if (ret == 0)					\
++			(ve)->_iptables_modules |=		\
++					full_mask##_MOD;	\
++		if (ret == 1)					\
++			ret = 0;				\
++	}							\
++	ret;							\
++})
++
++#define KSYMIPTFINI(mask, full_mask, mod, name, args)		\
++({								\
++ 	if (VE_IPT_CMP(mask, full_mask##_MOD))			\
++		KSYMSAFECALL_VOID(mod, name, args);		\
++})
++
++
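++/*
++ * KSYMIPTINIT calls an init function in module `mod' only when the
++ * requested mask contains `full_mask' and the VE has not initialized
++ * it yet, recording success in ve->_iptables_modules; KSYMIPTFINI
++ * invokes the matching cleanup only if that module bit is set.
++ * do_ve_iptables() below interleaves its error labels with the
++ * cleanup path, so a failed init unwinds exactly the steps done
++ * so far.
++ */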
++static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask,
++		int init_or_cleanup)
++{
++	int err;
++
++	/* Remove once userspace starts supplying IPv6-related bits. */
++	init_mask &= ~VE_IP_IPTABLES6;
++	init_mask &= ~VE_IP_FILTER6;
++	init_mask &= ~VE_IP_MANGLE6;
++	init_mask &= ~VE_IP_IPTABLE_NAT_MOD;
++	init_mask &= ~VE_NF_CONNTRACK_MOD;
++	if ((init_mask & VE_IP_IPTABLES) == VE_IP_IPTABLES)
++		init_mask |= VE_IP_IPTABLES6;
++	if ((init_mask & VE_IP_FILTER) == VE_IP_FILTER)
++		init_mask |= VE_IP_FILTER6;
++	if ((init_mask & VE_IP_MANGLE) == VE_IP_MANGLE)
++		init_mask |= VE_IP_MANGLE6;
++	if ((init_mask & VE_IP_NAT) == VE_IP_NAT)
++		init_mask |= VE_IP_IPTABLE_NAT;
++
++	if ((init_mask & VE_IP_CONNTRACK) == VE_IP_CONNTRACK)
++		init_mask |= VE_NF_CONNTRACK;
++
++	err = 0;
++	if (!init_or_cleanup)
++		goto cleanup;
++
++	/* init part */
++#if defined(CONFIG_NF_CONNTRACK_IPV4) || \
++    defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
++	err = KSYMIPTINIT(init_mask, ve, VE_NF_CONNTRACK,
++			nf_conntrack, nf_conntrack_init_ve, ());
++	if (err < 0)
++		goto err_nf_conntrack;
++
++	err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK,
++			nf_conntrack_ipv4, init_nf_ct_l3proto_ipv4, ());
++	if (err < 0)
++		goto err_nf_conntrack_ipv4;
++#endif
++#if defined(CONFIG_NF_NAT) || \
++    defined(CONFIG_NF_NAT_MODULE)
++	err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT,
++			nf_nat, nf_nat_init, ());
++	if (err < 0)
++		goto err_nftable_nat;
++	err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLE_NAT,
++			iptable_nat, init_nftable_nat, ());
++	if (err < 0)
++		goto err_nftable_nat2;
++#endif
++	return 0;
++
++/* ------------------------------------------------------------------------- */
++
++cleanup:
++#if defined(CONFIG_NF_NAT) || \
++    defined(CONFIG_NF_NAT_MODULE)
++	KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLE_NAT,
++			iptable_nat, fini_nftable_nat, ());
++err_nftable_nat2:
++	KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT,
++			nf_nat, nf_nat_cleanup, ());
++err_nftable_nat:
++#endif
++#if defined(CONFIG_NF_CONNTRACK_IPV4) || \
++    defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
++	KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK,
++		nf_conntrack_ipv4, fini_nf_ct_l3proto_ipv4, ());
++err_nf_conntrack_ipv4:
++	KSYMIPTFINI(ve->_iptables_modules, VE_NF_CONNTRACK,
++		nf_conntrack, nf_conntrack_cleanup_ve, ());
++err_nf_conntrack:
++#endif
++	/* Do not reset _iptables_modules, as the
++	 * net hooks still use it.
++	 */
++	return err;
++}
++
++static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask)
++{
++	return do_ve_iptables(ve, init_mask, 1);
++}
++
++static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask)
++{
++	(void)do_ve_iptables(ve, init_mask, 0);
++}
++
++#else
++#define init_ve_iptables(x, y)	(0)
++#define fini_ve_iptables(x, y)	do { } while (0)
++#endif
++
++static inline int init_ve_cpustats(struct ve_struct *ve)
++{
++	ve->cpu_stats = alloc_percpu(struct ve_cpu_stats);
++	return ve->cpu_stats == NULL ? -ENOMEM : 0;
++}
++
++static inline void free_ve_cpustats(struct ve_struct *ve)
++{
++	free_percpu(ve->cpu_stats);
++	ve->cpu_stats = NULL;
++}
++
++static int alone_in_pgrp(struct task_struct *tsk)
++{
++	struct task_struct *p;
++	int alone = 0;
++
++	read_lock(&tasklist_lock);
++	do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) {
++		if (p != tsk)
++			goto out;
++	} while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p);
++	do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) {
++		if (p != tsk)
++			goto out;
++	} while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p);
++	alone = 1;
++out:
++	read_unlock(&tasklist_lock);
++	return alone;
++}
++
++static int do_env_create(envid_t veid, unsigned int flags, u32 class_id,
++			 env_create_param_t *data, int datalen)
++{
++	struct task_struct *tsk;
++	struct ve_struct *old;
++	struct ve_struct *old_exec;
++	struct ve_struct *ve;
++ 	__u64 init_mask;
++	int err;
++	struct nsproxy *old_ns, *old_ns_net;
++	DECLARE_COMPLETION_ONSTACK(sysfs_completion);
++
++	tsk = current;
++	old = VE_TASK_INFO(tsk)->owner_env;
++
++	if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
++		return -EINVAL;
++
++	if (tsk->signal->tty) {
++		printk("ERR: CT init has controlling terminal\n");
++		return -EINVAL;
++	}
++	if (task_pgrp(tsk) != task_pid(tsk) ||
++			task_session(tsk) != task_pid(tsk)) {
++		int may_setsid;
++
++		read_lock(&tasklist_lock);
++		may_setsid = !tsk->signal->leader &&
++			!find_task_by_pid_type_ns(PIDTYPE_PGID, task_pid_nr(tsk), &init_pid_ns);
++		read_unlock(&tasklist_lock);
++
++		if (!may_setsid) {
++			printk("ERR: CT init is process group leader\n");
++			return -EINVAL;
++		}
++	}
++	/* Check that the process is not a leader of non-empty group/session.
++	 * If it is, we cannot virtualize its PID and must fail. */
++	if (!alone_in_pgrp(tsk)) {
++		printk("ERR: CT init is not alone in process group\n");
++		return -EINVAL;
++	}
++
++
++	VZTRACE("%s: veid=%d classid=%d pid=%d\n",
++		__func__, veid, class_id, current->pid);
++
++	err = -ENOMEM;
++	ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL);
++	if (ve == NULL)
++		goto err_struct;
++
++	init_ve_struct(ve, veid, class_id, data, datalen);
++	__module_get(THIS_MODULE);
++	down_write(&ve->op_sem);
++	if (flags & VE_LOCK)
++		ve->is_locked = 1;
++
++	/*
++	 * this should be done before adding to list
++	 * because if calc_load_ve finds this ve in
++	 * list it will be very surprised
++	 */
++	if ((err = init_ve_cpustats(ve)) < 0)
++		goto err_cpu_stats;
++
++	if ((err = ve_list_add(ve)) < 0)
++		goto err_exist;
++
++	/* this should be done before context switching */
++	if ((err = init_printk(ve)) < 0)
++		goto err_log_wait;
++
++	old_exec = set_exec_env(ve);
++
++	if ((err = init_ve_sched(ve)) < 0)
++		goto err_sched;
++
++	set_ve_root(ve, tsk);
++
++	if ((err = init_ve_sysfs(ve)))
++		goto err_sysfs;
++
++	if ((err = init_ve_mibs(ve)))
++		goto err_mibs;
++
++	if ((err = init_ve_namespaces(ve, &old_ns)))
++		goto err_ns;
++
++	if ((err = init_ve_proc(ve)))
++		goto err_proc;
++
++
++	init_mask = data ? data->iptables_mask : VE_IP_DEFAULT;
++
++#ifdef CONFIG_VE_IPTABLES
++	/* Set up ipt_mask as it will be used during
++	 * net namespace initialization
++	 */
++	ve->ipt_mask = init_mask;
++#endif
++
++	if ((err = init_ve_netns(ve, &old_ns_net)))
++		goto err_netns;
++
++	if ((err = init_ve_cgroups(ve)))
++		goto err_cgroup;
++
++	if ((err = init_ve_tty_drivers(ve)) < 0)
++		goto err_tty;
++
++	if ((err = init_ve_shmem(ve)))
++		goto err_shmem;
++
++	if ((err = init_ve_devpts(ve)))
++		goto err_devpts;
++
++	if ((err = init_ve_meminfo(ve)))
++		goto err_meminf;
++
++	set_ve_caps(ve, tsk);
++
++	/* It is safe to initialize netfilter here: routing initialization
++	   and interface setup are done below, which means that no skb can
++	   be passed inside yet. Den */
++	/* iptables initialization for non-ve0;
++	   ve0 init is in module_init */
++
++	if ((err = init_ve_iptables(ve, init_mask)) < 0)
++		goto err_iptables;
++
++	if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0)
++		goto err_vpid;
++
++	if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0)
++		goto err_ve_hook;
++
++	put_nsproxy(old_ns);
++	put_nsproxy(old_ns_net);
++
++	/* finally: set vpids and move inside */
++	ve_move_task(tsk, ve);
++
++	ve->is_running = 1;
++	up_write(&ve->op_sem);
++
++	printk(KERN_INFO "CT: %d: started\n", veid);
++	return veid;
++
++err_ve_hook:
++	mntget(ve->proc_mnt);
++err_vpid:
++	fini_venet(ve);
++	fini_ve_iptables(ve, init_mask);
++err_iptables:
++	fini_ve_meminfo(ve);
++err_meminf:
++	fini_ve_devpts(ve);
++err_devpts:
++	fini_ve_shmem(ve);
++err_shmem:
++	fini_ve_tty_drivers(ve);
++err_tty:
++	fini_ve_cgroups(ve);
++err_cgroup:
++	fini_ve_namespaces(ve, old_ns_net);
++	put_nsproxy(old_ns_net);
++	ve->ve_netns->sysfs_completion = &sysfs_completion;
++	put_net(ve->ve_netns);
++	wait_for_completion(&sysfs_completion);
++err_netns:
++	/*
++	 * If process hasn't become VE's init, proc_mnt won't be put during
++	 * pidns death, so this mntput by hand is needed. If it has, we
++	 * compensate with mntget above.
++	 */
++	mntput(ve->proc_mnt);
++	fini_ve_proc(ve);
++err_proc:
++	/* free_ve_utsname() is called inside real_put_ve() */
++	fini_ve_namespaces(ve, old_ns);
++	put_nsproxy(old_ns);
++	/*
++	 * We need to compensate, because fini_ve_namespaces() assumes
++	 * ve->ve_ns will continue to be used after, but VE will be freed soon
++	 * (in kfree() sense).
++	 */
++	put_nsproxy(ve->ve_ns);
++err_ns:
++	fini_ve_mibs(ve);
++err_mibs:
++	fini_ve_sysfs(ve);
++err_sysfs:
++	/* It is safe to restore current->envid here because
++	 * ve_fairsched_detach does not use current->envid. */
++	/* Actually, the fairsched code uses current->envid only in
++	 * sys_fairsched_mknod.  That is correct when sys_fairsched_mknod
++	 * is called from userspace.  If it is called from
++	 * ve_fairsched_attach, then node->envid and node->parent_node->envid
++	 * are explicitly set to valid values after the call. */
++	/* FIXME */
++	VE_TASK_INFO(tsk)->owner_env = old;
++	VE_TASK_INFO(tsk)->exec_env = old_exec;
++
++	fini_ve_sched(ve);
++err_sched:
++	(void)set_exec_env(old_exec);
++
++	/* we can get here with an incorrect envid */
++	VE_TASK_INFO(tsk)->owner_env = old;
++	fini_printk(ve);
++err_log_wait:
++	/* cpustats will be freed in do_env_free */
++	ve_list_del(ve);
++	up_write(&ve->op_sem);
++
++	real_put_ve(ve);
++err_struct:
++	printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err);
++	return err;
++
++err_exist:
++	free_ve_cpustats(ve);
++err_cpu_stats:
++	kfree(ve);
++	goto err_struct;
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE start/stop callbacks
++ *
++ **********************************************************************
++ **********************************************************************/
++
++int real_env_create(envid_t veid, unsigned flags, u32 class_id,
++			env_create_param_t *data, int datalen)
++{
++	int status;
++	struct ve_struct *ve;
++
++	if (!flags) {
++		status = get_exec_env()->veid;
++		goto out;
++	}
++
++	status = -EPERM;
++	if (!capable(CAP_SETVEID))
++		goto out;
++
++	status = -EINVAL;
++	if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE)))
++		goto out;
++
++	status = -EINVAL;
++	ve = get_ve_by_id(veid);
++	if (ve) {
++		if (flags & VE_TEST) {
++			status = 0;
++			goto out_put;
++		}
++		if (flags & VE_EXCLUSIVE) {
++			status = -EACCES;
++			goto out_put;
++		}
++		if (flags & VE_CREATE) {
++			flags &= ~VE_CREATE;
++			flags |= VE_ENTER;
++		}
++	} else {
++		if (flags & (VE_TEST|VE_ENTER)) {
++			status = -ESRCH;
++			goto out;
++		}
++	}
++
++	if (flags & VE_CREATE) {
++		status = do_env_create(veid, flags, class_id, data, datalen);
++		goto out;
++	} else if (flags & VE_ENTER)
++		status = do_env_enter(ve, flags);
++
++	/* else: returning EINVAL */
++
++out_put:
++	real_put_ve(ve);
++out:
++	return status;
++}
++EXPORT_SYMBOL(real_env_create);
++
++static int do_env_enter(struct ve_struct *ve, unsigned int flags)
++{
++	struct task_struct *tsk = current;
++	int err;
++
++	VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid);
++
++	err = -EBUSY;
++	down_read(&ve->op_sem);
++	if (!ve->is_running)
++		goto out_up;
++	if (ve->is_locked && !(flags & VE_SKIPLOCK))
++		goto out_up;
++	err = -EINVAL;
++	if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
++		goto out_up;
++
++#ifdef CONFIG_VZ_FAIRSCHED
++	err = sys_fairsched_mvpr(current->pid, ve->veid);
++	if (err)
++		goto out_up;
++#endif
++	ve_sched_attach(ve);
++	switch_ve_namespaces(ve, tsk);
++	ve_move_task(current, ve);
++
++	/* Check that the process is not a leader of non-empty group/session.
++	 * If it is, we cannot virtualize its PID. Do not fail, just leave
++	 * it non-virtual.
++	 */
++	if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK))
++		pid_ns_attach_task(ve->ve_ns->pid_ns, tsk);
++
++	/* Unlike VE_CREATE, we do not setsid() on VE_ENTER.
++	 * The process is allowed to be in an external group/session.
++	 * If the userspace caller wants, it can do setsid() after
++	 * VE_ENTER.
++	 */
++	err = VE_TASK_INFO(tsk)->owner_env->veid;
++	tsk->did_ve_enter = 1;
++
++out_up:
++	up_read(&ve->op_sem);
++	return err;
++}
++
++static void env_cleanup(struct ve_struct *ve)
++{
++	struct ve_struct *old_ve;
++	DECLARE_COMPLETION_ONSTACK(sysfs_completion);
++
++	VZTRACE("real_do_env_cleanup\n");
++
++	down_read(&ve->op_sem);
++	old_ve = set_exec_env(ve);
++
++	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
++
++	fini_venet(ve);
++
++	/* no new packets in flight beyond this point */
++
++	/* kill iptables */
++	/* No skb belonging to VE can exist at this point as unregister_netdev
++	   is an operation awaiting until ALL skb's gone */
++	fini_ve_iptables(ve, ve->_iptables_modules);
++
++	fini_ve_sched(ve);
++
++	fini_ve_devpts(ve);
++	fini_ve_shmem(ve);
++	unregister_ve_tty_drivers(ve);
++	fini_ve_meminfo(ve);
++
++	fini_ve_cgroups(ve);
++
++	fini_ve_namespaces(ve, NULL);
++	ve->ve_netns->sysfs_completion = &sysfs_completion;
++	put_net(ve->ve_netns);
++	wait_for_completion(&sysfs_completion);
++	fini_ve_mibs(ve);
++	fini_ve_proc(ve);
++	fini_ve_sysfs(ve);
++
++	(void)set_exec_env(old_ve);
++	fini_printk(ve);	/* no printk can happen in ve context anymore */
++
++	ve_list_del(ve);
++	up_read(&ve->op_sem);
++
++	real_put_ve(ve);
++}
++
++static DECLARE_COMPLETION(vzmond_complete);
++static volatile int stop_vzmond;
++
++static int vzmond_helper(void *arg)
++{
++	char name[18];
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)arg;
++	snprintf(name, sizeof(name), "vzmond/%d", ve->veid);
++	daemonize(name);
++	env_cleanup(ve);
++	module_put_and_exit(0);
++}
++
++static void do_pending_env_cleanups(void)
++{
++	int err;
++	struct ve_struct *ve;
++
++	spin_lock(&ve_cleanup_lock);
++	while (1) {
++		if (list_empty(&ve_cleanup_list) || need_resched())
++			break;
++
++		ve = list_first_entry(&ve_cleanup_list,
++				struct ve_struct, cleanup_list);
++		list_del(&ve->cleanup_list);
++		spin_unlock(&ve_cleanup_lock);
++
++		__module_get(THIS_MODULE);
++		err = kernel_thread(vzmond_helper, (void *)ve, 0);
++		if (err < 0) {
++			env_cleanup(ve);
++			module_put(THIS_MODULE);
++		}
++
++		spin_lock(&ve_cleanup_lock);
++	}
++	spin_unlock(&ve_cleanup_lock);
++}
++
++static inline int have_pending_cleanups(void)
++{
++	return !list_empty(&ve_cleanup_list);
++}
++
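++/*
++ * vzmond marks itself TASK_INTERRUPTIBLE before re-checking
++ * stop_vzmond and the cleanup list, so a wake_up_process() from
++ * fini_vzmond() or from the cleanup enqueue path cannot be lost
++ * between the check and the schedule().
++ */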
++static int vzmond(void *arg)
++{
++	daemonize("vzmond");
++	set_current_state(TASK_INTERRUPTIBLE);
++
++	while (!stop_vzmond || have_pending_cleanups()) {
++		schedule();
++		try_to_freeze();
++		if (signal_pending(current))
++			flush_signals(current);
++
++		do_pending_env_cleanups();
++		set_current_state(TASK_INTERRUPTIBLE);
++		if (have_pending_cleanups())
++			__set_current_state(TASK_RUNNING);
++	}
++
++	__set_task_state(current, TASK_RUNNING);
++	complete_and_exit(&vzmond_complete, 0);
++}
++
++static int __init init_vzmond(void)
++{
++	int pid;
++	struct task_struct *tsk;
++
++	pid = kernel_thread(vzmond, NULL, 0);
++	if (pid > 0) {
++		tsk = find_task_by_pid(pid);
++		BUG_ON(tsk == NULL);
++		ve_cleanup_thread = tsk;
++	}
++	return pid;
++}
++
++static void fini_vzmond(void)
++{
++	stop_vzmond = 1;
++	wake_up_process(ve_cleanup_thread);
++	wait_for_completion(&vzmond_complete);
++	ve_cleanup_thread = NULL;
++	WARN_ON(!list_empty(&ve_cleanup_list));
++}
++
++void real_do_env_free(struct ve_struct *ve)
++{
++	VZTRACE("real_do_env_free\n");
++
++	free_ve_tty_drivers(ve);
++	free_ve_filesystems(ve);
++	free_ve_cpustats(ve);
++	printk(KERN_INFO "CT: %d: stopped\n", VEID(ve));
++	kfree(ve);
++
++	module_put(THIS_MODULE);
++}
++EXPORT_SYMBOL(real_do_env_free);
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE TTY handling
++ *
++ **********************************************************************
++ **********************************************************************/
++
++static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base,
++					   struct ve_struct *ve)
++{
++	size_t size;
++	struct tty_driver *driver;
++
++	/* FIXME: make it a normal way (or wait till ms version) */
++
++	driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC);
++	if (!driver)
++		goto out;
++
++	memcpy(driver, base, sizeof(struct tty_driver));
++
++	driver->driver_state = NULL;
++
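++	/* One buffer backs three parallel arrays of base->num entries
++	 * each: tty_struct pointers, termios, and termios_locked. */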
++	size = base->num * 3 * sizeof(void *);
++	if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) {
++		void **p;
++		p = kzalloc(size, GFP_KERNEL_UBC);
++		if (!p)
++			goto out_free;
++
++		driver->ttys = (struct tty_struct **)p;
++		driver->termios = (struct ktermios **)(p + driver->num);
++		driver->termios_locked = (struct ktermios **)
++			(p + driver->num * 2);
++	} else {
++		driver->ttys = NULL;
++		driver->termios = NULL;
++		driver->termios_locked = NULL;
++	}
++
++	driver->owner_env = ve;
++	driver->flags |= TTY_DRIVER_INSTALLED;
++	driver->refcount = 0;
++
++	return driver;
++
++out_free:
++	kfree(driver);
++out:
++	return NULL;
++}
++
++static void free_ve_tty_driver(struct tty_driver *driver)
++{
++	if (!driver)
++		return;
++
++	clear_termios(driver);
++	kfree(driver->ttys);
++	kfree(driver);
++}
++
++static int alloc_ve_tty_drivers(struct ve_struct* ve)
++{
++#ifdef CONFIG_LEGACY_PTYS
++	/* Traditional BSD devices */
++	ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve);
++	if (!ve->pty_driver)
++		goto out_mem;
++
++	ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve);
++	if (!ve->pty_slave_driver)
++		goto out_mem;
++
++	ve->pty_driver->other       = ve->pty_slave_driver;
++	ve->pty_slave_driver->other = ve->pty_driver;
++#endif	
++
++#ifdef CONFIG_UNIX98_PTYS
++	ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve);
++	if (!ve->ptm_driver)
++		goto out_mem;
++
++	ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve);
++	if (!ve->pts_driver)
++		goto out_mem;
++
++	ve->ptm_driver->other = ve->pts_driver;
++	ve->pts_driver->other = ve->ptm_driver;
++
++	ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys),
++			GFP_KERNEL_UBC);
++	if (!ve->allocated_ptys)
++		goto out_mem;
++	idr_init(ve->allocated_ptys);
++#endif
++	return 0;
++
++out_mem:
++	free_ve_tty_drivers(ve);
++	return -ENOMEM;
++}
++
++static void free_ve_tty_drivers(struct ve_struct* ve)
++{
++#ifdef CONFIG_LEGACY_PTYS
++	free_ve_tty_driver(ve->pty_driver);
++	free_ve_tty_driver(ve->pty_slave_driver);
++	ve->pty_driver = ve->pty_slave_driver = NULL;
++#endif	
++#ifdef CONFIG_UNIX98_PTYS
++	free_ve_tty_driver(ve->ptm_driver);
++	free_ve_tty_driver(ve->pts_driver);
++	kfree(ve->allocated_ptys);
++	ve->ptm_driver = ve->pts_driver = NULL;
++	ve->allocated_ptys = NULL;
++#endif
++}
++
++static inline void __register_tty_driver(struct tty_driver *driver)
++{
++	list_add(&driver->tty_drivers, &tty_drivers);
++}
++
++static inline void __unregister_tty_driver(struct tty_driver *driver)
++{
++	if (!driver)
++		return;
++	list_del(&driver->tty_drivers);
++}
++
++static int register_ve_tty_drivers(struct ve_struct* ve)
++{
++	mutex_lock(&tty_mutex);
++#ifdef CONFIG_UNIX98_PTYS
++	__register_tty_driver(ve->ptm_driver);
++	__register_tty_driver(ve->pts_driver);
++#endif
++#ifdef CONFIG_LEGACY_PTYS
++	__register_tty_driver(ve->pty_driver);
++	__register_tty_driver(ve->pty_slave_driver);
++#endif	
++	mutex_unlock(&tty_mutex);
++
++	return 0;
++}
++
++static void unregister_ve_tty_drivers(struct ve_struct* ve)
++{
++	VZTRACE("unregister_ve_tty_drivers\n");
++
++	mutex_lock(&tty_mutex);
++#ifdef CONFIG_LEGACY_PTYS
++	__unregister_tty_driver(ve->pty_driver);
++	__unregister_tty_driver(ve->pty_slave_driver);
++#endif
++#ifdef CONFIG_UNIX98_PTYS
++	__unregister_tty_driver(ve->ptm_driver);
++	__unregister_tty_driver(ve->pts_driver);
++#endif
++	mutex_unlock(&tty_mutex);
++}
++
++static int init_ve_tty_drivers(struct ve_struct *ve)
++{
++	int err;
++
++	if ((err = alloc_ve_tty_drivers(ve)))
++		goto err_ttyalloc;
++	if ((err = register_ve_tty_drivers(ve)))
++		goto err_ttyreg;
++	return 0;
++
++err_ttyreg:
++	free_ve_tty_drivers(ve);
++err_ttyalloc:
++	return err;
++}
++
++static void fini_ve_tty_drivers(struct ve_struct *ve)
++{
++	unregister_ve_tty_drivers(ve);
++	free_ve_tty_drivers(ve);
++}
++
++/*
++ * Free the termios and termios_locked structures because
++ * we don't want to get memory leaks when modular tty
++ * drivers are removed from the kernel.
++ */
++static void clear_termios(struct tty_driver *driver)
++{
++	int i;
++	struct ktermios *tp;
++
++	if (driver->termios == NULL)
++		return;
++	for (i = 0; i < driver->num; i++) {
++		tp = driver->termios[i];
++		if (tp) {
++			driver->termios[i] = NULL;
++			kfree(tp);
++		}
++		tp = driver->termios_locked[i];
++		if (tp) {
++			driver->termios_locked[i] = NULL;
++			kfree(tp);
++		}
++	}
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * Pieces of VE network
++ *
++ **********************************************************************
++ **********************************************************************/
++
++#ifdef CONFIG_NET
++#include <asm/uaccess.h>
++#include <net/sock.h>
++#include <linux/netlink.h>
++#include <linux/rtnetlink.h>
++#include <net/route.h>
++#include <net/ip_fib.h>
++#endif
++
++static int ve_dev_add(envid_t veid, char *dev_name)
++{
++	struct net_device *dev;
++	struct ve_struct *dst_ve;
++	struct net *dst_net;
++	int err = -ESRCH;
++
++	dst_ve = get_ve_by_id(veid);
++	if (dst_ve == NULL)
++		goto out;
++
++	dst_net = dst_ve->ve_netns;
++
++	rtnl_lock();
++	read_lock(&dev_base_lock);
++	dev = __dev_get_by_name(&init_net, dev_name);
++	read_unlock(&dev_base_lock);
++	if (dev == NULL)
++		goto out_unlock;
++
++	err = __dev_change_net_namespace(dev, dst_net, dev_name,
++					get_ve0(), dst_ve, get_exec_ub());
++out_unlock:
++	rtnl_unlock();
++	real_put_ve(dst_ve);
++
++	if (dev == NULL)
++		printk(KERN_WARNING "%s: device %s not found\n",
++			__func__, dev_name);
++out:
++	return err;
++}
++
++static int ve_dev_del(envid_t veid, char *dev_name)
++{
++	struct net_device *dev;
++	struct ve_struct *src_ve;
++	struct net *src_net;
++	int err = -ESRCH;
++
++	src_ve = get_ve_by_id(veid);
++	if (src_ve == NULL)
++		goto out;
++
++	src_net = src_ve->ve_netns;
++
++	rtnl_lock();
++
++	read_lock(&dev_base_lock);
++	dev = __dev_get_by_name(src_net, dev_name);
++	read_unlock(&dev_base_lock);
++	if (dev == NULL)
++		goto out_unlock;
++
++	err = __dev_change_net_namespace(dev, &init_net, dev_name,
++				src_ve, get_ve0(), netdev_bc(dev)->owner_ub);
++out_unlock:
++	rtnl_unlock();
++	real_put_ve(src_ve);
++
++	if (dev == NULL)
++		printk(KERN_WARNING "%s: device %s not found\n",
++			__func__, dev_name);
++out:
++	return err;
++}
++
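++/*
++ * VE_NETDEV_ADD moves a host device from init_net into the VE's
++ * namespace, charging the caller's beancounter; VE_NETDEV_DEL moves
++ * it back, handing it to netdev_bc(dev)->owner_ub.
++ */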
++int real_ve_dev_map(envid_t veid, int op, char *dev_name)
++{
++	if (!capable(CAP_SETVEID))
++		return -EPERM;
++	switch (op) {
++	case VE_NETDEV_ADD:
++		return ve_dev_add(veid, dev_name);
++	case VE_NETDEV_DEL:
++		return ve_dev_del(veid, dev_name);
++	default:
++		return -EINVAL;
++	}
++}
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE information via /proc
++ *
++ **********************************************************************
++ **********************************************************************/
++#ifdef CONFIG_PROC_FS
++#if BITS_PER_LONG == 32
++#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
++#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
++#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
++#else
++#define VESTAT_LINE_WIDTH (12 * 21)
++#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
++#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
++#endif
++
++static int vestat_seq_show(struct seq_file *m, void *v)
++{
++	struct list_head *entry;
++	struct ve_struct *ve;
++	struct ve_struct *curve;
++	int cpu;
++	unsigned long user_ve, nice_ve, system_ve;
++	unsigned long long uptime;
++	cycles_t uptime_cycles, idle_time, strv_time, used;
++
++	entry = (struct list_head *)v;
++	ve = list_entry(entry, struct ve_struct, ve_list);
++
++	curve = get_exec_env();
++	if (entry == ve_list_head.next ||
++	    (!ve_is_super(curve) && ve == curve)) {
++		/* print header */
++		seq_printf(m, "%-*s\n",
++			VESTAT_LINE_WIDTH - 1,
++			"Version: 2.2");
++		seq_printf(m, VESTAT_HEAD_FMT, "VEID",
++					"user", "nice", "system",
++					"uptime", "idle",
++					"strv", "uptime", "used",
++					"maxlat", "totlat", "numsched");
++	}
++
++	if (ve == get_ve0())
++		return 0;
++
++	user_ve = nice_ve = system_ve = 0;
++	idle_time = strv_time = used = 0;
++
++	for_each_online_cpu(cpu) {
++		struct ve_cpu_stats *st;
++
++		st = VE_CPU_STATS(ve, cpu);
++		user_ve += st->user;
++		nice_ve += st->nice;
++		system_ve += st->system;
++		used += st->used_time;
++		idle_time += ve_sched_get_idle_time(ve, cpu);
++	}
++	uptime_cycles = get_cycles() - ve->start_cycles;
++	uptime = get_jiffies_64() - ve->start_jiffies;
++
++	seq_printf(m, VESTAT_LINE_FMT, ve->veid,
++				user_ve, nice_ve, system_ve,
++				(unsigned long long)uptime,
++				(unsigned long long)idle_time, 
++				(unsigned long long)strv_time,
++				(unsigned long long)uptime_cycles,
++				(unsigned long long)used,
++				(unsigned long long)ve->sched_lat_ve.last.maxlat,
++				(unsigned long long)ve->sched_lat_ve.last.totlat,
++				ve->sched_lat_ve.last.count);
++	return 0;
++}
++
++void *ve_seq_start(struct seq_file *m, loff_t *pos)
++{
++	struct ve_struct *curve;
++
++	curve = get_exec_env();
++	read_lock(&ve_list_lock);
++	if (!ve_is_super(curve)) {
++		if (*pos != 0)
++			return NULL;
++		return curve;
++	}
++
++	return seq_list_start(&ve_list_head, *pos);
++}
++EXPORT_SYMBOL(ve_seq_start);
++
++void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	if (!ve_is_super(get_exec_env()))
++		return NULL;
++	else
++		return seq_list_next(v, &ve_list_head, pos);
++}
++EXPORT_SYMBOL(ve_seq_next);
++
++void ve_seq_stop(struct seq_file *m, void *v)
++{
++	read_unlock(&ve_list_lock);
++}
++EXPORT_SYMBOL(ve_seq_stop);
++
++static struct seq_operations vestat_seq_op = {
++	.start	= ve_seq_start,
++	.next	= ve_seq_next,
++	.stop	= ve_seq_stop,
++	.show	= vestat_seq_show
++};
++
++static int vestat_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &vestat_seq_op);
++}
++
++static struct file_operations proc_vestat_operations = {
++	.open	 = vestat_open,
++	.read	 = seq_read,
++	.llseek	 = seq_lseek,
++	.release = seq_release
++};
++
++static struct seq_operations devperms_seq_op = {
++	.start  = ve_seq_start,
++	.next   = ve_seq_next,
++	.stop   = ve_seq_stop,
++	.show   = devperms_seq_show,
++};
++
++static int devperms_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &devperms_seq_op);
++}
++
++static struct file_operations proc_devperms_ops = {
++	.open           = devperms_open,
++	.read           = seq_read,
++	.llseek         = seq_lseek,
++	.release        = seq_release,
++};
++
++static int vz_version_show(struct seq_file *file, void* v)
++{
++	static const char ver[] = VZVERSION "\n";
++
++	return seq_puts(file, ver);
++}
++
++static int vz_version_open(struct inode *inode, struct file *file)
++{
++	return single_open(file, vz_version_show, NULL);
++}
++
++static struct file_operations proc_vz_version_operations = {
++	.open    = vz_version_open,
++	.read    = seq_read,
++	.llseek  = seq_lseek,
++	.release = single_release,
++};
++
++static inline unsigned long ve_used_mem(struct user_beancounter *ub)
++{
++	extern int glob_ve_meminfo;
++	return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held :
++				 ub->ub_parms[UB_PRIVVMPAGES].held ;
++}
++
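++/*
++ * ve_mi_replace(): a worked example (hypothetical numbers). With
++ * meminfo_val set to 262144 pages on a node with more RAM than that,
++ * and 100000 pages held by the beancounter, the CT sees
++ * totalram = 262144 and freeram = 162144; freeram is clamped at 0
++ * if usage exceeds the limit.
++ */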
++static inline void ve_mi_replace(struct meminfo *mi)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	struct user_beancounter *ub;
++	unsigned long meminfo_val;
++	unsigned long nodettram;
++	unsigned long usedmem;
++
++	meminfo_val = get_exec_env()->meminfo_val;
++
++	if (!meminfo_val)
++		return; /* No virtualization */
++
++	nodettram = mi->si.totalram;
++	ub = current->mm->mm_ub;
++	usedmem = ve_used_mem(ub);
++
++	memset(mi, 0, sizeof(*mi));
++
++	mi->si.totalram = (meminfo_val > nodettram) ?
++			nodettram : meminfo_val;
++	mi->si.freeram = (mi->si.totalram > usedmem) ?
++			(mi->si.totalram - usedmem) : 0;
++#else
++	return;
++#endif
++}
++
++static int meminfo_call(struct vnotifier_block *self,
++                unsigned long event, void *arg, int old_ret)
++{
++	if (event != VIRTINFO_MEMINFO)
++		return old_ret;
++
++	ve_mi_replace((struct meminfo *)arg);
++
++	return NOTIFY_OK;
++}
++
++
++static struct vnotifier_block meminfo_notifier_block = {
++	.notifier_call = meminfo_call
++};
++
++static int __init init_vecalls_proc(void)
++{
++	struct proc_dir_entry *de;
++
++	de = proc_create("vestat", S_IFREG | S_IRUSR, proc_vz_dir,
++			&proc_vestat_operations);
++	if (!de)
++		printk(KERN_WARNING "VZMON: can't make vestat proc entry\n");
++
++	de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
++			&proc_devperms_ops);
++	if (!de)
++		printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
++
++	de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
++			&proc_vz_version_operations);
++	if (!de)
++		printk(KERN_WARNING "VZMON: can't make version proc entry\n");
++
++	virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block);
++	return 0;
++}
++
++static void fini_vecalls_proc(void)
++{
++	remove_proc_entry("version", proc_vz_dir);
++	remove_proc_entry("devperms", proc_vz_dir);
++	remove_proc_entry("vestat", proc_vz_dir);
++	virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block);
++}
++#else
++#define init_vecalls_proc()	(0)
++#define fini_vecalls_proc()	do { } while (0)
++#endif /* CONFIG_PROC_FS */
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * User ctl
++ *
++ **********************************************************************
++ **********************************************************************/
++
++int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	int err;
++
++	err = -ENOTTY;
++	switch(cmd) {
++	    case VZCTL_MARK_ENV_TO_DOWN: {
++			/* Compatibility issue */
++			err = 0;
++		}
++		break;
++	    case VZCTL_SETDEVPERMS: {
++			/* Device type was mistakenly declared as dev_t
++			 * in the old user-kernel interface.
++			 * That's wrong, dev_t is a kernel internal type.
++			 * I use `unsigned' not having anything better in mind.
++			 * 2001/08/11  SAW  */
++			struct vzctl_setdevperms s;
++			err = -EFAULT;
++			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++				break;
++			err = real_setdevperms(s.veid, s.type,
++					new_decode_dev(s.dev), s.mask);
++		}
++		break;
++#ifdef CONFIG_INET
++	    case VZCTL_VE_NETDEV: {
++			struct vzctl_ve_netdev d;
++			char *s;
++			err = -EFAULT;
++			if (copy_from_user(&d, (void __user *)arg, sizeof(d)))
++				break;
++			err = -ENOMEM;
++			s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
++			if (s == NULL)
++				break;
++			err = -EFAULT;
++			if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
++				s[IFNAMSIZ] = 0;
++				err = real_ve_dev_map(d.veid, d.op, s);
++			}
++			kfree(s);
++		}
++		break;
++#endif
++	    case VZCTL_ENV_CREATE: {
++			struct vzctl_env_create s;
++			err = -EFAULT;
++			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++				break;
++			err = real_env_create(s.veid, s.flags, s.class_id,
++				NULL, 0);
++		}
++		break;
++	    case VZCTL_ENV_CREATE_DATA: {
++			struct vzctl_env_create_data s;
++			env_create_param_t *data;
++			err = -EFAULT;
++			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++				break;
++			err = -EINVAL;
++			if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN ||
++			    s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN ||
++			    s.data == 0)
++				break;
++			err = -ENOMEM;
++			data = kzalloc(sizeof(*data), GFP_KERNEL);
++			if (!data)
++				break;
++
++			err = -EFAULT;
++			if (copy_from_user(data, (void __user *)s.data,
++						s.datalen))
++				goto free_data;
++			err = real_env_create(s.veid, s.flags, s.class_id,
++				data, s.datalen);
++free_data:
++			kfree(data);
++		}
++		break;
++	    case VZCTL_GET_CPU_STAT: {
++			struct vzctl_cpustatctl s;
++			err = -EFAULT;
++			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++				break;
++			err = ve_get_cpu_stat(s.veid, s.cpustat);
++		}
++		break;
++	    case VZCTL_VE_MEMINFO: {
++			struct vzctl_ve_meminfo s;
++			err = -EFAULT;
++			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
++				break;
++			err = ve_set_meminfo(s.veid, s.val);
++		}
++		break;
++	}
++	return err;
++}
++
++#ifdef CONFIG_COMPAT
++int compat_vzcalls_ioctl(struct file *file, unsigned int cmd,
++		unsigned long arg)
++{
++	int err;
++
++	switch(cmd) {
++	case VZCTL_GET_CPU_STAT: {
++		/* FIXME: no compat translation yet; forward unchanged
++		 * instead of falling through into the next case. */
++		err = vzcalls_ioctl(file, cmd, arg);
++		break;
++	}
++	case VZCTL_COMPAT_ENV_CREATE_DATA: {
++		struct compat_vzctl_env_create_data cs;
++		struct vzctl_env_create_data __user *s;
++
++		s = compat_alloc_user_space(sizeof(*s));
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
++			break;
++
++		if (put_user(cs.veid, &s->veid) ||
++		    put_user(cs.flags, &s->flags) ||
++		    put_user(cs.class_id, &s->class_id) ||
++		    put_user(compat_ptr(cs.data), &s->data) ||
++		    put_user(cs.datalen, &s->datalen))
++			break;
++		err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA,
++						(unsigned long)s);
++		break;
++	}
++#ifdef CONFIG_NET
++	case VZCTL_COMPAT_VE_NETDEV: {
++		struct compat_vzctl_ve_netdev cs;
++		struct vzctl_ve_netdev __user *s;
++
++		s = compat_alloc_user_space(sizeof(*s));
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
++			break;
++
++		if (put_user(cs.veid, &s->veid) ||
++		    put_user(cs.op, &s->op) ||
++		    put_user(compat_ptr(cs.dev_name), &s->dev_name))
++			break;
++		err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s);
++		break;
++	}
++#endif
++	case VZCTL_COMPAT_VE_MEMINFO: {
++		struct compat_vzctl_ve_meminfo cs;
++		err = -EFAULT;
++		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
++			break;
++		err = ve_set_meminfo(cs.veid, cs.val);
++		break;
++	}
++	default:
++		err = vzcalls_ioctl(file, cmd, arg);
++		break;
++	}
++	return err;
++}
++#endif
++
++static struct vzioctlinfo vzcalls = {
++	.type		= VZCTLTYPE,
++	.ioctl		= vzcalls_ioctl,
++#ifdef CONFIG_COMPAT
++	.compat_ioctl	= compat_vzcalls_ioctl,
++#endif
++	.owner		= THIS_MODULE,
++};
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * Init/exit stuff
++ *
++ **********************************************************************
++ **********************************************************************/
++
++static int __init init_vecalls_symbols(void)
++{
++	KSYMRESOLVE(real_do_env_free);
++	KSYMMODRESOLVE(vzmon);
++	return 0;
++}
++
++static void fini_vecalls_symbols(void)
++{
++	KSYMMODUNRESOLVE(vzmon);
++	KSYMUNRESOLVE(real_do_env_free);
++}
++
++static inline __init int init_vecalls_ioctls(void)
++{
++	vzioctl_register(&vzcalls);
++	return 0;
++}
++
++static inline void fini_vecalls_ioctls(void)
++{
++	vzioctl_unregister(&vzcalls);
++}
++
++#ifdef CONFIG_SYSCTL
++static struct ctl_table_header *table_header;
++
++static ctl_table kernel_table[] = {
++	{
++		.procname	= "ve_allow_kthreads",
++		.data		= &ve_allow_kthreads,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= &proc_dointvec,
++	},
++	{ 0 }
++};
++
++static ctl_table root_table[] =  {
++	{CTL_KERN, "kernel",  NULL, 0, 0555, kernel_table},
++	{ 0 }
++};
++
++static int init_vecalls_sysctl(void)
++{
++	table_header = register_sysctl_table(root_table);
++	if (!table_header)
++		return -ENOMEM ;
++	return 0;
++}
++
++static void fini_vecalls_sysctl(void)
++{
++	unregister_sysctl_table(table_header);
++} 
++#else
++static int init_vecalls_sysctl(void) { return 0; }
++static void fini_vecalls_sysctl(void) { ; }
++#endif
++
++static int __init vecalls_init(void)
++{
++	int err;
++
++	err = init_vecalls_sysctl();
++	if (err)
++		goto out_vzmond;
++
++	err = init_vzmond();
++	if (err < 0)
++		goto out_sysctl;
++
++	err = init_vecalls_symbols();
++	if (err < 0)
++		goto out_sym;
++
++	err = init_vecalls_proc();
++	if (err < 0)
++		goto out_proc;
++
++	err = init_vecalls_ioctls();
++	if (err < 0)
++		goto out_ioctls;
++
++	return 0;
++
++out_ioctls:
++	fini_vecalls_proc();
++out_proc:
++	fini_vecalls_symbols();
++out_sym:
++	fini_vzmond();
++out_sysctl:
++	fini_vecalls_sysctl();
++out_vzmond:
++	return err;
++}
++
++static void vecalls_exit(void)
++{
++	fini_vecalls_ioctls();
++	fini_vecalls_proc();
++	fini_vecalls_symbols();
++	fini_vzmond();
++	fini_vecalls_sysctl();
++}
++
++MODULE_AUTHOR("SWsoft <info at sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Control");
++MODULE_LICENSE("GPL v2");
++
++module_init(vecalls_init)
++module_exit(vecalls_exit)
+diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
+new file mode 100644
+index 0000000..8774e9c
+--- /dev/null
++++ b/kernel/ve/veowner.c
+@@ -0,0 +1,149 @@
++/*
++ *  kernel/ve/veowner.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *  
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/ipc.h>
++#include <linux/fs.h>
++#include <linux/proc_fs.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <linux/vmalloc.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inetdevice.h>
++#include <linux/pid_namespace.h>
++#include <asm/system.h>
++#include <asm/io.h>
++
++#include <net/tcp.h>
++
++void prepare_ve0_process(struct task_struct *tsk)
++{
++	VE_TASK_INFO(tsk)->exec_env = get_ve0();
++	VE_TASK_INFO(tsk)->owner_env = get_ve0();
++	VE_TASK_INFO(tsk)->sleep_time = 0;
++	VE_TASK_INFO(tsk)->wakeup_stamp = 0;
++	VE_TASK_INFO(tsk)->sched_time = 0;
++	seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock);
++
++	if (tsk->pid) {
++		list_add_rcu(&tsk->ve_task_info.vetask_list,
++				&get_ve0()->vetask_lh);
++		atomic_inc(&get_ve0()->pcounter);
++	}
++}
++
++/*
++ * ------------------------------------------------------------------------
++ * proc entries
++ * ------------------------------------------------------------------------
++ */
++
++#ifdef CONFIG_PROC_FS
++struct proc_dir_entry *proc_vz_dir;
++EXPORT_SYMBOL(proc_vz_dir);
++
++struct proc_dir_entry *glob_proc_vz_dir;
++EXPORT_SYMBOL(glob_proc_vz_dir);
++
++static void prepare_proc(void)
++{
++	proc_vz_dir = proc_mkdir("vz", NULL);
++	if (!proc_vz_dir)
++		panic("Can't create /proc/vz dir\n");
++
++	glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root);
++	if (!glob_proc_vz_dir)
++		panic("Can't create global /proc/vz dir\n");
++}
++#endif
++
++/*
++ * ------------------------------------------------------------------------
++ * OpenVZ sysctl
++ * ------------------------------------------------------------------------
++ */
++extern int ve_area_access_check;
++
++#ifdef CONFIG_INET
++static struct ctl_table vz_ipv4_route_table[] = {
++	{
++		.procname	= "src_check",
++		.data		= &ip_rt_src_check,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{ 0 }
++};
++
++static struct ctl_path net_ipv4_route_path[] = {
++	{ .ctl_name = CTL_NET, .procname = "net", },
++	{ .ctl_name = NET_IPV4, .procname = "ipv4", },
++	{ .ctl_name = NET_IPV4_ROUTE, .procname = "route", },
++	{ }
++};
++#endif
++
++static struct ctl_table vz_fs_table[] = {
++	{
++		.procname	= "ve-area-access-check",
++		.data		= &ve_area_access_check,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{ 0 }
++};
++
++static struct ctl_path fs_path[] = {
++	{ .ctl_name = CTL_FS, .procname = "fs", },
++	{ }
++};
++
++static void prepare_sysctl(void)
++{
++#ifdef CONFIG_INET
++	register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table);
++#endif
++	register_sysctl_paths(fs_path, vz_fs_table);
++}
++
++/*
++ * ------------------------------------------------------------------------
++ * XXX init_ve_system
++ * ------------------------------------------------------------------------
++ */
++
++void init_ve_system(void)
++{
++	struct task_struct *init_entry;
++	struct ve_struct *ve;
++
++	ve = get_ve0();
++
++	init_entry = init_pid_ns.child_reaper;
++	/* If a ve_move_task() into VE0 occurs (e.g. in the cpt code),
++	 * ve_cap_bset must already be valid on VE0. */
++	ve->ve_cap_bset = CAP_INIT_EFF_SET;
++
++	read_lock(&init_entry->fs->lock);
++	ve->root_path = init_entry->fs->root;
++	read_unlock(&init_entry->fs->lock);
++
++#ifdef CONFIG_PROC_FS
++	prepare_proc();
++#endif
++	prepare_sysctl();
++}
+diff --git a/kernel/ve/vzdev.c b/kernel/ve/vzdev.c
+new file mode 100644
+index 0000000..b2f010c
+--- /dev/null
++++ b/kernel/ve/vzdev.c
+@@ -0,0 +1,154 @@
++/*
++ *  kernel/ve/vzdev.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/fs.h>
++#include <linux/list.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/vzctl.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/vzcalluser.h>
++#include <asm/uaccess.h>
++#include <asm/pgalloc.h>
++#include <linux/device.h>
++#include <linux/smp_lock.h>
++
++#define VZCTL_MAJOR 126
++#define VZCTL_NAME "vzctl"
++
++MODULE_AUTHOR("SWsoft <info at sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Interface");
++MODULE_LICENSE("GPL v2");
++
++static LIST_HEAD(ioctls);
++static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED;
++
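++/*
++ * Find the registered handler whose ioctl type matches cmd.  The list
++ * walk is done under ioctl_lock, and a reference is taken on the owning
++ * module so it cannot be unloaded while the ioctl is being dispatched;
++ * vzctl_put_handler() drops that reference.
++ */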
++static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd)
++{
++	struct vzioctlinfo *h;
++
++	spin_lock(&ioctl_lock);
++	list_for_each_entry(h, &ioctls, list) {
++		if (h->type == _IOC_TYPE(cmd))
++			goto found;
++	}
++	h = NULL;
++found:
++	if (h && !try_module_get(h->owner))
++		h = NULL;
++	spin_unlock(&ioctl_lock);
++	return h;
++}
++
++static void vzctl_put_handler(struct vzioctlinfo *h)
++{
++	if (!h)
++		return;
++
++	module_put(h->owner);
++}
++
++long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	struct vzioctlinfo *h;
++	int err;
++
++	err = -ENOTTY;
++	h = vzctl_get_handler(cmd);
++	if (h && h->ioctl)
++		err = (*h->ioctl)(file, cmd, arg);
++	vzctl_put_handler(h);
++
++	return err;
++}
++
++long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
++{
++	struct vzioctlinfo *h;
++	int err;
++
++	err = -ENOIOCTLCMD;
++	h = vzctl_get_handler(cmd);
++	if (h && h->compat_ioctl)
++		err = (*h->compat_ioctl)(file, cmd, arg);
++	vzctl_put_handler(h);
++
++	return err;
++}
++
++void vzioctl_register(struct vzioctlinfo *inf)
++{
++	spin_lock(&ioctl_lock);
++	list_add(&inf->list, &ioctls);
++	spin_unlock(&ioctl_lock);
++}
++EXPORT_SYMBOL(vzioctl_register);
++
++void vzioctl_unregister(struct vzioctlinfo *inf)
++{
++	spin_lock(&ioctl_lock);
++	list_del_init(&inf->list);
++	spin_unlock(&ioctl_lock);
++}
++EXPORT_SYMBOL(vzioctl_unregister);
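++/*
++ * Minimal usage sketch (my_calls, my_ioctl and MY_TYPE are hypothetical,
++ * not part of this patch): a client module fills in a vzioctlinfo with
++ * its own unique _IOC_TYPE and registers it, much as the vzcalls module
++ * in kernel/ve/vecalls.c does:
++ *
++ *	static struct vzioctlinfo my_calls = {
++ *		.type	= MY_TYPE,
++ *		.ioctl	= my_ioctl,
++ *		.owner	= THIS_MODULE,
++ *	};
++ *
++ *	vzioctl_register(&my_calls);	- at module init
++ *	vzioctl_unregister(&my_calls);	- at module exit
++ */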
++
++/*
++ * Init/exit stuff.
++ */
++static struct file_operations vzctl_fops = {
++	.owner		= THIS_MODULE,
++	.unlocked_ioctl	= vzctl_ioctl,
++	.compat_ioctl	= compat_vzctl_ioctl,
++};
++
++static struct class *vzctl_class;
++
++static void __exit vzctl_exit(void)
++{
++	device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
++	class_destroy(vzctl_class);
++	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
++}
++
++static int __init vzctl_init(void)
++{
++	int ret;
++	struct device *class_err;
++
++	ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
++	if (ret < 0)
++		goto out;
++
++	vzctl_class = class_create(THIS_MODULE, "vzctl");
++	if (IS_ERR(vzctl_class)) {
++		ret = PTR_ERR(vzctl_class);
++		goto out_cleandev;
++	}
++
++	class_err = device_create(vzctl_class, NULL,
++			MKDEV(VZCTL_MAJOR, 0), VZCTL_NAME);
++	if (IS_ERR(class_err)) {
++		ret = PTR_ERR(class_err);
++		goto out_rmclass;
++	}
++
++	goto out;
++
++out_rmclass:
++	class_destroy(vzctl_class);
++out_cleandev:
++	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
++out:
++	return ret;
++}
++
++module_init(vzctl_init);
++module_exit(vzctl_exit);
+diff --git a/kernel/ve/vzevent.c b/kernel/ve/vzevent.c
+new file mode 100644
+index 0000000..554f169
+--- /dev/null
++++ b/kernel/ve/vzevent.c
+@@ -0,0 +1,125 @@
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <net/sock.h>
++#include <linux/netlink.h>
++#include <linux/errno.h>
++#include <linux/ve_proto.h>
++#include <linux/vzevent.h>
++
++#define NETLINK_UEVENT	31
++#define VZ_EVGRP_ALL	0x01
++
++/*
++ * NOTE: the original idea was to send events via kobject_uevent();
++ * however, it turned out to have unwanted side effects, such as
++ * starting /sbin/hotplug, which reacts to our events inappropriately.
++ */
++
++static struct sock *vzev_sock;
++
++static char *action_to_string(int action)
++{
++	switch (action) {
++	case KOBJ_MOUNT:
++		return "ve-mount";
++	case KOBJ_UMOUNT:
++		return "ve-umount";
++	case KOBJ_START:
++		return "ve-start";
++	case KOBJ_STOP:
++		return "ve-stop";
++	default:
++		return NULL;
++	}
++}
++
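++/*
++ * The broadcast message is "<action>@<attrs>", e.g. "ve-start@101" when
++ * VE 101 starts (the veid here is only an example), sent to every
++ * listener in the VZ_EVGRP_ALL netlink group.
++ */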
++static int do_vzevent_send(int event, char *msg, int len)
++{
++	struct sk_buff *skb;
++	char *buf, *action;
++	int alen;
++
++	action = action_to_string(event);
++	if (!action)
++		return -EINVAL;	/* unknown event: nothing to broadcast */
++	alen = strlen(action);
++
++	skb = alloc_skb(len + 1 + alen, GFP_KERNEL);
++	if (!skb)
++		return -ENOMEM;
++
++	buf = skb_put(skb, len + 1 + alen);
++	memcpy(buf, action, alen);
++	buf[alen] = '@';
++	memcpy(buf + alen + 1, msg, len);
++	(void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL);
++	return 0;
++}
++
++int vzevent_send(int event, const char *attrs_fmt, ...)
++{
++	va_list args;
++	int len, err;
++	struct ve_struct *ve;
++	char *page;
++
++	err = -ENOMEM;
++	page = (char *)__get_free_page(GFP_KERNEL);
++	if (!page)
++		goto out;
++
++	va_start(args, attrs_fmt);
++	len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args);
++	va_end(args);
++
++	ve = set_exec_env(get_ve0());
++	err = do_vzevent_send(event, page, len);
++	(void)set_exec_env(ve);
++	free_page((unsigned long)page);
++out:
++	return err;
++}
++EXPORT_SYMBOL(vzevent_send);
++
++static int ve_start(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	vzevent_send(KOBJ_START, "%d", ve->veid);
++	return 0;
++}
++
++static void ve_stop(void *data)
++{
++	struct ve_struct *ve;
++
++	ve = (struct ve_struct *)data;
++	vzevent_send(KOBJ_STOP, "%d", ve->veid);
++}
++
++static struct ve_hook ve_start_stop_hook = {
++	.init		= ve_start,
++	.fini		= ve_stop,
++	.owner		= THIS_MODULE,
++	.priority	= HOOK_PRIO_AFTERALL,
++};
++
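++/*
++ * A private netlink protocol number (NETLINK_UEVENT, 31) is used instead
++ * of the generic uevent socket, for the reason given in the NOTE above.
++ */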
++static int __init init_vzevent(void)
++{
++	vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE);
++	if (vzev_sock == NULL)
++		return -ENOMEM;
++	ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook);
++	return 0;
++}
++
++static void __exit exit_vzevent(void)
++{
++	ve_hook_unregister(&ve_start_stop_hook);
++	sock_release(vzev_sock->sk_socket);
++}
++
++MODULE_LICENSE("GPL");
++
++module_init(init_vzevent);
++module_exit(exit_vzevent);
+diff --git a/kernel/ve/vzwdog.c b/kernel/ve/vzwdog.c
+new file mode 100644
+index 0000000..7117365
+--- /dev/null
++++ b/kernel/ve/vzwdog.c
+@@ -0,0 +1,283 @@
++/*
++ *  kernel/ve/vzwdog.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/list.h>
++#include <linux/ctype.h>
++#include <linux/kobject.h>
++#include <linux/genhd.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/kernel_stat.h>
++#include <linux/smp_lock.h>
++#include <linux/errno.h>
++#include <linux/suspend.h>
++#include <linux/ve.h>
++#include <linux/vzstat.h>
++#include <asm/uaccess.h>
++#include <linux/kthread.h>
++#include <linux/freezer.h>
++
++/* Stuff regarding the kernel thread that polls VE validity */
++static int sleep_timeout = 60;
++static struct task_struct *wdog_thread_tsk;
++
++extern void show_mem(void);
++
++static struct file *intr_file;
++static char page[PAGE_SIZE];
++
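++/*
++ * Re-emit the buffered /proc/interrupts contents, suppressing lines
++ * whose counter columns are all zero (interrupts that never fired).
++ */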
++static void parse_irq_list(int len)
++{
++	int i, k, skip;
++	for (i = 0; i < len; ) {
++		k = i;
++		while (i < len && page[i] != '\n' && page[i] != ':')
++			i++;
++		skip = 0;
++		if (i < len && page[i] != '\n') {
++			i++; /* skip ':' */
++			while (i < len && (page[i] == ' ' || page[i] == '0'))
++				i++;
++			skip = (i < len && (page[i] < '0' || page[i] > '9'));
++			while (i < len && page[i] != '\n')
++				i++;
++		}
++		if (!skip)
++			printk("%.*s\n", i - k, page + k);
++		if (i < len)
++			i++; /* skip '\n' */
++	}
++}
++
++extern loff_t vfs_llseek(struct file *file, loff_t, int);
++extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *);
++extern struct file *filp_open(const char *filename, int flags, int mode);
++extern int filp_close(struct file *filp, fl_owner_t id);
++static void show_irq_list(void)
++{
++	mm_segment_t fs;
++	int r;
++
++	fs = get_fs();
++	set_fs(KERNEL_DS);
++	vfs_llseek(intr_file, 0, 0);
++	r = vfs_read(intr_file, (void __user *)page, sizeof(page),
++			&intr_file->f_pos);
++	set_fs(fs);
++
++	if (r > 0)
++		parse_irq_list(r);
++}
++
++static void show_alloc_latency(void)
++{
++	static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
++		"A0",
++		"L0",
++		"H0",
++		"L1",
++		"H1"
++	};
++	int i;
++
++	printk("lat: ");
++	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
++		struct kstat_lat_struct *p;
++		cycles_t maxlat, avg0, avg1, avg2;
++
++		p = &kstat_glob.alloc_lat[i];
++		spin_lock_irq(&kstat_glb_lock);
++		maxlat = p->last.maxlat;
++		avg0 = p->avg[0];
++		avg1 = p->avg[1];
++		avg2 = p->avg[2];
++		spin_unlock_irq(&kstat_glb_lock);
++
++		printk("%s %Lu (%Lu %Lu %Lu)",
++				alloc_descr[i],
++				(unsigned long long)maxlat,
++				(unsigned long long)avg0,
++				(unsigned long long)avg1,
++				(unsigned long long)avg2);
++	}
++	printk("\n");
++}
++
++static void show_schedule_latency(void)
++{
++	struct kstat_lat_pcpu_struct *p;
++	cycles_t maxlat, totlat, avg0, avg1, avg2;
++	unsigned long count;
++
++	p = &kstat_glob.sched_lat;
++	spin_lock_irq(&kstat_glb_lock);
++	maxlat = p->last.maxlat;
++	totlat = p->last.totlat;
++	count = p->last.count;
++	avg0 = p->avg[0];
++	avg1 = p->avg[1];
++	avg2 = p->avg[2];
++	spin_unlock_irq(&kstat_glb_lock);
++
++	printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
++			(unsigned long long)maxlat,
++			(unsigned long long)totlat,
++			count,
++			(unsigned long long)avg0,
++			(unsigned long long)avg1,
++			(unsigned long long)avg2);
++}
++
++static void show_header(void)
++{
++	struct timeval tv;
++
++	do_gettimeofday(&tv);
++	preempt_disable();
++	printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
++			tv.tv_sec, (long)tv.tv_usec,
++			(unsigned long long)get_jiffies_64(),
++			smp_processor_id());
++#ifdef CONFIG_FAIRSCHED
++	printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n",
++			cycles_per_jiffy, HZ);
++#else
++	printk("*** jiffies_per_second %u ***\n", HZ);
++#endif
++	preempt_enable();
++}
++
++static void show_pgdatinfo(void)
++{
++	pg_data_t *pgdat;
++
++	printk("pgdat:");
++	for_each_online_pgdat(pgdat) {
++		printk(" %d: %lu,%lu,%lu",
++				pgdat->node_id,
++				pgdat->node_start_pfn,
++				pgdat->node_present_pages,
++				pgdat->node_spanned_pages);
++#ifdef CONFIG_FLAT_NODE_MEM_MAP
++		printk(",%p", pgdat->node_mem_map);
++#endif
++	}
++	printk("\n");
++}
++
++static void show_diskio(void)
++{
++	struct device *dev;
++	char buf[BDEVNAME_SIZE];
++
++	printk("disk_io: ");
++
++	list_for_each_entry(dev, &block_class.devices, node) {
++		char *name;
++		struct gendisk *gd = dev_to_disk(dev);
++
++		name = disk_name(gd, 0, buf);
++		if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
++		    isdigit(name[4]))
++			continue;
++		if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
++		    isdigit(name[3]))
++			continue;
++		printk("(%u,%u) %s r(%lu %lu %lu) w(%lu %lu %lu)\n",
++			gd->major, gd->first_minor,
++			name,
++			disk_stat_read(gd, ios[READ]),
++			disk_stat_read(gd, sectors[READ]),
++			disk_stat_read(gd, merges[READ]),
++			disk_stat_read(gd, ios[WRITE]),
++			disk_stat_read(gd, sectors[WRITE]),
++			disk_stat_read(gd, merges[WRITE]));
++	}
++
++	printk("\n");
++}
++
++static void show_nrprocs(void)
++{
++	unsigned long _nr_running, _nr_sleeping,
++			_nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
++
++	_nr_running = nr_running();
++	_nr_unint = nr_uninterruptible();
++	_nr_sleeping = nr_sleeping();
++	_nr_zombie = nr_zombie;
++	_nr_dead = atomic_read(&nr_dead);
++	_nr_stopped = nr_stopped();
++
++	printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
++		"Z %lu, X %lu, T %lu (tot %d)\n",
++		nr_ve,	_nr_running, _nr_sleeping, _nr_unint,
++		_nr_zombie, _nr_dead, _nr_stopped, nr_threads);
++}
++
++static void wdog_print(void)
++{
++	show_header();
++	show_irq_list();
++	show_pgdatinfo();
++	show_mem();
++	show_diskio();
++	show_schedule_latency();
++	show_alloc_latency();
++	show_nrprocs();
++}
++
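++/*
++ * Dump all of the statistics above every sleep_timeout seconds.  The
++ * task state is set before the kthread_should_stop() check so that a
++ * concurrent kthread_stop() wakeup cannot be missed.
++ */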
++static int wdog_loop(void* data)
++{
++	while (1) {
++		wdog_print();
++		try_to_freeze();
++
++		set_current_state(TASK_UNINTERRUPTIBLE);
++		if (kthread_should_stop())
++			break;
++		schedule_timeout(sleep_timeout*HZ);
++	}
++	return 0;
++}
++
++static int __init wdog_init(void)
++{
++	struct file *file;
++
++	file = filp_open("/proc/interrupts", 0, 0);
++	if (IS_ERR(file))
++		return PTR_ERR(file);
++	intr_file = file;
++
++	wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog");
++	if (IS_ERR(wdog_thread_tsk)) {
++		filp_close(intr_file, NULL);
++		return -EBUSY;
++	}
++	return 0;
++}
++
++static void __exit wdog_exit(void)
++{
++	kthread_stop(wdog_thread_tsk);
++	filp_close(intr_file, NULL);
++}
++
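++/*
++ * The poll interval is tunable at run time via
++ * /sys/module/vzwdog/parameters/sleep_timeout.
++ */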
++module_param(sleep_timeout, int, 0660);
++MODULE_AUTHOR("SWsoft <info at sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo WDOG");
++MODULE_LICENSE("GPL v2");
++
++module_init(wdog_init)
++module_exit(wdog_exit)
+diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
+index d2099f4..eefdf6c 100644
+--- a/lib/Kconfig.debug
++++ b/lib/Kconfig.debug
+@@ -125,6 +125,15 @@ config DEBUG_SECTION_MISMATCH
+ 	  - Enable verbose reporting from modpost to help solving
+ 	    the section mismatches reported.
+ 
++config SYSRQ_DEBUG
++	bool "Debugging via sysrq keys"
++	depends on MAGIC_SYSRQ
++	default y
++	help
++	  Say Y if you want to extend the functionality of the magic SysRq
++	  key. It provides additional debugging facilities such as dumping
++	  and writing memory, resolving symbols, and more.
++
+ config DEBUG_KERNEL
+ 	bool "Kernel debugging"
+ 	help
+diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
+index 2fa545a..fe9fa6a 100644
+--- a/lib/kobject_uevent.c
++++ b/lib/kobject_uevent.c
+@@ -38,6 +38,8 @@ static const char *kobject_actions[] = {
+ 	[KOBJ_REMOVE] =		"remove",
+ 	[KOBJ_CHANGE] =		"change",
+ 	[KOBJ_MOVE] =		"move",
++	[KOBJ_START] =		"start",
++	[KOBJ_STOP] =		"stop",
+ 	[KOBJ_ONLINE] =		"online",
+ 	[KOBJ_OFFLINE] =	"offline",
+ };
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 1e6a7d3..a49f9ea 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -42,6 +42,8 @@
+ 
+ #include <asm/mman.h>
+ 
++#include <bc/io_acct.h>
++
+ static ssize_t
+ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+ 	loff_t offset, unsigned long nr_segs);
+@@ -121,6 +123,7 @@ void __remove_from_page_cache(struct page *page)
+ 	mem_cgroup_uncharge_page(page);
+ 	radix_tree_delete(&mapping->page_tree, page->index);
+ 	page->mapping = NULL;
++	ub_io_release_debug(page);
+ 	mapping->nrpages--;
+ 	__dec_zone_page_state(page, NR_FILE_PAGES);
+ 	BUG_ON(page_mapped(page));
+diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
+index 3e744ab..974fe0d 100644
+--- a/mm/filemap_xip.c
++++ b/mm/filemap_xip.c
+@@ -16,6 +16,7 @@
+ #include <linux/sched.h>
+ #include <asm/tlbflush.h>
+ #include <asm/io.h>
++#include <bc/vmpages.h>
+ 
+ /*
+  * We do use our own empty page to avoid interference with other users
+@@ -190,6 +191,8 @@ __xip_unmap (struct address_space * mapping,
+ 			flush_cache_page(vma, address, pte_pfn(*pte));
+ 			pteval = ptep_clear_flush(vma, address, pte);
+ 			page_remove_rmap(page, vma);
++			pb_remove_ref(page, mm);
++			ub_unused_privvm_inc(mm, vma);
+ 			dec_mm_counter(mm, file_rss);
+ 			BUG_ON(pte_dirty(pteval));
+ 			pte_unmap_unlock(pte, ptl);
+diff --git a/mm/fremap.c b/mm/fremap.c
+index 07a9c82..e2733ba 100644
+--- a/mm/fremap.c
++++ b/mm/fremap.c
+@@ -20,6 +20,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ 
++#include <bc/vmpages.h>
++
+ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ 			unsigned long addr, pte_t *ptep)
+ {
+@@ -35,6 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ 			if (pte_dirty(pte))
+ 				set_page_dirty(page);
+ 			page_remove_rmap(page, vma);
++			pb_remove_ref(page, mm);
+ 			page_cache_release(page);
+ 			update_hiwater_rss(mm);
+ 			dec_mm_counter(mm, file_rss);
+@@ -61,8 +64,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	if (!pte)
+ 		goto out;
+ 
+-	if (!pte_none(*pte))
++	if (!pte_none(*pte)) {
+ 		zap_pte(mm, vma, addr, pte);
++		ub_unused_privvm_inc(mm, vma);
++	}
+ 
+ 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+ 	/*
+@@ -237,4 +242,5 @@ out:
+ 
+ 	return err;
+ }
++EXPORT_SYMBOL_GPL(sys_remap_file_pages);
+ 
+diff --git a/mm/memory.c b/mm/memory.c
+index 2302d22..06180fe 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -42,6 +42,7 @@
+ #include <linux/mm.h>
+ #include <linux/hugetlb.h>
+ #include <linux/mman.h>
++#include <linux/virtinfo.h>
+ #include <linux/swap.h>
+ #include <linux/highmem.h>
+ #include <linux/pagemap.h>
+@@ -51,6 +52,7 @@
+ #include <linux/init.h>
+ #include <linux/writeback.h>
+ #include <linux/memcontrol.h>
++#include <linux/vzstat.h>
+ 
+ #include <asm/pgalloc.h>
+ #include <asm/uaccess.h>
+@@ -61,6 +63,11 @@
+ #include <linux/swapops.h>
+ #include <linux/elf.h>
+ 
++#include <bc/beancounter.h>
++#include <bc/io_acct.h>
++#include <bc/kmem.h>
++#include <bc/vmpages.h>
++
+ #ifndef CONFIG_NEED_MULTIPLE_NODES
+ /* use the per-pgdat data instead for discontigmem - mbligh */
+ unsigned long max_mapnr;
+@@ -115,18 +122,21 @@ void pgd_clear_bad(pgd_t *pgd)
+ 	pgd_ERROR(*pgd);
+ 	pgd_clear(pgd);
+ }
++EXPORT_SYMBOL_GPL(pgd_clear_bad);
+ 
+ void pud_clear_bad(pud_t *pud)
+ {
+ 	pud_ERROR(*pud);
+ 	pud_clear(pud);
+ }
++EXPORT_SYMBOL_GPL(pud_clear_bad);
+ 
+ void pmd_clear_bad(pmd_t *pmd)
+ {
+ 	pmd_ERROR(*pmd);
+ 	pmd_clear(pmd);
+ }
++EXPORT_SYMBOL_GPL(pmd_clear_bad);
+ 
+ /*
+  * Note: this doesn't free the actual pages themselves. That
+@@ -337,6 +347,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+ 		pte_free(mm, new);
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(__pte_alloc);
+ 
+ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+ {
+@@ -477,6 +488,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ out:
+ 	return pfn_to_page(pfn);
+ }
++EXPORT_SYMBOL_GPL(vm_normal_page);
+ 
+ /*
+  * copy one vm_area from one task to the other. Assumes the page tables
+@@ -487,7 +499,7 @@ out:
+ static inline void
+ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+-		unsigned long addr, int *rss)
++		unsigned long addr, int *rss, struct page_beancounter **pbc)
+ {
+ 	unsigned long vm_flags = vma->vm_flags;
+ 	pte_t pte = *src_pte;
+@@ -542,6 +554,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ 	if (page) {
+ 		get_page(page);
+ 		page_dup_rmap(page, vma, addr);
++		pb_dup_ref(page, dst_mm, pbc);
+ 		rss[!!PageAnon(page)]++;
+ 	}
+ 
+@@ -549,20 +562,35 @@ out_set_pte:
+ 	set_pte_at(dst_mm, addr, dst_pte, pte);
+ }
+ 
++#define pte_ptrs(a)	(PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1)))
++#ifdef CONFIG_BEANCOUNTERS
++#define same_ub(mm1, mm2)      ((mm1)->mm_ub == (mm2)->mm_ub)
++#else
++#define same_ub(mm1, mm2)      1
++#endif
++
+ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+-		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
++		pmd_t *dst_pmd, pmd_t *src_pmd,
++		struct vm_area_struct *dst_vma,
++		struct vm_area_struct *vma,
+ 		unsigned long addr, unsigned long end)
+ {
+ 	pte_t *src_pte, *dst_pte;
+ 	spinlock_t *src_ptl, *dst_ptl;
+ 	int progress = 0;
+-	int rss[2];
++	int rss[2], rss_tot;
++	struct page_beancounter *pbc;
++	int err;
+ 
++	err = -ENOMEM;
++	pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL;
+ again:
++	if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr)))
++		goto out;
+ 	rss[1] = rss[0] = 0;
+ 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+ 	if (!dst_pte)
+-		return -ENOMEM;
++		goto out;
+ 	src_pte = pte_offset_map_nested(src_pmd, addr);
+ 	src_ptl = pte_lockptr(src_mm, src_pmd);
+ 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+@@ -583,23 +611,32 @@ again:
+ 			progress++;
+ 			continue;
+ 		}
+-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
++		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss,
++				&pbc);
+ 		progress += 8;
+ 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ 
+ 	arch_leave_lazy_mmu_mode();
+ 	spin_unlock(src_ptl);
+ 	pte_unmap_nested(src_pte - 1);
++	rss_tot = rss[0] + rss[1];
++	ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot);
+ 	add_mm_rss(dst_mm, rss[0], rss[1]);
+ 	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ 	cond_resched();
+ 	if (addr != end)
+ 		goto again;
+-	return 0;
++
++	err = 0;
++out:
++	pb_free_list(&pbc);
++	return err;
+ }
+ 
+ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+-		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
++		pud_t *dst_pud, pud_t *src_pud,
++		struct vm_area_struct *dst_vma,
++		struct vm_area_struct *vma,
+ 		unsigned long addr, unsigned long end)
+ {
+ 	pmd_t *src_pmd, *dst_pmd;
+@@ -614,14 +651,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
+ 		if (pmd_none_or_clear_bad(src_pmd))
+ 			continue;
+ 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+-						vma, addr, next))
++						dst_vma, vma, addr, next))
+ 			return -ENOMEM;
+ 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
+ 	return 0;
+ }
+ 
+ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+-		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
++		pgd_t *dst_pgd, pgd_t *src_pgd,
++		struct vm_area_struct *dst_vma,
++		struct vm_area_struct *vma,
+ 		unsigned long addr, unsigned long end)
+ {
+ 	pud_t *src_pud, *dst_pud;
+@@ -636,19 +675,21 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
+ 		if (pud_none_or_clear_bad(src_pud))
+ 			continue;
+ 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+-						vma, addr, next))
++						dst_vma, vma, addr, next))
+ 			return -ENOMEM;
+ 	} while (dst_pud++, src_pud++, addr = next, addr != end);
+ 	return 0;
+ }
+ 
+-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+-		struct vm_area_struct *vma)
++int __copy_page_range(struct vm_area_struct *dst_vma,
++		      struct vm_area_struct *vma,
++		      unsigned long addr, size_t size)
+ {
++	struct mm_struct *dst_mm = dst_vma->vm_mm;
++	struct mm_struct *src_mm = vma->vm_mm;
+ 	pgd_t *src_pgd, *dst_pgd;
+ 	unsigned long next;
+-	unsigned long addr = vma->vm_start;
+-	unsigned long end = vma->vm_end;
++	unsigned long end = addr + size;
+ 
+ 	/*
+ 	 * Don't copy ptes where a page fault will fill them correctly.
+@@ -671,11 +712,22 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ 		if (pgd_none_or_clear_bad(src_pgd))
+ 			continue;
+ 		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+-						vma, addr, next))
++						dst_vma, vma, addr, next))
+ 			return -ENOMEM;
+ 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(__copy_page_range);
++
++int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
++		    struct vm_area_struct *dst_vma, struct vm_area_struct *vma)
++{
++	BUG_ON(dst_vma->vm_mm != dst);
++	BUG_ON(vma->vm_mm != src);
++	return __copy_page_range(dst_vma, vma, vma->vm_start,
++			vma->vm_end - vma->vm_start);
++}
+ 
+ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ 				struct vm_area_struct *vma, pmd_t *pmd,
+@@ -687,6 +739,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ 	spinlock_t *ptl;
+ 	int file_rss = 0;
+ 	int anon_rss = 0;
++	int rss;
+ 
+ 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ 	arch_enter_lazy_mmu_mode();
+@@ -741,6 +794,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ 				file_rss--;
+ 			}
+ 			page_remove_rmap(page, vma);
++			pb_remove_ref(page, mm);
+ 			tlb_remove_page(tlb, page);
+ 			continue;
+ 		}
+@@ -755,6 +809,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ 
++	rss = -(file_rss + anon_rss);
++	ub_unused_privvm_add(mm, vma, rss);
+ 	add_mm_rss(mm, file_rss, anon_rss);
+ 	arch_leave_lazy_mmu_mode();
+ 	pte_unmap_unlock(pte - 1, ptl);
+@@ -1695,6 +1751,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	int reuse = 0, ret = 0;
+ 	int page_mkwrite = 0;
+ 	struct page *dirty_page = NULL;
++	struct page_beancounter *pbc;
+ 
+ 	old_page = vm_normal_page(vma, address, orig_pte);
+ 	if (!old_page) {
+@@ -1766,6 +1823,7 @@ reuse:
+ 		flush_cache_page(vma, address, pte_pfn(orig_pte));
+ 		entry = pte_mkyoung(orig_pte);
+ 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
++		ClearPageCheckpointed(old_page);
+ 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
+ 			update_mmu_cache(vma, address, entry);
+ 		ret |= VM_FAULT_WRITE;
+@@ -1779,6 +1837,9 @@ reuse:
+ gotten:
+ 	pte_unmap_unlock(page_table, ptl);
+ 
++	if (unlikely(pb_alloc(&pbc)))
++		goto oom_nopb;
++
+ 	if (unlikely(anon_vma_prepare(vma)))
+ 		goto oom;
+ 	VM_BUG_ON(old_page == ZERO_PAGE(0));
+@@ -1797,12 +1858,15 @@ gotten:
+ 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ 	if (likely(pte_same(*page_table, orig_pte))) {
+ 		if (old_page) {
++			pb_remove_ref(old_page, mm);
+ 			if (!PageAnon(old_page)) {
+ 				dec_mm_counter(mm, file_rss);
+ 				inc_mm_counter(mm, anon_rss);
+ 			}
+-		} else
++		} else {
++			ub_unused_privvm_dec(mm, vma);
+ 			inc_mm_counter(mm, anon_rss);
++		}
+ 		flush_cache_page(vma, address, pte_pfn(orig_pte));
+ 		entry = mk_pte(new_page, vma->vm_page_prot);
+ 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+@@ -1817,6 +1881,7 @@ gotten:
+ 		update_mmu_cache(vma, address, entry);
+ 		lru_cache_add_active(new_page);
+ 		page_add_new_anon_rmap(new_page, vma, address);
++		pb_add_ref(new_page, mm, &pbc);
+ 
+ 		if (old_page) {
+ 			/*
+@@ -1854,6 +1919,7 @@ gotten:
+ 		page_cache_release(new_page);
+ 	if (old_page)
+ 		page_cache_release(old_page);
++	pb_free(&pbc);
+ unlock:
+ 	pte_unmap_unlock(page_table, ptl);
+ 	if (dirty_page) {
+@@ -1876,6 +1942,8 @@ unlock:
+ oom_free_new:
+ 	page_cache_release(new_page);
+ oom:
++	pb_free(&pbc);
++oom_nopb:
+ 	if (old_page)
+ 		page_cache_release(old_page);
+ 	return VM_FAULT_OOM;
+@@ -2183,10 +2251,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	swp_entry_t entry;
+ 	pte_t pte;
+ 	int ret = 0;
++	struct page_beancounter *pbc;
++	cycles_t start;
+ 
+ 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+-		goto out;
++		goto out_nostat;
+ 
++	if (unlikely(pb_alloc(&pbc)))
++		return VM_FAULT_OOM;
++
++	start = get_cycles();
+ 	entry = pte_to_swp_entry(orig_pte);
+ 	if (is_migration_entry(entry)) {
+ 		migration_entry_wait(mm, pmd, address);
+@@ -2240,6 +2314,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	/* The page isn't present yet, go ahead with the fault. */
+ 
+ 	inc_mm_counter(mm, anon_rss);
++	ub_percpu_inc(mm->mm_ub, swapin);
+ 	pte = mk_pte(page, vma->vm_page_prot);
+ 	if (write_access && can_share_swap_page(page)) {
+ 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+@@ -2249,10 +2324,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	flush_icache_page(vma, page);
+ 	set_pte_at(mm, address, page_table, pte);
+ 	page_add_anon_rmap(page, vma, address);
++	pb_add_ref(page, mm, &pbc);
++	ub_unused_privvm_dec(mm, vma);
+ 
+ 	swap_free(entry);
+-	if (vm_swap_full())
+-		remove_exclusive_swap_page(page);
++	try_to_remove_exclusive_swap_page(page);
+ 	unlock_page(page);
+ 
+ 	if (write_access) {
+@@ -2267,10 +2343,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unlock:
+ 	pte_unmap_unlock(page_table, ptl);
+ out:
++	pb_free(&pbc);
++	spin_lock_irq(&kstat_glb_lock);
++	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
++	spin_unlock_irq(&kstat_glb_lock);
++out_nostat:
+ 	return ret;
+ out_nomap:
+ 	mem_cgroup_uncharge_page(page);
+ 	pte_unmap_unlock(page_table, ptl);
++	pb_free(&pbc);
+ 	unlock_page(page);
+ 	page_cache_release(page);
+ 	return ret;
+@@ -2288,10 +2370,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	struct page *page;
+ 	spinlock_t *ptl;
+ 	pte_t entry;
++	struct page_beancounter *pbc;
+ 
+ 	/* Allocate our own private page. */
+ 	pte_unmap(page_table);
+ 
++	if (unlikely(pb_alloc(&pbc)))
++		goto oom_nopb;
++
+ 	if (unlikely(anon_vma_prepare(vma)))
+ 		goto oom;
+ 	page = alloc_zeroed_user_highpage_movable(vma, address);
+@@ -2311,11 +2397,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	inc_mm_counter(mm, anon_rss);
+ 	lru_cache_add_active(page);
+ 	page_add_new_anon_rmap(page, vma, address);
++	pb_add_ref(page, mm, &pbc);
++	ub_unused_privvm_dec(mm, vma);
+ 	set_pte_at(mm, address, page_table, entry);
+ 
+ 	/* No need to invalidate - it was non-present before */
+ 	update_mmu_cache(vma, address, entry);
+ unlock:
++	pb_free(&pbc);
+ 	pte_unmap_unlock(page_table, ptl);
+ 	return 0;
+ release:
+@@ -2325,6 +2414,8 @@ release:
+ oom_free_page:
+ 	page_cache_release(page);
+ oom:
++	pb_free(&pbc);
++oom_nopb:
+ 	return VM_FAULT_OOM;
+ }
+ 
+@@ -2351,6 +2442,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	pte_t entry;
+ 	int anon = 0;
+ 	struct page *dirty_page = NULL;
++	struct page_beancounter *pbc;
+ 	struct vm_fault vmf;
+ 	int ret;
+ 	int page_mkwrite = 0;
+@@ -2360,9 +2452,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	vmf.flags = flags;
+ 	vmf.page = NULL;
+ 
++	ret = VM_FAULT_OOM;
++	if (unlikely(pb_alloc(&pbc)))
++		goto oom_nopb;
++
+ 	ret = vma->vm_ops->fault(vma, &vmf);
+ 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+-		return ret;
++		goto out_fault;
+ 
+ 	/*
+ 	 * For consistency in subsequent calls, make the faulted page always
+@@ -2443,6 +2539,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	 */
+ 	/* Only go through if we didn't race with anybody else... */
+ 	if (likely(pte_same(*page_table, orig_pte))) {
++		struct user_beancounter *ub;
++
+ 		flush_icache_page(vma, page);
+ 		entry = mk_pte(page, vma->vm_page_prot);
+ 		if (flags & FAULT_FLAG_WRITE)
+@@ -2460,6 +2558,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ 				get_page(dirty_page);
+ 			}
+ 		}
++		ub = page_ub(page);
++		if (ub != NULL &&
++#ifdef CONFIG_BC_IO_ACCOUNTING
++		    !((unsigned long)ub & PAGE_IO_MARK) &&
++#endif
++		    ub->ub_magic == UB_MAGIC) {
++			/*
++			 * WOW: Page was already charged as page_ub. This may
++			 * happens for example then some driver export its low
++			 * memory pages to user space. We can't account page as
++			 * page_ub and page_bp at the same time. So uncharge
++			 * page from UB counter.
++			 */
++			WARN_ON_ONCE(1);
++			ub_page_uncharge(page, 0);
++		}
++
++		pb_add_ref(page, mm, &pbc);
++		ub_unused_privvm_dec(mm, vma);
+ 
+ 		/* no need to invalidate: a not-present page won't be cached */
+ 		update_mmu_cache(vma, address, entry);
+@@ -2485,7 +2602,9 @@ out_unlocked:
+ 		set_page_dirty_balance(dirty_page, page_mkwrite);
+ 		put_page(dirty_page);
+ 	}
+-
++out_fault:
++	pb_free(&pbc);
++oom_nopb:
+ 	return ret;
+ }
+ 
+@@ -2667,6 +2786,27 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+ 
++#ifdef CONFIG_VZ_GENCALLS
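++	/*
++	 * Pagein throttling: each task carries a small pgfault_allot
++	 * budget; only once it is exhausted is the VIRTINFO_PAGEIN
++	 * notifier chain consulted, which may fail the fault (SIGBUS)
++	 * or request a retry.
++	 */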
++	do {
++		int ret;
++#ifdef CONFIG_BEANCOUNTERS
++		struct task_beancounter *tbc;
++
++		tbc = &current->task_bc;
++		if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) &&
++				tbc->pgfault_allot) {
++			tbc->pgfault_allot--;
++			break; /* skip notifier */
++		}
++#endif
++		ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN,
++				(void *)1);
++		if (ret & NOTIFY_FAIL)
++			return VM_FAULT_SIGBUS;
++		if (ret & NOTIFY_OK)
++			return VM_FAULT_MINOR; /* retry */
++	} while (0);
++#endif
+ 	__set_current_state(TASK_RUNNING);
+ 
+ 	count_vm_event(PGFAULT);
+@@ -2711,6 +2851,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+ }
+ #endif /* __PAGETABLE_PUD_FOLDED */
+ 
++EXPORT_SYMBOL_GPL(__pud_alloc);
++
+ #ifndef __PAGETABLE_PMD_FOLDED
+ /*
+  * Allocate page middle directory.
+@@ -2741,6 +2883,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+ }
+ #endif /* __PAGETABLE_PMD_FOLDED */
+ 
++EXPORT_SYMBOL_GPL(__pmd_alloc);
++
+ int make_pages_present(unsigned long addr, unsigned long end)
+ {
+ 	int ret, len, write;
+@@ -2760,6 +2904,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
+ 	return ret == len ? 0 : -1;
+ }
+ 
++EXPORT_SYMBOL(make_pages_present);
++
+ #if !defined(__HAVE_ARCH_GATE_AREA)
+ 
+ #if defined(AT_SYSINFO_EHDR)
+diff --git a/mm/mempool.c b/mm/mempool.c
+index a46eb1b..0e1a6bf 100644
+--- a/mm/mempool.c
++++ b/mm/mempool.c
+@@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ 	init_waitqueue_head(&pool->wait);
+ 	pool->alloc = alloc_fn;
+ 	pool->free = free_fn;
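++	/* Mempool reserves must always be available: never charge them to UB. */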
++	if (alloc_fn == mempool_alloc_slab)
++		kmem_mark_nocharge((struct kmem_cache *)pool_data);
+ 
+ 	/*
+ 	 * First pre-allocate the guaranteed number of buffers.
+@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+ 	unsigned long flags;
+ 
+ 	BUG_ON(new_min_nr <= 0);
++	gfp_mask &= ~__GFP_UBC;
+ 
+ 	spin_lock_irqsave(&pool->lock, flags);
+ 	if (new_min_nr <= pool->min_nr) {
+@@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+ 	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
+ 	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
+ 	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
++	gfp_mask &= ~__GFP_UBC;
+ 
+ 	gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+ 
+diff --git a/mm/mlock.c b/mm/mlock.c
+index 7b26560..ea357f4 100644
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -8,10 +8,12 @@
+ #include <linux/capability.h>
+ #include <linux/mman.h>
+ #include <linux/mm.h>
+ #include <linux/mempolicy.h>
+ #include <linux/syscalls.h>
+ #include <linux/sched.h>
+ #include <linux/module.h>
++#include <bc/vmpages.h>
+ 
+ int can_do_mlock(void)
+ {
+@@ -36,6 +38,14 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ 		goto out;
+ 	}
+ 
++	if (newflags & VM_LOCKED) {
++		ret = ub_locked_charge(mm, end - start);
++		if (ret < 0) {
++			*prev = vma;
++			goto out;
++		}
++	}
++
+ 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+ 			  vma->vm_file, pgoff, vma_policy(vma));
+@@ -49,13 +59,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ 	if (start != vma->vm_start) {
+ 		ret = split_vma(mm, vma, start, 1);
+ 		if (ret)
+-			goto out;
++			goto out_uncharge;
+ 	}
+ 
+ 	if (end != vma->vm_end) {
+ 		ret = split_vma(mm, vma, end, 0);
+ 		if (ret)
+-			goto out;
++			goto out_uncharge;
+ 	}
+ 
+ success:
+@@ -74,13 +84,19 @@ success:
+ 		pages = -pages;
+ 		if (!(newflags & VM_IO))
+ 			ret = make_pages_present(start, end);
+-	}
++	} else
++		ub_locked_uncharge(mm, end - start);
+ 
+ 	mm->locked_vm -= pages;
+ out:
+ 	if (ret == -ENOMEM)
+ 		ret = -EAGAIN;
+ 	return ret;
++
++out_uncharge:
++	if (newflags & VM_LOCKED)
++		ub_locked_uncharge(mm, end - start);
++	goto out;
+ }
+ 
+ static int do_mlock(unsigned long start, size_t len, int on)
+@@ -157,6 +173,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
+ 	up_write(&current->mm->mmap_sem);
+ 	return error;
+ }
++EXPORT_SYMBOL_GPL(sys_mlock);
+ 
+ asmlinkage long sys_munlock(unsigned long start, size_t len)
+ {
+@@ -169,6 +186,7 @@ asmlinkage long sys_munlock(unsigned long start, size_t len)
+ 	up_write(&current->mm->mmap_sem);
+ 	return ret;
+ }
++EXPORT_SYMBOL_GPL(sys_munlock);
+ 
+ static int do_mlockall(int flags)
+ {
+diff --git a/mm/mmap.c b/mm/mmap.c
+index 3354fdd..89b2ef2 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -26,6 +26,7 @@
+ #include <linux/mount.h>
+ #include <linux/mempolicy.h>
+ #include <linux/rmap.h>
++#include <linux/virtinfo.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -36,10 +37,13 @@
+ #define arch_mmap_check(addr, len, flags)	(0)
+ #endif
+ 
++#include <bc/vmpages.h>
++
+ #ifndef arch_rebalance_pgtables
+ #define arch_rebalance_pgtables(addr, len)		(addr)
+ #endif
+ 
++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft);
+ static void unmap_region(struct mm_struct *mm,
+ 		struct vm_area_struct *vma, struct vm_area_struct *prev,
+ 		unsigned long start, unsigned long end);
+@@ -104,6 +108,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+ 
+ 	vm_acct_memory(pages);
+ 
++#ifdef CONFIG_BEANCOUNTERS
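++	/*
++	 * Give the beancounter layer the first say: NOTIFY_OK accepts
++	 * the charge outright, NOTIFY_FAIL undoes the accounting and
++	 * fails the allocation.
++	 */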
++	switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM,
++				(void *)pages)
++			& (NOTIFY_OK | NOTIFY_FAIL)) {
++		case NOTIFY_OK:
++			return 0;
++		case NOTIFY_FAIL:
++			vm_unacct_memory(pages);
++			return -ENOMEM;
++	}
++#endif
++
+ 	/*
+ 	 * Sometimes we want to use more memory than we have
+ 	 */
+@@ -228,6 +244,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+ 	struct vm_area_struct *next = vma->vm_next;
+ 
+ 	might_sleep();
++
++	ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
++			vma->vm_flags, vma->vm_file);
+ 	if (vma->vm_ops && vma->vm_ops->close)
+ 		vma->vm_ops->close(vma);
+ 	if (vma->vm_file) {
+@@ -285,7 +304,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
+ 		goto out;
+ 
+ 	/* Ok, looks good - let it rip. */
+-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
++	if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
+ 		goto out;
+ set_brk:
+ 	mm->brk = brk;
+@@ -927,7 +946,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+ 			prot |= PROT_EXEC;
+ 
+ 	if (!len)
+-		return -EINVAL;
++		return addr;
+ 
+ 	if (!(flags & MAP_FIXED))
+ 		addr = round_hint_to_min(addr);
+@@ -1092,6 +1111,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
+ 	struct rb_node **rb_link, *rb_parent;
+ 	unsigned long charged = 0;
+ 	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
++	unsigned long ub_charged = 0;
+ 
+ 	/* Clear old maps */
+ 	error = -ENOMEM;
+@@ -1123,6 +1143,11 @@ munmap_back:
+ 		}
+ 	}
+ 
++	if (ub_memory_charge(mm, len, vm_flags, file,
++				(flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD)))
++		goto charge_error;
++	ub_charged = 1;
++
+ 	/*
+ 	 * Can we just expand an old private anonymous mapping?
+ 	 * The VM_SHARED test is necessary because shmem_zero_setup
+@@ -1138,7 +1163,8 @@ munmap_back:
+ 	 * specific mapper. the address has already been validated, but
+ 	 * not unmapped, but the maps are removed from the list.
+ 	 */
+-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
++	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
++			(flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0));
+ 	if (!vma) {
+ 		error = -ENOMEM;
+ 		goto unacct_error;
+@@ -1168,6 +1194,19 @@ munmap_back:
+ 			goto unmap_and_free_vma;
+ 		if (vm_flags & VM_EXECUTABLE)
+ 			added_exe_file_vma(mm);
++		if (vm_flags != vma->vm_flags) {
++			/*
++			 * ->vm_flags was changed by the f_op->mmap method,
++			 * so the ub memory charge must be redone.
++			 */
++			ub_memory_uncharge(mm, len, vm_flags, file);
++			if (ub_memory_charge(mm, len, vma->vm_flags, file,
++				(flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) {
++				ub_charged = 0;
++				error = -ENOMEM;
++				goto unmap_and_free_vma;
++			}
++		}
+ 	} else if (vm_flags & VM_SHARED) {
+ 		error = shmem_zero_setup(vma);
+ 		if (error)
+@@ -1232,6 +1271,9 @@ unmap_and_free_vma:
+ free_vma:
+ 	kmem_cache_free(vm_area_cachep, vma);
+ unacct_error:
++	if (ub_charged)
++		ub_memory_uncharge(mm, len, vm_flags, file);
++charge_error:
+ 	if (charged)
+ 		vm_unacct_memory(charged);
+ 	return error;
+@@ -1554,12 +1596,16 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
+ 	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ 		return -EFAULT;
+ 
++	if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
++				vma->vm_file, UB_SOFT))
++		goto fail_charge;
++
+ 	/*
+ 	 * Overcommit..  This must be the final test, as it will
+ 	 * update security statistics.
+ 	 */
+ 	if (security_vm_enough_memory(grow))
+-		return -ENOMEM;
++		goto fail_sec;
+ 
+ 	/* Ok, everything looks good - let it rip */
+ 	mm->total_vm += grow;
+@@ -1567,6 +1613,11 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
+ 		mm->locked_vm += grow;
+ 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+ 	return 0;
++
++fail_sec:
++	ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
++fail_charge:
++	return -ENOMEM;
+ }
+ 
+ #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+@@ -1850,6 +1901,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ 
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(split_vma);
+ 
+ /* Munmap is split into 2 main parts -- this part which finds
+  * what needs doing, and the areas themselves, which do the
+@@ -1943,7 +1995,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
+  *  anonymous maps.  eventually we may be able to do some
+  *  brk-specific accounting here.
+  */
+-unsigned long do_brk(unsigned long addr, unsigned long len)
++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft)
+ {
+ 	struct mm_struct * mm = current->mm;
+ 	struct vm_area_struct * vma, * prev;
+@@ -2009,8 +2061,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
+ 	if (mm->map_count > sysctl_max_map_count)
+ 		return -ENOMEM;
+ 
++	if (ub_memory_charge(mm, len, flags, NULL, soft))
++		goto fail_charge;
++
+ 	if (security_vm_enough_memory(len >> PAGE_SHIFT))
+-		return -ENOMEM;
++		goto fail_sec;
+ 
+ 	/* Can we just expand an old private anonymous mapping? */
+ 	if (vma_merge(mm, prev, addr, addr + len, flags,
+@@ -2020,11 +2075,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
+ 	/*
+ 	 * create a vma struct for an anonymous mapping
+ 	 */
+-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+-	if (!vma) {
+-		vm_unacct_memory(len >> PAGE_SHIFT);
+-		return -ENOMEM;
+-	}
++	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
++			(soft == UB_SOFT ? __GFP_SOFT_UBC : 0));
++	if (!vma)
++		goto fail_alloc;
+ 
+ 	vma->vm_mm = mm;
+ 	vma->vm_start = addr;
+@@ -2040,8 +2094,19 @@ out:
+ 		make_pages_present(addr, addr + len);
+ 	}
+ 	return addr;
++
++fail_alloc:
++	vm_unacct_memory(len >> PAGE_SHIFT);
++fail_sec:
++	ub_memory_uncharge(mm, len, flags, NULL);
++fail_charge:
++	return -ENOMEM;
+ }
+ 
++unsigned long do_brk(unsigned long addr, unsigned long len)
++{
++	return __do_brk(addr, len, UB_SOFT);
++}
+ EXPORT_SYMBOL(do_brk);
+ 
+ /* Release all mmaps. */
+@@ -2218,10 +2283,11 @@ static void special_mapping_close(struct vm_area_struct *vma)
+ {
+ }
+ 
+-static struct vm_operations_struct special_mapping_vmops = {
++struct vm_operations_struct special_mapping_vmops = {
+ 	.close = special_mapping_close,
+ 	.fault = special_mapping_fault,
+ };
++EXPORT_SYMBOL_GPL(special_mapping_vmops);
+ 
+ /*
+  * Called with mm->mmap_sem held for writing.
+diff --git a/mm/mmzone.c b/mm/mmzone.c
+index 486ed59..8cd9f7a 100644
+--- a/mm/mmzone.c
++++ b/mm/mmzone.c
+@@ -13,6 +13,7 @@ struct pglist_data *first_online_pgdat(void)
+ {
+ 	return NODE_DATA(first_online_node);
+ }
++EXPORT_SYMBOL_GPL(first_online_pgdat);
+ 
+ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+ {
+@@ -22,6 +23,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+ 		return NULL;
+ 	return NODE_DATA(nid);
+ }
++EXPORT_SYMBOL_GPL(next_online_pgdat);
+ 
+ /*
+  * next_zone - helper magic for for_each_zone()
+diff --git a/mm/mprotect.c b/mm/mprotect.c
+index a5bf31c..e0073cd 100644
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -9,6 +9,7 @@
+  */
+ 
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/hugetlb.h>
+ #include <linux/slab.h>
+ #include <linux/shm.h>
+@@ -26,6 +27,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ 
++#include <bc/vmpages.h>
++
+ #ifndef pgprot_modify
+ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+ {
+@@ -144,6 +147,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ 	unsigned long charged = 0;
+ 	pgoff_t pgoff;
+ 	int error;
++	unsigned long ch_size;
++	int ch_dir;
+ 	int dirty_accountable = 0;
+ 
+ 	if (newflags == oldflags) {
+@@ -151,6 +156,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ 		return 0;
+ 	}
+ 
++	error = -ENOMEM;
++	ch_size = nrpages - pages_in_vma_range(vma, start, end);
++	ch_dir = ub_protected_charge(mm, ch_size, newflags, vma);
++	if (ch_dir == PRIVVM_ERROR)
++		goto fail_ch;
++
+ 	/*
+ 	 * If we make a private mapping writable we increase our commit;
+ 	 * but (without finer accounting) cannot reduce our commit if we
+@@ -163,7 +174,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
+ 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+ 			charged = nrpages;
+ 			if (security_vm_enough_memory(charged))
+-				return -ENOMEM;
++				goto fail_sec;
+ 			newflags |= VM_ACCOUNT;
+ 		}
+ 	}
+@@ -213,10 +224,16 @@ success:
+ 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
++	if (ch_dir == PRIVVM_TO_SHARED)
++		__ub_unused_privvm_dec(mm, ch_size);
+ 	return 0;
+ 
+ fail:
+ 	vm_unacct_memory(charged);
++fail_sec:
++	if (ch_dir == PRIVVM_TO_PRIVATE)
++		__ub_unused_privvm_dec(mm, ch_size);
++fail_ch:
+ 	return error;
+ }
+ 
+@@ -318,3 +335,4 @@ out:
+ 	up_write(&current->mm->mmap_sem);
+ 	return error;
+ }
++EXPORT_SYMBOL_GPL(sys_mprotect);
+diff --git a/mm/mremap.c b/mm/mremap.c
+index 08e3c7f..67511b9 100644
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -23,6 +23,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ 
++#include <bc/vmpages.h>
++
+ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+ {
+ 	pgd_t *pgd;
+@@ -167,17 +169,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
+ 	unsigned long hiwater_vm;
+ 	int split = 0;
+ 
++	if (ub_memory_charge(mm, new_len, vm_flags,
++				vma->vm_file, UB_HARD))
++		goto err;
++
+ 	/*
+ 	 * We'd prefer to avoid failure later on in do_munmap:
+ 	 * which may split one vma into three before unmapping.
+ 	 */
+ 	if (mm->map_count >= sysctl_max_map_count - 3)
+-		return -ENOMEM;
++		goto err_nomem;
+ 
+ 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ 	if (!new_vma)
+-		return -ENOMEM;
++		goto err_nomem;
+ 
+ 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ 	if (moved_len < old_len) {
+@@ -236,7 +242,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
+ 					   new_addr + new_len);
+ 	}
+ 
+-	return new_addr;
++	if (new_addr != -ENOMEM)
++		return new_addr;
++
++err_nomem:
++	ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
++err:
++	return -ENOMEM;
+ }
+ 
+ /*
+@@ -364,7 +376,15 @@ unsigned long do_mremap(unsigned long addr,
+ 			max_addr = vma->vm_next->vm_start;
+ 		/* can we just expand the current mapping? */
+ 		if (max_addr - addr >= new_len) {
+-			int pages = (new_len - old_len) >> PAGE_SHIFT;
++			unsigned long len;
++			int pages;
++
++			len = new_len - old_len;
++			pages = len >> PAGE_SHIFT;
++			ret = -ENOMEM;
++			if (ub_memory_charge(mm, len, vma->vm_flags,
++						vma->vm_file, UB_HARD))
++				goto out;
+ 
+ 			vma_adjust(vma, vma->vm_start,
+ 				addr + new_len, vma->vm_pgoff, NULL);
+diff --git a/mm/oom_kill.c b/mm/oom_kill.c
+index 8a5467e..916243b 100644
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -19,6 +19,8 @@
+ #include <linux/mm.h>
+ #include <linux/err.h>
+ #include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/virtinfo.h>
+ #include <linux/swap.h>
+ #include <linux/timex.h>
+ #include <linux/jiffies.h>
+@@ -27,6 +29,9 @@
+ #include <linux/notifier.h>
+ #include <linux/memcontrol.h>
+ 
++#include <bc/beancounter.h>
++#include <bc/oom_kill.h>
++
+ int sysctl_panic_on_oom;
+ int sysctl_oom_kill_allocating_task;
+ int sysctl_oom_dump_tasks;
+@@ -198,16 +203,16 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+  *
+  * (not docbooked, we don't want this one cluttering up the manual)
+  */
+-static struct task_struct *select_bad_process(unsigned long *ppoints,
++struct task_struct *select_bad_process(struct user_beancounter *ub,
+ 						struct mem_cgroup *mem)
+ {
+ 	struct task_struct *g, *p;
+ 	struct task_struct *chosen = NULL;
+ 	struct timespec uptime;
+-	*ppoints = 0;
++	unsigned long chosen_points = 0;
+ 
+ 	do_posix_clock_monotonic_gettime(&uptime);
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		unsigned long points;
+ 
+ 		/*
+@@ -221,6 +226,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
+ 			continue;
+ 		if (mem && !task_in_mem_cgroup(p, mem))
+ 			continue;
++		if (ub_oom_task_skip(ub, p))
++			continue;
+ 
+ 		/*
+ 		 * This task already has access to memory reserves and is
+@@ -249,18 +256,18 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
+ 				return ERR_PTR(-1UL);
+ 
+ 			chosen = p;
+-			*ppoints = ULONG_MAX;
++			chosen_points = ULONG_MAX;
+ 		}
+ 
+ 		if (p->oomkilladj == OOM_DISABLE)
+ 			continue;
+ 
+ 		points = badness(p, uptime.tv_sec);
+-		if (points > *ppoints || !chosen) {
++		if (points > chosen_points || !chosen) {
+ 			chosen = p;
+-			*ppoints = points;
++			chosen_points = points;
+ 		}
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 
+ 	return chosen;
+ }
+@@ -284,7 +291,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
+ 
+ 	printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
+ 	       "name\n");
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		/*
+ 		 * total_vm and rss sizes do not exist for tasks with a
+ 		 * detached mm so there's no need to report them.
+@@ -300,7 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
+ 		       get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
+ 		       p->comm);
+ 		task_unlock(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ }
+ 
+ /*
+@@ -335,13 +342,16 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
+ 	set_tsk_thread_flag(p, TIF_MEMDIE);
+ 
+ 	force_sig(SIGKILL, p);
++	ub_oom_task_killed(p);
+ }
+ 
+ static int oom_kill_task(struct task_struct *p)
+ {
+ 	struct mm_struct *mm;
++	struct user_beancounter *ub;
+ 	struct task_struct *g, *q;
+ 
++	task_lock(p);
+ 	mm = p->mm;
+ 
+ 	/* WARNING: mm may not be dereferenced since we did not obtain its
+@@ -353,16 +363,21 @@ static int oom_kill_task(struct task_struct *p)
+ 	 * However, this is of no concern to us.
+ 	 */
+ 
+-	if (mm == NULL)
++	if (mm == NULL) {
++		task_unlock(p);
+ 		return 1;
++	}
++
++	ub = get_beancounter(mm_ub(mm));
++	task_unlock(p);
+ 
+ 	/*
+ 	 * Don't kill the process if any threads are set to OOM_DISABLE
+ 	 */
+-	do_each_thread(g, q) {
++	do_each_thread_all(g, q) {
+ 		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+ 			return 1;
+-	} while_each_thread(g, q);
++	} while_each_thread_all(g, q);
+ 
+ 	__oom_kill_task(p, 1);
+ 
+@@ -371,17 +386,18 @@ static int oom_kill_task(struct task_struct *p)
+ 	 * but are in a different thread group. Don't let them have access
+ 	 * to memory reserves though, otherwise we might deplete all memory.
+ 	 */
+-	do_each_thread(g, q) {
++	do_each_thread_all(g, q) {
+ 		if (q->mm == mm && !same_thread_group(q, p))
+ 			force_sig(SIGKILL, q);
+-	} while_each_thread(g, q);
++	} while_each_thread_all(g, q);
+ 
++	ub_oom_mm_killed(ub);
++	put_beancounter(ub);
+ 	return 0;
+ }
+ 
+-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+-			    unsigned long points, struct mem_cgroup *mem,
+-			    const char *message)
++int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
++			    struct mem_cgroup *mem, const char *message)
+ {
+ 	struct task_struct *c;
+ 
+@@ -404,8 +420,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ 		return 0;
+ 	}
+ 
+-	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+-					message, task_pid_nr(p), p->comm, points);
++	printk(KERN_ERR "%s: kill process %d (%s) or a child\n",
++					message, task_pid_nr(p), p->comm);
+ 
+ 	/* Try to kill a child first */
+ 	list_for_each_entry(c, &p->children, sibling) {
+@@ -520,9 +536,9 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+ {
+ 	struct task_struct *p;
+-	unsigned long points = 0;
+ 	unsigned long freed = 0;
+ 	enum oom_constraint constraint;
++	struct user_beancounter *ub;
+ 
+ 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ 	if (freed > 0)
+@@ -532,16 +548,34 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+ 	if (sysctl_panic_on_oom == 2)
+ 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ 
++	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL)
++			& (NOTIFY_OK | NOTIFY_FAIL))
++		return;
++
++	ub = NULL;
++	if (ub_oom_lock())
++		goto out_oom_lock;
++
++	read_lock(&tasklist_lock);
++
++	if (printk_ratelimit()) {
++		printk(KERN_WARNING "%s invoked oom-killer: "
++			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
++			current->comm, gfp_mask, order, current->oomkilladj);
++		dump_stack();
++		show_mem();
++		show_slab_info();
++	}
++
+ 	/*
+ 	 * Check if there were limitations on the allocation (only relevant for
+ 	 * NUMA) that may require different handling.
+ 	 */
+ 	constraint = constrained_alloc(zonelist, gfp_mask);
+-	read_lock(&tasklist_lock);
+ 
+ 	switch (constraint) {
+ 	case CONSTRAINT_MEMORY_POLICY:
+-		oom_kill_process(current, gfp_mask, order, points, NULL,
++		oom_kill_process(current, gfp_mask, order, NULL,
+ 				"No available memory (MPOL_BIND)");
+ 		break;
+ 
+@@ -551,27 +585,33 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+ 		/* Fall-through */
+ 	case CONSTRAINT_CPUSET:
+ 		if (sysctl_oom_kill_allocating_task) {
+-			oom_kill_process(current, gfp_mask, order, points, NULL,
++			oom_kill_process(current, gfp_mask, order, NULL,
+ 					"Out of memory (oom_kill_allocating_task)");
+ 			break;
+ 		}
+ retry:
++		put_beancounter(ub);
++
+ 		/*
+ 		 * Rambo mode: Shoot down a process and hope it solves whatever
+ 		 * issues we may have.
+ 		 */
+-		p = select_bad_process(&points, NULL);
++		ub = ub_oom_select_worst();
++		p = select_bad_process(ub, NULL);
+ 
+ 		if (PTR_ERR(p) == -1UL)
+ 			goto out;
+ 
+ 		/* Found nothing?!?! Either we hang forever, or we panic. */
+ 		if (!p) {
++			if (ub != NULL)
++				goto retry;
+ 			read_unlock(&tasklist_lock);
++			ub_oom_unlock();
+ 			panic("Out of memory and no killable processes...\n");
+ 		}
+ 
+-		if (oom_kill_process(p, gfp_mask, order, points, NULL,
++		if (oom_kill_process(p, gfp_mask, order, NULL,
+ 				     "Out of memory"))
+ 			goto retry;
+ 
+@@ -580,7 +620,10 @@ retry:
+ 
+ out:
+ 	read_unlock(&tasklist_lock);
++	ub_oom_unlock();
++	put_beancounter(ub);
+ 
++out_oom_lock:
+ 	/*
+ 	 * Give "p" a good chance of killing itself before we
+ 	 * retry to allocate memory unless "p" is current
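
The oom_kill.c hunks above turn victim selection into a two-stage loop: ub_oom_select_worst() nominates the most guilty beancounter, select_bad_process() then looks for a victim inside it, and an empty container simply retries with the next-worst beancounter until none are left (at which point the global panic path runs). Below is a minimal user-space sketch of that retry discipline; the types, scores and helper names are stand-ins for illustration, not the kernel's.

/* Toy model of the two-stage victim selection introduced above:
 * first pick the "worst" container, then the worst task inside it;
 * if a container has no killable task, fall back to the next one.
 * All types and the sample data are invented for illustration. */
#include <stdio.h>

struct ub { const char *name; long score; int nr_killable; };

static struct ub containers[] = {
	{ "ub-101", 900, 0 },	/* worst by score, but nothing killable */
	{ "ub-102", 500, 3 },
	{ "global", 100, 1 },
};
static int next_worst;

/* Stand-in for ub_oom_select_worst(): hand out containers in
 * decreasing badness, NULL once the list is exhausted. */
static struct ub *pick_worst_ub(void)
{
	if (next_worst >= (int)(sizeof(containers) / sizeof(containers[0])))
		return NULL;
	return &containers[next_worst++];
}

/* Stand-in for select_bad_process(ub, ...): succeeds only if the
 * container has a killable task. */
static const char *pick_victim(struct ub *ub)
{
	return (ub && ub->nr_killable > 0) ? ub->name : NULL;
}

int main(void)
{
	struct ub *ub;
	const char *victim;

retry:
	ub = pick_worst_ub();
	victim = pick_victim(ub);
	if (!victim) {
		if (ub != NULL)		/* this container was empty: try next */
			goto retry;
		puts("no killable processes: panic path");
		return 1;
	}
	printf("kill a task in %s\n", victim);
	return 0;
}
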
+diff --git a/mm/page-writeback.c b/mm/page-writeback.c
+index 789b6ad..e1883c0 100644
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -35,6 +35,9 @@
+ #include <linux/buffer_head.h>
+ #include <linux/pagevec.h>
+ 
++#include <bc/io_prio.h>
++#include <bc/io_acct.h>
++
+ /*
+  * The maximum number of pages to writeout in a single bdflush/kupdate
+  * operation.  We do this so we don't hold I_SYNC against an inode for
+@@ -899,6 +902,7 @@ retry:
+ 		scanned = 1;
+ 		for (i = 0; i < nr_pages; i++) {
+ 			struct page *page = pvec.pages[i];
++			struct user_beancounter *old_ub;
+ 
+ 			/*
+ 			 * At this point we hold neither mapping->tree_lock nor
+@@ -929,7 +933,9 @@ retry:
+ 				continue;
+ 			}
+ 
++			old_ub = bc_io_switch_context(page);
+ 			ret = (*writepage)(page, wbc, data);
++			bc_io_restore_context(old_ub);
+ 
+ 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+ 				unlock_page(page);
+@@ -1025,12 +1031,15 @@ int write_one_page(struct page *page, int wait)
+ 		.sync_mode = WB_SYNC_ALL,
+ 		.nr_to_write = 1,
+ 	};
++	struct user_beancounter *old_ub;
+ 
+ 	BUG_ON(!PageLocked(page));
+ 
+ 	if (wait)
+ 		wait_on_page_writeback(page);
+ 
++	old_ub = bc_io_switch_context(page);
++
+ 	if (clear_page_dirty_for_io(page)) {
+ 		page_cache_get(page);
+ 		ret = mapping->a_ops->writepage(page, &wbc);
+@@ -1043,6 +1052,9 @@ int write_one_page(struct page *page, int wait)
+ 	} else {
+ 		unlock_page(page);
+ 	}
++
++	bc_io_restore_context(old_ub);
++
+ 	return ret;
+ }
+ EXPORT_SYMBOL(write_one_page);
+@@ -1074,6 +1086,9 @@ int __set_page_dirty_no_writeback(struct page *page)
+  */
+ int __set_page_dirty_nobuffers(struct page *page)
+ {
++	int acct;
++
++	acct = 0;
+ 	if (!TestSetPageDirty(page)) {
+ 		struct address_space *mapping = page_mapping(page);
+ 		struct address_space *mapping2;
+@@ -1081,6 +1096,7 @@ int __set_page_dirty_nobuffers(struct page *page)
+ 		if (!mapping)
+ 			return 1;
+ 
++		acct = 0;
+ 		write_lock_irq(&mapping->tree_lock);
+ 		mapping2 = page_mapping(page);
+ 		if (mapping2) { /* Race with truncate? */
+@@ -1090,12 +1106,14 @@ int __set_page_dirty_nobuffers(struct page *page)
+ 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+ 				__inc_bdi_stat(mapping->backing_dev_info,
+ 						BDI_RECLAIMABLE);
+-				task_io_account_write(PAGE_CACHE_SIZE);
++				acct = 1;
+ 			}
+ 			radix_tree_tag_set(&mapping->page_tree,
+ 				page_index(page), PAGECACHE_TAG_DIRTY);
+ 		}
+ 		write_unlock_irq(&mapping->tree_lock);
++		if (acct)
++			task_io_account_write(page, PAGE_CACHE_SIZE, 0);
+ 		if (mapping->host) {
+ 			/* !PageAnon && !swapper_space */
+ 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+@@ -1234,6 +1252,7 @@ int clear_page_dirty_for_io(struct page *page)
+ 			dec_zone_page_state(page, NR_FILE_DIRTY);
+ 			dec_bdi_stat(mapping->backing_dev_info,
+ 					BDI_RECLAIMABLE);
++			ub_io_release_context(page, PAGE_CACHE_SIZE);
+ 			return 1;
+ 		}
+ 		return 0;
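
Both writeback paths above bracket the ->writepage call with bc_io_switch_context()/bc_io_restore_context(), so I/O submitted for a dirty page is charged to the beancounter that owns the page rather than to whichever task happens to be flushing. A sketch of that save/switch/restore bracket, with an invented per-task context variable standing in for the I/O beancounter:

/* Sketch of the switch/restore bracket used around ->writepage above.
 * "io_context" stands in for the per-task I/O beancounter; the page
 * struct and the charge accounting are invented for illustration. */
#include <stdio.h>

struct page { const char *owner; };

static const char *io_context = "flusher";	/* who gets charged now */

static const char *io_switch_context(const struct page *pg)
{
	const char *old = io_context;
	io_context = pg->owner;			/* charge the page's owner */
	return old;
}

static void io_restore_context(const char *old)
{
	io_context = old;
}

static void writepage(const struct page *pg)
{
	printf("write for %s charged to %s\n", pg->owner, io_context);
}

int main(void)
{
	struct page pg = { "container-42" };
	const char *old = io_switch_context(&pg);

	writepage(&pg);			/* charged to container-42 */
	io_restore_context(old);	/* flusher context back in place */
	return 0;
}
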
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index f32fae3..1d172d0 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -51,6 +51,9 @@
+ #include <asm/div64.h>
+ #include "internal.h"
+ 
++#include <bc/kmem.h>
++#include <bc/io_acct.h>
++
+ /*
+  * Array of node states.
+  */
+@@ -102,6 +105,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+ 	 32,
+ };
+ 
++EXPORT_SYMBOL(nr_swap_pages);
+ EXPORT_SYMBOL(totalram_pages);
+ 
+ static char * const zone_names[MAX_NR_ZONES] = {
+@@ -456,8 +460,11 @@ static inline int free_pages_check(struct page *page)
+ 		(page_count(page) != 0)  |
+ 		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
+ 		bad_page(page);
+-	if (PageDirty(page))
++	if (PageDirty(page)) {
++		ub_io_release_context(page, 0);
+ 		__ClearPageDirty(page);
++	} else
++		ub_io_release_debug(page);
+ 	/*
+ 	 * For now, we report if PG_reserved was found set, but do not
+ 	 * clear it, and do not free the page.  But we shall soon need
+@@ -523,6 +530,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
+ 	arch_free_page(page, order);
+ 	kernel_map_pages(page, 1 << order, 0);
+ 
++	ub_page_uncharge(page, order);
+ 	local_irq_save(flags);
+ 	__count_vm_events(PGFREE, 1 << order);
+ 	free_one_page(page_zone(page), page, order);
+@@ -979,6 +987,7 @@ static void free_hot_cold_page(struct page *page, int cold)
+ 	kernel_map_pages(page, 1, 0);
+ 
+ 	pcp = &zone_pcp(zone, get_cpu())->pcp;
++	ub_page_uncharge(page, 0);
+ 	local_irq_save(flags);
+ 	__count_vm_event(PGFREE);
+ 	if (cold)
+@@ -1426,6 +1435,31 @@ try_next_zone:
+ 	return page;
+ }
+ 
++extern unsigned long cycles_per_jiffy;
++static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
++		struct page *page, cycles_t time)
++{
++#ifdef CONFIG_VE
++	int ind;
++	unsigned long flags;
++
++	time = (jiffies - time) * cycles_per_jiffy;
++	if (!(gfp_mask & __GFP_WAIT))
++		ind = 0;
++	else if (!(gfp_mask & __GFP_HIGHMEM))
++		ind = (order > 0 ? 2 : 1);
++	else
++		ind = (order > 0 ? 4 : 3);
++	spin_lock_irqsave(&kstat_glb_lock, flags);
++	KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time);
++	if (!page)
++		kstat_glob.alloc_fails[ind]++;
++	spin_unlock_irqrestore(&kstat_glb_lock, flags);
++#endif
++}
++
++int alloc_fail_warn;
++
+ /*
+  * This is the 'heart' of the zoned buddy allocator.
+  */
+@@ -1444,6 +1478,7 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+ 	int alloc_flags;
+ 	unsigned long did_some_progress;
+ 	unsigned long pages_reclaimed = 0;
++	cycles_t start;
+ 
+ 	might_sleep_if(wait);
+ 
+@@ -1461,6 +1496,7 @@ restart:
+ 		return NULL;
+ 	}
+ 
++	start = jiffies;
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ 	if (page)
+@@ -1617,19 +1653,32 @@ nofail_alloc:
+ 			do_retry = 1;
+ 	}
+ 	if (do_retry) {
++		if (total_swap_pages > 0 && nr_swap_pages == 0) {
++			out_of_memory(zonelist, gfp_mask, order);
++			goto restart;
++		}
+ 		congestion_wait(WRITE, HZ/50);
+ 		goto rebalance;
+ 	}
+ 
+ nopage:
+-	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
++	__alloc_collect_stats(gfp_mask, order, NULL, start);
++	if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) &&
++			printk_ratelimit()) {
+ 		printk(KERN_WARNING "%s: page allocation failure."
+ 			" order:%d, mode:0x%x\n",
+ 			p->comm, order, gfp_mask);
+ 		dump_stack();
+ 		show_mem();
+ 	}
++	return NULL;
++
+ got_pg:
++	__alloc_collect_stats(gfp_mask, order, page, start);
++	if (ub_page_charge(page, order, gfp_mask)) {
++		__free_pages(page, order);
++		page = NULL;
++	}
+ 	return page;
+ }
+ 
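
__alloc_collect_stats() above folds allocation latency into five buckets keyed on the gfp flags and the order: atomic, lowmem order-0, lowmem high-order, highmem order-0, highmem high-order. The index computation in isolation (the flag values are local stand-ins for __GFP_WAIT and __GFP_HIGHMEM):

/* The five latency buckets used by __alloc_collect_stats() above:
 * 0 atomic, 1/2 lowmem order-0/high-order, 3/4 highmem order-0/high-order.
 * The flag values are local stand-ins for __GFP_WAIT/__GFP_HIGHMEM. */
#include <stdio.h>

#define GFP_WAIT	0x1
#define GFP_HIGHMEM	0x2

static int alloc_lat_index(unsigned gfp_mask, unsigned order)
{
	if (!(gfp_mask & GFP_WAIT))
		return 0;			/* atomic allocation */
	if (!(gfp_mask & GFP_HIGHMEM))
		return order > 0 ? 2 : 1;	/* lowmem */
	return order > 0 ? 4 : 3;		/* highmem */
}

int main(void)
{
	printf("%d\n", alloc_lat_index(0, 0));			/* 0 */
	printf("%d\n", alloc_lat_index(GFP_WAIT, 3));		/* 2 */
	printf("%d\n", alloc_lat_index(GFP_WAIT | GFP_HIGHMEM, 0)); /* 3 */
	return 0;
}
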
+diff --git a/mm/rmap.c b/mm/rmap.c
+index bf0a5b7..679a575 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -50,6 +50,9 @@
+ #include <linux/kallsyms.h>
+ #include <linux/memcontrol.h>
+ 
++#include <bc/beancounter.h>
++#include <bc/vmpages.h>
++
+ #include <asm/tlbflush.h>
+ 
+ struct kmem_cache *anon_vma_cachep;
+@@ -93,6 +96,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
+ 	}
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(anon_vma_prepare);
+ 
+ void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+ {
+@@ -118,6 +122,7 @@ void anon_vma_link(struct vm_area_struct *vma)
+ 		spin_unlock(&anon_vma->lock);
+ 	}
+ }
++EXPORT_SYMBOL_GPL(anon_vma_link);
+ 
+ void anon_vma_unlink(struct vm_area_struct *vma)
+ {
+@@ -149,14 +154,14 @@ static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
+ void __init anon_vma_init(void)
+ {
+ 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+-			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
++			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor);
+ }
+ 
+ /*
+  * Getting a lock on a stable anon_vma from a page off the LRU is
+  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+  */
+-static struct anon_vma *page_lock_anon_vma(struct page *page)
++struct anon_vma *page_lock_anon_vma(struct page *page)
+ {
+ 	struct anon_vma *anon_vma;
+ 	unsigned long anon_mapping;
+@@ -175,12 +180,14 @@ out:
+ 	rcu_read_unlock();
+ 	return NULL;
+ }
++EXPORT_SYMBOL_GPL(page_lock_anon_vma);
+ 
+-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
++void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ {
+ 	spin_unlock(&anon_vma->lock);
+ 	rcu_read_unlock();
+ }
++EXPORT_SYMBOL_GPL(page_unlock_anon_vma);
+ 
+ /*
+  * At what user virtual address is page expected in @vma?
+@@ -684,6 +691,12 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
+ 		}
+ 		mem_cgroup_uncharge_page(page);
+ 
++		/*
++		 * When a page is unmapped we cannot keep the PG_checkpointed
++		 * flag: the page is no longer accessible via process VM, so
++		 * there is no way to reset its state.
++		 */
++		ClearPageCheckpointed(page);
+ 		__dec_zone_page_state(page,
+ 				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ 	}
+@@ -775,6 +788,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ 
+ 
+ 	page_remove_rmap(page, vma);
++	ub_unused_privvm_inc(mm, vma);
++	ub_percpu_inc(mm->mm_ub, unmap);
++	pb_remove_ref(page, mm);
+ 	page_cache_release(page);
+ 
+ out_unmap:
+@@ -865,6 +881,9 @@ static void try_to_unmap_cluster(unsigned long cursor,
+ 			set_page_dirty(page);
+ 
+ 		page_remove_rmap(page, vma);
++		ub_percpu_inc(mm->mm_ub, unmap);
++		pb_remove_ref(page, mm);
++		ub_unused_privvm_inc(mm, vma);
+ 		page_cache_release(page);
+ 		dec_mm_counter(mm, file_rss);
+ 		(*mapcount)--;
+diff --git a/mm/shmem.c b/mm/shmem.c
+index e2a6ae1..c7dc238 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -55,6 +55,8 @@
+ #include <asm/div64.h>
+ #include <asm/pgtable.h>
+ 
++#include <bc/vmpages.h>
++
+ /* This magic number is used in glibc for posix shared memory */
+ #define TMPFS_MAGIC	0x01021994
+ 
+@@ -193,7 +195,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+ 
+ static const struct super_operations shmem_ops;
+ static const struct address_space_operations shmem_aops;
+-static const struct file_operations shmem_file_operations;
++const struct file_operations shmem_file_operations;
+ static const struct inode_operations shmem_inode_operations;
+ static const struct inode_operations shmem_dir_inode_operations;
+ static const struct inode_operations shmem_special_inode_operations;
+@@ -256,7 +258,7 @@ static void shmem_free_inode(struct super_block *sb)
+  *
+  * It has to be called with the spinlock held.
+  */
+-static void shmem_recalc_inode(struct inode *inode)
++static void shmem_recalc_inode(struct inode *inode, long swp_freed)
+ {
+ 	struct shmem_inode_info *info = SHMEM_I(inode);
+ 	long freed;
+@@ -266,6 +268,8 @@ static void shmem_recalc_inode(struct inode *inode)
+ 		info->alloced -= freed;
+ 		shmem_unacct_blocks(info->flags, freed);
+ 		shmem_free_blocks(inode, freed);
++		if (freed > swp_freed)
++			ub_tmpfs_respages_sub(info, freed - swp_freed);
+ 	}
+ }
+ 
+@@ -370,6 +374,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
+ 		struct page *page = kmap_atomic_to_page(entry);
+ 		set_page_private(page, page_private(page) + incdec);
+ 	}
++
++	if (incdec == 1)
++		ub_tmpfs_respages_dec(info);
++	else
++		ub_tmpfs_respages_inc(info);
+ }
+ 
+ /**
+@@ -386,14 +395,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
+ 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ 	struct page *page = NULL;
+ 	swp_entry_t *entry;
++	unsigned long ub_val;
+ 
+ 	if (sgp != SGP_WRITE &&
+ 	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ 		return ERR_PTR(-EINVAL);
+ 
++	ub_val = 0;
++	if (info->next_index <= index) {
++		ub_val = index + 1 - info->next_index;
++		if (ub_shmpages_charge(info, ub_val))
++			return ERR_PTR(-ENOSPC);
++	}
++
+ 	while (!(entry = shmem_swp_entry(info, index, &page))) {
+-		if (sgp == SGP_READ)
+-			return shmem_swp_map(ZERO_PAGE(0));
++		if (sgp == SGP_READ) {
++			entry = shmem_swp_map(ZERO_PAGE(0));
++			goto out;
++		}
+ 		/*
+ 		 * Test free_blocks against 1 not 0, since we have 1 data
+ 		 * page (and perhaps indirect index pages) yet to allocate:
+@@ -403,7 +422,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
+ 			spin_lock(&sbinfo->stat_lock);
+ 			if (sbinfo->free_blocks <= 1) {
+ 				spin_unlock(&sbinfo->stat_lock);
+-				return ERR_PTR(-ENOSPC);
++				entry = ERR_PTR(-ENOSPC);
++				goto out;
+ 			}
+ 			sbinfo->free_blocks--;
+ 			inode->i_blocks += BLOCKS_PER_PAGE;
+@@ -411,31 +431,43 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
+ 		}
+ 
+ 		spin_unlock(&info->lock);
+-		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
++		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) |
++					__GFP_UBC);
+ 		if (page)
+ 			set_page_private(page, 0);
+ 		spin_lock(&info->lock);
+ 
+ 		if (!page) {
+-			shmem_free_blocks(inode, 1);
+-			return ERR_PTR(-ENOMEM);
++			entry = ERR_PTR(-ENOMEM);
++			goto out_block;
+ 		}
+ 		if (sgp != SGP_WRITE &&
+ 		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ 			entry = ERR_PTR(-EINVAL);
+-			break;
++			goto out_dir;
+ 		}
+-		if (info->next_index <= index)
++		if (info->next_index <= index) {
++			ub_val = 0;
+ 			info->next_index = index + 1;
++		}
+ 	}
+ 	if (page) {
+ 		/* another task gave its page, or truncated the file */
+ 		shmem_free_blocks(inode, 1);
+ 		shmem_dir_free(page);
+ 	}
+-	if (info->next_index <= index && !IS_ERR(entry))
++	if (info->next_index <= index)
+ 		info->next_index = index + 1;
+ 	return entry;
++
++out_dir:
++	shmem_dir_free(page);
++out_block:
++	shmem_free_blocks(inode, 1);
++out:
++	if (ub_val)
++		ub_shmpages_uncharge(info, ub_val);
++	return entry;
+ }
+ 
+ /**
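
The rewritten shmem_swp_alloc() above replaces the scattered early returns with a single unwind ladder (out_dir/out_block/out), so the shmpages charged up front into ub_val are always uncharged on failure. A reduced user-space sketch of that ladder, with invented charge/reserve/alloc stubs:

/* Sketch of the unwind ladder added above: each label undoes exactly
 * the step acquired before the failure point.  The charge/reserve/alloc
 * functions are stubs invented for illustration. */
#include <stdio.h>

static int charge(long n)    { printf("charge %ld\n", n); return 0; }
static void uncharge(long n) { printf("uncharge %ld\n", n); }
static int reserve_block(void)  { puts("block reserved"); return 0; }
static void release_block(void) { puts("block released"); }
static void *alloc_dir(void)    { return NULL; /* simulate failure */ }

static void *swp_alloc(long ub_val)
{
	void *dir;

	if (ub_val && charge(ub_val))
		return NULL;
	if (reserve_block())
		goto out;
	dir = alloc_dir();
	if (!dir)
		goto out_block;
	return dir;

out_block:
	release_block();
out:
	if (ub_val)
		uncharge(ub_val);
	return NULL;
}

int main(void)
{
	return swp_alloc(4) ? 0 : 1;	/* exercises the failure unwind */
}
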
+@@ -543,6 +575,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+ 		return;
+ 
+ 	spin_lock(&info->lock);
++	ub_shmpages_uncharge(info, info->next_index - idx);
+ 	info->flags |= SHMEM_TRUNCATE;
+ 	if (likely(end == (loff_t) -1)) {
+ 		limit = info->next_index;
+@@ -729,7 +762,7 @@ done2:
+ 	info->swapped -= nr_swaps_freed;
+ 	if (nr_pages_to_free)
+ 		shmem_free_blocks(inode, nr_pages_to_free);
+-	shmem_recalc_inode(inode);
++	shmem_recalc_inode(inode, nr_swaps_freed);
+ 	spin_unlock(&info->lock);
+ 
+ 	/*
+@@ -812,6 +845,7 @@ static void shmem_delete_inode(struct inode *inode)
+ 		}
+ 	}
+ 	BUG_ON(inode->i_blocks);
++	shmi_ub_put(info);
+ 	shmem_free_inode(inode->i_sb);
+ 	clear_inode(inode);
+ }
+@@ -991,6 +1025,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
+ out:	return found;	/* 0 or 1 or -ENOMEM */
+ }
+ 
++#ifdef CONFIG_BEANCOUNTERS
++#define shm_get_swap_page(info)	(get_swap_page((info)->shmi_ub))
++#else
++#define shm_get_swap_page(info)	(get_swap_page(NULL))
++#endif
++
+ /*
+  * Move the page from the page cache to the swap cache.
+  */
+@@ -1021,7 +1061,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+ 	 * want to check if there's a redundant swappage to be discarded.
+ 	 */
+ 	if (wbc->for_reclaim)
+-		swap = get_swap_page();
++		swap = shm_get_swap_page(info);
+ 	else
+ 		swap.val = 0;
+ 
+@@ -1039,7 +1079,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+ 		free_swap_and_cache(*entry);
+ 		shmem_swp_set(info, entry, 0);
+ 	}
+-	shmem_recalc_inode(inode);
++	shmem_recalc_inode(inode, 0);
+ 
+ 	if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ 		remove_from_page_cache(page);
+@@ -1077,6 +1117,54 @@ redirty:
+ 	return 0;
+ }
+ 
++/* Insert a swap entry to shmem inode address space. */
++int shmem_insertpage(struct inode * inode, unsigned long index,
++		     swp_entry_t swap)
++{
++	struct shmem_inode_info *info;
++	swp_entry_t *entry;
++	int err;
++
++	info = SHMEM_I(inode);
++
++	spin_lock(&info->lock);
++	shmem_recalc_inode(inode, 0);
++	entry = shmem_swp_alloc(info, index, SGP_WRITE);
++	err = PTR_ERR(entry);
++	if (IS_ERR(entry))
++		goto unlock;
++
++	err = -EBUSY;
++	if (entry->val)
++		goto unlock_unmap;
++
++	err = -EINVAL;
++	if (!swap_duplicate(swap))
++		goto unlock_unmap;
++
++	info->alloced++;
++	ub_tmpfs_respages_inc(info);
++	inode->i_blocks += BLOCKS_PER_PAGE;
++	shmem_swp_set(info, entry, swap.val);
++	shmem_swp_unmap(entry);
++	spin_unlock(&info->lock);
++	if (list_empty(&info->swaplist)) {
++		mutex_lock(&shmem_swaplist_mutex);
++		/* move instead of add in case we're racing */
++		list_move_tail(&info->swaplist, &shmem_swaplist);
++		mutex_unlock(&shmem_swaplist_mutex);
++	}
++	return 0;
++
++unlock_unmap:
++	shmem_swp_unmap(entry);
++unlock:
++	spin_unlock(&info->lock);
++	return err;
++}
++EXPORT_SYMBOL(shmem_insertpage);
++
++
+ #ifdef CONFIG_NUMA
+ #ifdef CONFIG_TMPFS
+ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+@@ -1219,7 +1307,7 @@ repeat:
+ 	}
+ 
+ 	spin_lock(&info->lock);
+-	shmem_recalc_inode(inode);
++	shmem_recalc_inode(inode, 0);
+ 	entry = shmem_swp_alloc(info, idx, sgp);
+ 	if (IS_ERR(entry)) {
+ 		spin_unlock(&info->lock);
+@@ -1407,6 +1495,7 @@ repeat:
+ 		clear_highpage(filepage);
+ 		flush_dcache_page(filepage);
+ 		SetPageUptodate(filepage);
++		ub_tmpfs_respages_inc(info);
+ 		if (sgp == SGP_DIRTY)
+ 			set_page_dirty(filepage);
+ 	}
+@@ -1509,6 +1598,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
+ 		inode->i_generation = get_seconds();
+ 		info = SHMEM_I(inode);
+ 		memset(info, 0, (char *)inode - (char *)info);
++		shmi_ub_set(info, get_exec_ub());
+ 		spin_lock_init(&info->lock);
+ 		INIT_LIST_HEAD(&info->swaplist);
+ 
+@@ -2365,7 +2455,7 @@ static const struct address_space_operations shmem_aops = {
+ 	.migratepage	= migrate_page,
+ };
+ 
+-static const struct file_operations shmem_file_operations = {
++const struct file_operations shmem_file_operations = {
+ 	.mmap		= shmem_mmap,
+ #ifdef CONFIG_TMPFS
+ 	.llseek		= generic_file_llseek,
+@@ -2377,6 +2467,7 @@ static const struct file_operations shmem_file_operations = {
+ 	.splice_write	= generic_file_splice_write,
+ #endif
+ };
++EXPORT_SYMBOL_GPL(shmem_file_operations);
+ 
+ static const struct inode_operations shmem_inode_operations = {
+ 	.truncate	= shmem_truncate,
+@@ -2446,6 +2537,10 @@ static struct vm_operations_struct shmem_vm_ops = {
+ #endif
+ };
+ 
++int is_shmem_mapping(struct address_space *map)
++{
++	return (map != NULL && map->a_ops == &shmem_aops);
++}
+ 
+ static int shmem_get_sb(struct file_system_type *fs_type,
+ 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+@@ -2453,13 +2548,19 @@ static int shmem_get_sb(struct file_system_type *fs_type,
+ 	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+ }
+ 
+-static struct file_system_type tmpfs_fs_type = {
++struct file_system_type tmpfs_fs_type = {
+ 	.owner		= THIS_MODULE,
+ 	.name		= "tmpfs",
+ 	.get_sb		= shmem_get_sb,
+ 	.kill_sb	= kill_litter_super,
+ };
++EXPORT_SYMBOL(tmpfs_fs_type);
++
++#ifdef CONFIG_VE
++#define shm_mnt	(get_exec_env()->shmem_mnt)
++#else
+ static struct vfsmount *shm_mnt;
++#endif
+ 
+ static int __init init_tmpfs(void)
+ {
+@@ -2500,6 +2601,36 @@ out4:
+ }
+ module_init(init_tmpfs)
+ 
++static inline int shm_charge_ahead(struct inode *inode)
++{
++#ifdef CONFIG_BEANCOUNTERS
++	struct shmem_inode_info *info = SHMEM_I(inode);
++	unsigned long idx;
++	swp_entry_t *entry;
++
++	if (!inode->i_size)
++		return 0;
++	idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
++	/*
++	 * Just touch the info to allocate space for the entry
++	 * and run all the UBC checks.
++	 */
++	spin_lock(&info->lock);
++	entry = shmem_swp_alloc(info, idx, SGP_CACHE);
++	if (IS_ERR(entry))
++		goto err;
++	shmem_swp_unmap(entry);
++	spin_unlock(&info->lock);
++	return 0;
++
++err:
++	spin_unlock(&info->lock);
++	return PTR_ERR(entry);
++#else
++	return 0;
++#endif
++}
++
+ /**
+  * shmem_file_setup - get an unlinked file living in tmpfs
+  * @name: name for dentry (to be seen in /proc/<pid>/maps
+@@ -2546,6 +2677,9 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+ 	d_instantiate(dentry, inode);
+ 	inode->i_size = size;
+ 	inode->i_nlink = 0;	/* It is unlinked */
++	error = shm_charge_ahead(inode);
++	if (error)
++		goto close_file;
+ 	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+ 			&shmem_file_operations);
+ 	return file;
+@@ -2558,6 +2692,7 @@ put_memory:
+ 	shmem_unacct_size(flags, size);
+ 	return ERR_PTR(error);
+ }
++EXPORT_SYMBOL_GPL(shmem_file_setup);
+ 
+ /**
+  * shmem_zero_setup - setup a shared anonymous mapping
+@@ -2574,6 +2709,8 @@ int shmem_zero_setup(struct vm_area_struct *vma)
+ 
+ 	if (vma->vm_file)
+ 		fput(vma->vm_file);
++	else if (vma->vm_flags & VM_WRITE)
++		__ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT);
+ 	vma->vm_file = file;
+ 	vma->vm_ops = &shmem_vm_ops;
+ 	return 0;
+diff --git a/mm/slab.c b/mm/slab.c
+index 046607f..bf82112 100644
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -111,30 +111,14 @@
+ #include	<linux/rtmutex.h>
+ #include	<linux/reciprocal_div.h>
+ #include	<linux/debugobjects.h>
++#include	<linux/nmi.h>
++#include	<linux/vzstat.h>
+ 
+ #include	<asm/cacheflush.h>
+ #include	<asm/tlbflush.h>
+ #include	<asm/page.h>
+ 
+-/*
+- * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+- *		  0 for faster, smaller code (especially in the critical paths).
+- *
+- * STATS	- 1 to collect stats for /proc/slabinfo.
+- *		  0 for faster, smaller code (especially in the critical paths).
+- *
+- * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+- */
+-
+-#ifdef CONFIG_DEBUG_SLAB
+-#define	DEBUG		1
+-#define	STATS		1
+-#define	FORCED_DEBUG	1
+-#else
+-#define	DEBUG		0
+-#define	STATS		0
+-#define	FORCED_DEBUG	0
+-#endif
++#include	<bc/kmem.h>
+ 
+ /* Shouldn't this be in a header file somewhere? */
+ #define	BYTES_PER_WORD		sizeof(void *)
+@@ -169,19 +153,21 @@
+ #endif
+ 
+ /* Legal flag mask for kmem_cache_create(). */
+-#if DEBUG
++#if SLAB_DEBUG
+ # define CREATE_MASK	(SLAB_RED_ZONE | \
+ 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
+ 			 SLAB_CACHE_DMA | \
+ 			 SLAB_STORE_USER | \
+ 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
++			 SLAB_UBC | SLAB_NO_CHARGE | \
+ 			 SLAB_DEBUG_OBJECTS)
+ #else
+ # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
+ 			 SLAB_CACHE_DMA | \
+ 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+ 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
++			 SLAB_UBC | SLAB_NO_CHARGE | \
+ 			 SLAB_DEBUG_OBJECTS)
+ #endif
+ 
+@@ -371,87 +357,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
+ 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
+ 	} while (0)
+ 
+-/*
+- * struct kmem_cache
+- *
+- * manages a cache.
+- */
+-
+-struct kmem_cache {
+-/* 1) per-cpu data, touched during every alloc/free */
+-	struct array_cache *array[NR_CPUS];
+-/* 2) Cache tunables. Protected by cache_chain_mutex */
+-	unsigned int batchcount;
+-	unsigned int limit;
+-	unsigned int shared;
+-
+-	unsigned int buffer_size;
+-	u32 reciprocal_buffer_size;
+-/* 3) touched by every alloc & free from the backend */
+-
+-	unsigned int flags;		/* constant flags */
+-	unsigned int num;		/* # of objs per slab */
+-
+-/* 4) cache_grow/shrink */
+-	/* order of pgs per slab (2^n) */
+-	unsigned int gfporder;
+-
+-	/* force GFP flags, e.g. GFP_DMA */
+-	gfp_t gfpflags;
+-
+-	size_t colour;			/* cache colouring range */
+-	unsigned int colour_off;	/* colour offset */
+-	struct kmem_cache *slabp_cache;
+-	unsigned int slab_size;
+-	unsigned int dflags;		/* dynamic flags */
+-
+-	/* constructor func */
+-	void (*ctor)(struct kmem_cache *, void *);
+-
+-/* 5) cache creation/removal */
+-	const char *name;
+-	struct list_head next;
+-
+-/* 6) statistics */
+-#if STATS
+-	unsigned long num_active;
+-	unsigned long num_allocations;
+-	unsigned long high_mark;
+-	unsigned long grown;
+-	unsigned long reaped;
+-	unsigned long errors;
+-	unsigned long max_freeable;
+-	unsigned long node_allocs;
+-	unsigned long node_frees;
+-	unsigned long node_overflow;
+-	atomic_t allochit;
+-	atomic_t allocmiss;
+-	atomic_t freehit;
+-	atomic_t freemiss;
+-#endif
+-#if DEBUG
+-	/*
+-	 * If debugging is enabled, then the allocator can add additional
+-	 * fields and/or padding to every object. buffer_size contains the total
+-	 * object size including these internal fields, the following two
+-	 * variables contain the offset to the user object and its size.
+-	 */
+-	int obj_offset;
+-	int obj_size;
+-#endif
+-	/*
+-	 * We put nodelists[] at the end of kmem_cache, because we want to size
+-	 * this array to nr_node_ids slots instead of MAX_NUMNODES
+-	 * (see kmem_cache_init())
+-	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
+-	 * is statically defined, so we reserve the max number of nodes.
+-	 */
+-	struct kmem_list3 *nodelists[MAX_NUMNODES];
+-	/*
+-	 * Do not add fields after nodelists[]
+-	 */
+-};
+-
+ #define CFLGS_OFF_SLAB		(0x80000000UL)
+ #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
+ 
+@@ -466,12 +371,14 @@ struct kmem_cache {
+ #define REAPTIMEOUT_CPUC	(2*HZ)
+ #define REAPTIMEOUT_LIST3	(4*HZ)
+ 
+-#if STATS
++#define	STATS_INC_GROWN(x)	((x)->grown++)
++#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
++#define	STATS_INC_SHRUNK(x)	((x)->shrunk++)
++
++#if SLAB_STATS
+ #define	STATS_INC_ACTIVE(x)	((x)->num_active++)
+ #define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
+ #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
+-#define	STATS_INC_GROWN(x)	((x)->grown++)
+-#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
+ #define	STATS_SET_HIGH(x)						\
+ 	do {								\
+ 		if ((x)->num_active > (x)->high_mark)			\
+@@ -494,8 +401,6 @@ struct kmem_cache {
+ #define	STATS_INC_ACTIVE(x)	do { } while (0)
+ #define	STATS_DEC_ACTIVE(x)	do { } while (0)
+ #define	STATS_INC_ALLOCED(x)	do { } while (0)
+-#define	STATS_INC_GROWN(x)	do { } while (0)
+-#define	STATS_ADD_REAPED(x,y)	do { } while (0)
+ #define	STATS_SET_HIGH(x)	do { } while (0)
+ #define	STATS_INC_ERR(x)	do { } while (0)
+ #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
+@@ -508,7 +413,7 @@ struct kmem_cache {
+ #define STATS_INC_FREEMISS(x)	do { } while (0)
+ #endif
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 
+ /*
+  * memory layout of objects:
+@@ -640,6 +545,8 @@ struct cache_sizes malloc_sizes[] = {
+ #define CACHE(x) { .cs_size = (x) },
+ #include <linux/kmalloc_sizes.h>
+ 	CACHE(ULONG_MAX)
++#include <linux/kmalloc_sizes.h>
++	CACHE(ULONG_MAX)
+ #undef CACHE
+ };
+ EXPORT_SYMBOL(malloc_sizes);
+@@ -653,10 +560,17 @@ struct cache_names {
+ static struct cache_names __initdata cache_names[] = {
+ #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
+ #include <linux/kmalloc_sizes.h>
++	{NULL,},
++#undef CACHE
++#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" },
++#include <linux/kmalloc_sizes.h>
+ 	{NULL,}
+ #undef CACHE
+ };
+ 
++int malloc_cache_num;
++EXPORT_SYMBOL(malloc_cache_num);
++
+ static struct arraycache_init initarray_cache __initdata =
+     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
+ static struct arraycache_init initarray_generic =
+@@ -733,6 +647,7 @@ static inline void init_lock_keys(void)
+  */
+ static DEFINE_MUTEX(cache_chain_mutex);
+ static struct list_head cache_chain;
++static spinlock_t cache_chain_lock;
+ 
+ /*
+  * chicken and egg problem: delay the per-cpu array allocation
+@@ -765,7 +680,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
+ {
+ 	struct cache_sizes *csizep = malloc_sizes;
+ 
+-#if DEBUG
++	if (gfpflags & __GFP_UBC)
++		csizep += malloc_cache_num;
++#if SLAB_DEBUG
+ 	/* This happens if someone tries to call
+ 	 * kmem_cache_create(), or __kmalloc(), before
+ 	 * the generic caches are initialized.
+@@ -795,9 +712,98 @@ static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+ 	return __find_general_cachep(size, gfpflags);
+ }
+ 
+-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
++static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
++{
++	return (kmem_bufctl_t *) (slabp + 1);
++}
++
++#ifdef CONFIG_BEANCOUNTERS
++#define init_slab_ubps(cachep, slabp)	do {				\
++		if (!((cachep)->flags & SLAB_UBC))			\
++			break;						\
++		memset(slab_ubcs(cachep, slabp), 0,			\
++				(cachep)->num * sizeof(void *));	\
++	} while (0)
++
++#define UB_ALIGN(flags)		(flags & SLAB_UBC ? sizeof(void *) : 1)
++#define UB_EXTRA(flags)		(flags & SLAB_UBC ? sizeof(void *) : 0)
++#define set_cache_objuse(cachep)	do {				\
++		(cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) +	\
++				(cachep)->num - 1) / (cachep)->num;	\
++		if (!OFF_SLAB(cachep))					\
++			break;						\
++		(cachep)->objuse += ((cachep)->slabp_cache->objuse +	\
++				(cachep)->num - 1) / (cachep)->num;	\
++	} while (0)
++
++void kmem_mark_nocharge(struct kmem_cache *cachep)
++{
++	cachep->flags |= SLAB_NO_CHARGE;
++}
++
++int kmem_cache_objuse(struct kmem_cache *cachep)
++{
++	return cachep->objuse;
++}
++
++EXPORT_SYMBOL(kmem_cache_objuse);
++
++int kmem_obj_objuse(void *obj)
++{
++	return virt_to_cache(obj)->objuse;
++}
++
++int kmem_dname_objuse(void *obj)
++{
++	return virt_to_cache(obj)->objuse;
++}
++
++unsigned long ub_cache_growth(struct kmem_cache *cachep)
++{
++	return (cachep->grown - cachep->reaped - cachep->shrunk)
++		<< cachep->gfporder;
++}
++
++#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\
++		(ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\
++		       sizeof(void *))))
++
++struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj)
++{
++	struct slab *slabp;
++	int objnr;
++
++	BUG_ON(!(cachep->flags & SLAB_UBC));
++	slabp = virt_to_slab(obj);
++	objnr = (obj - slabp->s_mem) / cachep->buffer_size;
++	return slab_ubcs(cachep, slabp) + objnr;
++}
++
++struct user_beancounter *slab_ub(void *obj)
+ {
+-	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
++	return *ub_slab_ptr(virt_to_cache(obj), obj);
++}
++
++EXPORT_SYMBOL(slab_ub);
++
++#else
++#define UB_ALIGN(flags)		1
++#define UB_EXTRA(flags)		0
++#define set_cache_objuse(c)	do { } while (0)
++#define init_slab_ubps(c, s)	do { } while (0)
++#endif
++
++static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags)
++{
++	size_t size_noub;
++
++	size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
++	return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags);
++}
++
++static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags)
++{
++	return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align);
+ }
+ 
+ /*
+@@ -842,20 +848,23 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
+ 		 * into account.
+ 		 */
+ 		nr_objs = (slab_size - sizeof(struct slab)) /
+-			  (buffer_size + sizeof(kmem_bufctl_t));
++			  (buffer_size + sizeof(kmem_bufctl_t) +
++			   	UB_EXTRA(flags));
+ 
+ 		/*
+ 		 * This calculated number will be either the right
+ 		 * amount, or one greater than what we want.
+ 		 */
+-		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
+-		       > slab_size)
++		if (slab_mgmt_size(nr_objs, align, flags) +
++				nr_objs * buffer_size > slab_size)
+ 			nr_objs--;
++		BUG_ON(slab_mgmt_size(nr_objs, align, flags) +
++				nr_objs * buffer_size > slab_size);
+ 
+ 		if (nr_objs > SLAB_LIMIT)
+ 			nr_objs = SLAB_LIMIT;
+ 
+-		mgmt_size = slab_mgmt_size(nr_objs, align);
++		mgmt_size = slab_mgmt_size(nr_objs, align, flags);
+ 	}
+ 	*num = nr_objs;
+ 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
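
With SLAB_UBC set, each object in a slab carries a struct user_beancounter * alongside its kmem_bufctl_t, so the management area computed above becomes the old size aligned up to a pointer boundary plus one pointer per object. A sketch of slab_mgmt_size_noalign() with stand-in type sizes:

/* Sketch of slab_mgmt_size_noalign() above: the management area grows
 * by one beancounter pointer per object when the cache is SLAB_UBC.
 * The type sizes are stand-ins for struct slab / kmem_bufctl_t. */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

#define SLAB_HDR	32		/* stand-in for sizeof(struct slab) */
#define BUFCTL_SZ	sizeof(unsigned int)

static size_t mgmt_size_noalign(size_t nr_objs, int ubc)
{
	size_t size_noub = SLAB_HDR + nr_objs * BUFCTL_SZ;

	if (!ubc)
		return size_noub;
	/* align the bufctl array end, then append one UB pointer/object */
	return ALIGN_UP(size_noub, sizeof(void *)) + nr_objs * sizeof(void *);
}

int main(void)
{
	printf("plain: %zu\n", mgmt_size_noalign(16, 0));
	printf("ubc:   %zu\n", mgmt_size_noalign(16, 1));
	return 0;
}
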
+@@ -1403,6 +1412,7 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+ 	cachep->nodelists[nodeid] = ptr;
+ 	local_irq_enable();
+ }
++static int offslab_limit;
+ 
+ /*
+  * For setting up all the kmem_list3s for cache whose buffer_size is same as
+@@ -1476,6 +1486,7 @@ void __init kmem_cache_init(void)
+ 
+ 	/* 1) create the cache_cache */
+ 	INIT_LIST_HEAD(&cache_chain);
++	spin_lock_init(&cache_chain_lock);
+ 	list_add(&cache_cache.next, &cache_chain);
+ 	cache_cache.colour_off = cache_line_size();
+ 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+@@ -1487,7 +1498,7 @@ void __init kmem_cache_init(void)
+ 	 */
+ 	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
+ 				 nr_node_ids * sizeof(struct kmem_list3 *);
+-#if DEBUG
++#if SLAB_DEBUG
+ 	cache_cache.obj_size = cache_cache.buffer_size;
+ #endif
+ 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+@@ -1534,6 +1545,7 @@ void __init kmem_cache_init(void)
+ 
+ 	slab_early_init = 0;
+ 
++	for (i = 0; i < 2; i++) {
+ 	while (sizes->cs_size != ULONG_MAX) {
+ 		/*
+ 		 * For performance, all the general caches are L1 aligned.
+@@ -1546,21 +1558,30 @@ void __init kmem_cache_init(void)
+ 			sizes->cs_cachep = kmem_cache_create(names->name,
+ 					sizes->cs_size,
+ 					ARCH_KMALLOC_MINALIGN,
+-					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
++					ARCH_KMALLOC_FLAGS|SLAB_PANIC|
++					(i ? SLAB_UBC : 0)|SLAB_NO_CHARGE,
+ 					NULL);
+ 		}
++		if (!(OFF_SLAB(sizes->cs_cachep)))
++			offslab_limit = sizes->cs_size;
+ #ifdef CONFIG_ZONE_DMA
+-		sizes->cs_dmacachep = kmem_cache_create(
+-					names->name_dma,
++		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
+ 					sizes->cs_size,
+ 					ARCH_KMALLOC_MINALIGN,
+ 					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
++					(i ? SLAB_UBC : 0) | SLAB_NO_CHARGE|
+ 						SLAB_PANIC,
+ 					NULL);
+ #endif
+ 		sizes++;
+ 		names++;
+ 	}
++
++	sizes++;
++	names++;
++	if (!i)
++		malloc_cache_num = sizes - malloc_sizes;
++	}
+ 	/* 4) Replace the bootstrap head arrays */
+ 	{
+ 		struct array_cache *ptr;
+@@ -1730,7 +1751,7 @@ static void kmem_rcu_free(struct rcu_head *head)
+ 		kmem_cache_free(cachep->slabp_cache, slab_rcu);
+ }
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
+@@ -1807,7 +1828,7 @@ static void dump_line(char *data, int offset, int limit)
+ }
+ #endif
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 
+ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
+ {
+@@ -1900,7 +1921,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
+ }
+ #endif
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ /**
+  * slab_destroy_objs - destroy a slab and its objects
+  * @cachep: cache pointer being destroyed
+@@ -2008,7 +2029,6 @@ static void __kmem_cache_destroy(struct kmem_cache *cachep)
+ static size_t calculate_slab_order(struct kmem_cache *cachep,
+ 			size_t size, size_t align, unsigned long flags)
+ {
+-	unsigned long offslab_limit;
+ 	size_t left_over = 0;
+ 	int gfporder;
+ 
+@@ -2021,15 +2041,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
+ 			continue;
+ 
+ 		if (flags & CFLGS_OFF_SLAB) {
+-			/*
+-			 * Max number of objs-per-slab for caches which
+-			 * use off-slab slabs. Needed to avoid a possible
+-			 * looping condition in cache_grow().
+-			 */
+-			offslab_limit = size - sizeof(struct slab);
+-			offslab_limit /= sizeof(kmem_bufctl_t);
++			int slab_size;
+ 
+- 			if (num > offslab_limit)
++			slab_size = slab_mgmt_size_noalign(num, flags);
++			if (slab_size > offslab_limit)
+ 				break;
+ 		}
+ 
+@@ -2193,9 +2208,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 		}
+ 	}
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
+-#if FORCED_DEBUG
++#if SLAB_FORCED_DEBUG
+ 	/*
+ 	 * Enable redzoning and last user accounting, except for caches with
+ 	 * large objects, if the increased size would increase the object size
+@@ -2280,7 +2295,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 	if (!cachep)
+ 		goto oops;
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 	cachep->obj_size = size;
+ 
+ 	/*
+@@ -2302,7 +2317,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 		else
+ 			size += BYTES_PER_WORD;
+ 	}
+-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
++#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
+ 	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
+ 	    && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
+ 		cachep->obj_offset += PAGE_SIZE - size;
+@@ -2334,8 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 		cachep = NULL;
+ 		goto oops;
+ 	}
+-	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+-			  + sizeof(struct slab), align);
++	slab_size = slab_mgmt_size(cachep->num, align, flags);
+ 
+ 	/*
+ 	 * If the slab has been placed off-slab, and we have enough space then
+@@ -2348,8 +2362,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 
+ 	if (flags & CFLGS_OFF_SLAB) {
+ 		/* really off slab. No need for manual alignment */
+-		slab_size =
+-		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
++		slab_size = slab_mgmt_size_noalign(cachep->num, flags);
+ 	}
+ 
+ 	cachep->colour_off = cache_line_size();
+@@ -2386,7 +2399,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
+ 	}
+ 
+ 	/* cache setup completed, link it into the list */
++	spin_lock(&cache_chain_lock);
+ 	list_add(&cachep->next, &cache_chain);
++	spin_unlock(&cache_chain_lock);
++	set_cache_objuse(cachep);
+ oops:
+ 	if (!cachep && (flags & SLAB_PANIC))
+ 		panic("kmem_cache_create(): failed to create slab `%s'\n",
+@@ -2397,7 +2413,7 @@ oops:
+ }
+ EXPORT_SYMBOL(kmem_cache_create);
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ static void check_irq_off(void)
+ {
+ 	BUG_ON(!irqs_disabled());
+@@ -2493,10 +2509,11 @@ static int drain_freelist(struct kmem_cache *cache,
+ 		}
+ 
+ 		slabp = list_entry(p, struct slab, list);
+-#if DEBUG
++#if SLAB_DEBUG
+ 		BUG_ON(slabp->inuse);
+ #endif
+ 		list_del(&slabp->list);
++		STATS_INC_SHRUNK(cache);
+ 		/*
+ 		 * Safe to drop the lock. The slab is no longer linked
+ 		 * to the cache.
+@@ -2579,10 +2596,14 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
+ 	/*
+ 	 * the chain is never empty, cache_cache is never destroyed
+ 	 */
++	spin_lock(&cache_chain_lock);
+ 	list_del(&cachep->next);
++	spin_unlock(&cache_chain_lock);
+ 	if (__cache_shrink(cachep)) {
+ 		slab_error(cachep, "Can't free all objects");
++		spin_lock(&cache_chain_lock);
+ 		list_add(&cachep->next, &cache_chain);
++		spin_unlock(&cache_chain_lock);
+ 		mutex_unlock(&cache_chain_mutex);
+ 		put_online_cpus();
+ 		return;
+@@ -2591,6 +2612,8 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
+ 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+ 		synchronize_rcu();
+ 
++
++	ub_kmemcache_free(cachep);
+ 	__kmem_cache_destroy(cachep);
+ 	mutex_unlock(&cache_chain_mutex);
+ 	put_online_cpus();
+@@ -2617,7 +2640,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
+ 	if (OFF_SLAB(cachep)) {
+ 		/* Slab management obj is off-slab. */
+ 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
+-					      local_flags & ~GFP_THISNODE, nodeid);
++				local_flags & (~(__GFP_UBC | GFP_THISNODE)),
++				nodeid);
+ 		if (!slabp)
+ 			return NULL;
+ 	} else {
+@@ -2629,14 +2653,10 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
+ 	slabp->s_mem = objp + colour_off;
+ 	slabp->nodeid = nodeid;
+ 	slabp->free = 0;
++	init_slab_ubps(cachep, slabp);
+ 	return slabp;
+ }
+ 
+-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+-{
+-	return (kmem_bufctl_t *) (slabp + 1);
+-}
+-
+ static void cache_init_objs(struct kmem_cache *cachep,
+ 			    struct slab *slabp)
+ {
+@@ -2644,7 +2664,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
+ 
+ 	for (i = 0; i < cachep->num; i++) {
+ 		void *objp = index_to_obj(cachep, slabp, i);
+-#if DEBUG
++#if SLAB_DEBUG
+ 		/* need to poison the objs? */
+ 		if (cachep->flags & SLAB_POISON)
+ 			poison_obj(cachep, objp, POISON_FREE);
+@@ -2702,7 +2722,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+ 
+ 	slabp->inuse++;
+ 	next = slab_bufctl(slabp)[slabp->free];
+-#if DEBUG
++#if SLAB_DEBUG
+ 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ 	WARN_ON(slabp->nodeid != nodeid);
+ #endif
+@@ -2716,7 +2736,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+ {
+ 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 	/* Verify that the slab belongs to the intended node */
+ 	WARN_ON(slabp->nodeid != nodeid);
+ 
+@@ -2804,7 +2824,7 @@ static int cache_grow(struct kmem_cache *cachep,
+ 	 * 'nodeid'.
+ 	 */
+ 	if (!objp)
+-		objp = kmem_getpages(cachep, local_flags, nodeid);
++		objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid);
+ 	if (!objp)
+ 		goto failed;
+ 
+@@ -2837,7 +2857,7 @@ failed:
+ 	return 0;
+ }
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 
+ /*
+  * Perform extra freeing checks:
+@@ -3050,12 +3070,12 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+ 						gfp_t flags)
+ {
+ 	might_sleep_if(flags & __GFP_WAIT);
+-#if DEBUG
++#if SLAB_DEBUG
+ 	kmem_flagcheck(cachep, flags);
+ #endif
+ }
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+ 				gfp_t flags, void *objp, void *caller)
+ {
+@@ -3471,9 +3491,14 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+ 	cache_alloc_debugcheck_before(cachep, flags);
+ 	local_irq_save(save_flags);
+ 	objp = __do_cache_alloc(cachep, flags);
+-	local_irq_restore(save_flags);
+ 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
+ 	prefetchw(objp);
++	if (objp && should_charge(cachep, flags) &&
++			ub_slab_charge(cachep, objp, flags)) {
++		kmem_cache_free(cachep, objp);
++		objp = NULL;
++	}
++	local_irq_restore(save_flags);
+ 
+ 	if (unlikely((flags & __GFP_ZERO) && objp))
+ 		memset(objp, 0, obj_size(cachep));
+@@ -3507,6 +3532,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
+ 		/* fixup slab chains */
+ 		if (slabp->inuse == 0) {
+ 			if (l3->free_objects > l3->free_limit) {
++				STATS_INC_SHRUNK(cachep);
+ 				l3->free_objects -= cachep->num;
+ 				/* No need to drop any previously held
+ 				 * lock here, even if we have a off-slab slab
+@@ -3535,7 +3561,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
+ 	int node = numa_node_id();
+ 
+ 	batchcount = ac->batchcount;
+-#if DEBUG
++#if SLAB_DEBUG
+ 	BUG_ON(!batchcount || batchcount > ac->avail);
+ #endif
+ 	check_irq_off();
+@@ -3556,7 +3582,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
+ 
+ 	free_block(cachep, ac->entry, batchcount, node);
+ free_done:
+-#if STATS
++#if SLAB_STATS
+ 	{
+ 		int i = 0;
+ 		struct list_head *p;
+@@ -3590,6 +3616,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+ 	check_irq_off();
+ 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ 
++	if (should_uncharge(cachep))
++		ub_slab_uncharge(cachep, objp);
++
+ 	/*
+ 	 * Skip calling cache_free_alien() when the platform is not numa.
+ 	 * This will avoid cache misses that happen while accessing slabp (which
+@@ -3998,7 +4027,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
+ 	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
+ 		shared = 8;
+ 
+-#if DEBUG
++#if SLAB_DEBUG
+ 	/*
+ 	 * With debugging enabled, large batchcount lead to excessively long
+ 	 * periods with disabled local interrupts. Limit the batchcount
+@@ -4066,6 +4095,7 @@ static void cache_reap(struct work_struct *w)
+ 		/* Give up. Setup the next iteration. */
+ 		goto out;
+ 
++	{KSTAT_PERF_ENTER(cache_reap)
+ 	list_for_each_entry(searchp, &cache_chain, next) {
+ 		check_irq_on();
+ 
+@@ -4106,6 +4136,7 @@ next:
+ 	check_irq_on();
+ 	mutex_unlock(&cache_chain_mutex);
+ 	next_reap_node();
++	KSTAT_PERF_LEAVE(cache_reap)}
+ out:
+ 	/* Set up the next iteration */
+ 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+@@ -4119,7 +4150,7 @@ static void print_slabinfo_header(struct seq_file *m)
+ 	 * Output format version, so at least we can change it
+ 	 * without _too_ many complaints.
+ 	 */
+-#if STATS
++#if SLAB_STATS
+ 	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+ #else
+ 	seq_puts(m, "slabinfo - version: 2.1\n");
+@@ -4128,14 +4159,82 @@ static void print_slabinfo_header(struct seq_file *m)
+ 		 "<objperslab> <pagesperslab>");
+ 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+-#if STATS
++#if SLAB_STATS
+ 	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+-		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
++		 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow> <shrunk>");
+ 	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+ #endif
+ 	seq_putc(m, '\n');
+ }
+ 
++#define SHOW_TOP_SLABS	10
++
++static unsigned long get_cache_size(struct kmem_cache *cachep)
++{
++	unsigned long flags;
++	unsigned long slabs;
++	struct kmem_list3 *l3;
++	struct list_head *lh;
++	int node;
++
++	slabs = 0;
++
++	for_each_online_node (node) {
++		l3 = cachep->nodelists[node];
++		if (l3 == NULL)
++			continue;
++
++		spin_lock_irqsave(&l3->list_lock, flags);
++		list_for_each (lh, &l3->slabs_full)
++			slabs++;
++		list_for_each (lh, &l3->slabs_partial)
++			slabs++;
++		list_for_each (lh, &l3->slabs_free)
++			slabs++;
++		spin_unlock_irqrestore(&l3->list_lock, flags);
++	}
++
++	return slabs * (PAGE_SIZE << cachep->gfporder) +
++		(OFF_SLAB(cachep) ?
++		 cachep->slabp_cache->buffer_size * slabs : 0);
++}
++
++void show_slab_info(void)
++{
++	int i, j;
++	unsigned long size;
++	struct kmem_cache *ptr;
++	unsigned long sizes[SHOW_TOP_SLABS];
++	struct kmem_cache *top[SHOW_TOP_SLABS];
++
++	memset(top, 0, sizeof(top));
++	memset(sizes, 0, sizeof(sizes));
++
++	printk("Top %d caches:\n", SHOW_TOP_SLABS);
++
++	spin_lock(&cache_chain_lock);
++	list_for_each_entry (ptr, &cache_chain, next) {
++		size = get_cache_size(ptr);
++
++		j = 0;
++		for (i = 1; i < SHOW_TOP_SLABS; i++)
++			if (sizes[i] < sizes[j])
++				j = i;
++
++		if (size > sizes[j]) {
++			sizes[j] = size;
++			top[j] = ptr;
++		}
++	}
++
++	for (i = 0; i < SHOW_TOP_SLABS; i++)
++		if (top[i])
++			printk("%-21s: size %10lu objsize %10u\n",
++					top[i]->name, sizes[i],
++					top[i]->buffer_size);
++	spin_unlock(&cache_chain_lock);
++}
++
+ static void *s_start(struct seq_file *m, loff_t *pos)
+ {
+ 	loff_t n = *pos;
+@@ -4214,19 +4313,20 @@ static int s_show(struct seq_file *m, void *p)
+ 	if (error)
+ 		printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+ 
+-	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
++	seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d",
+ 		   name, active_objs, num_objs, cachep->buffer_size,
+ 		   cachep->num, (1 << cachep->gfporder));
+ 	seq_printf(m, " : tunables %4u %4u %4u",
+ 		   cachep->limit, cachep->batchcount, cachep->shared);
+ 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ 		   active_slabs, num_slabs, shared_avail);
+-#if STATS
++#if SLAB_STATS
+ 	{			/* list3 stats */
+ 		unsigned long high = cachep->high_mark;
+ 		unsigned long allocs = cachep->num_allocations;
+ 		unsigned long grown = cachep->grown;
+ 		unsigned long reaped = cachep->reaped;
++		unsigned long shrunk = cachep->shrunk;
+ 		unsigned long errors = cachep->errors;
+ 		unsigned long max_freeable = cachep->max_freeable;
+ 		unsigned long node_allocs = cachep->node_allocs;
+@@ -4234,9 +4334,10 @@ static int s_show(struct seq_file *m, void *p)
+ 		unsigned long overflows = cachep->node_overflow;
+ 
+ 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+-				%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
++				%4lu %4lu %4lu %4lu %4lu %4lu",
++				allocs, high, grown,
+ 				reaped, errors, max_freeable, node_allocs,
+-				node_frees, overflows);
++				node_frees, overflows, shrunk);
+ 	}
+ 	/* cpu stats */
+ 	{
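
show_slab_info() above keeps the SHOW_TOP_SLABS largest caches in a fixed array by overwriting the current minimum slot whenever a larger cache turns up: O(n*k) with no allocation, which matters since it runs from the OOM path. The selection step over a toy input:

/* The replace-the-minimum top-N selection used by show_slab_info()
 * above, run over a toy array of cache sizes. */
#include <stdio.h>
#include <string.h>

#define TOP	3

int main(void)
{
	unsigned long input[] = { 40, 900, 10, 300, 700, 5, 650 };
	unsigned long top[TOP];
	size_t n, i, j;

	memset(top, 0, sizeof(top));
	for (n = 0; n < sizeof(input) / sizeof(input[0]); n++) {
		/* find the current minimum slot ... */
		j = 0;
		for (i = 1; i < TOP; i++)
			if (top[i] < top[j])
				j = i;
		/* ... and replace it if this entry is larger */
		if (input[n] > top[j])
			top[j] = input[n];
	}
	for (i = 0; i < TOP; i++)
		printf("%lu\n", top[i]);	/* 900, 700, 650 in some order */
	return 0;
}
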
+diff --git a/mm/slub.c b/mm/slub.c
+index 315c392..ad802eb 100644
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -24,6 +24,8 @@
+ #include <linux/memory.h>
+ #include <linux/math64.h>
+ 
++#include <bc/kmem.h>
++
+ /*
+  * Lock order:
+  *   1. slab_lock(page)
+@@ -169,9 +171,11 @@ static inline void ClearSlabDebug(struct page *page)
+ 
+ /*
+  * Set of flags that will prevent slab merging
++ *
++ * FIXME - think over how to allow merging accountable slubs
+  */
+ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+-		SLAB_TRACE | SLAB_DESTROY_BY_RCU)
++		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_UBC)
+ 
+ #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
+ 		SLAB_CACHE_DMA)
+@@ -337,6 +341,95 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
+ 	return x.x & ((1 << 16) - 1);
+ }
+ 
++#ifdef CONFIG_BEANCOUNTERS
++static inline void inc_cache_grown(struct kmem_cache *s)
++{
++	atomic_inc(&s->grown);
++}
++
++static inline void dec_cache_grown(struct kmem_cache *s)
++{
++	atomic_dec(&s->grown);
++}
++
++unsigned long ub_cache_growth(struct kmem_cache *cachep)
++{
++	return atomic_read(&cachep->grown) << cachep->oo.x; /* XXX huh? */
++}
++
++static void __flush_cpu_slab(struct kmem_cache *s, int cpu);
++
++int kmem_cache_objuse(struct kmem_cache *cachep)
++{
++	return cachep->objuse;
++}
++
++EXPORT_SYMBOL(kmem_cache_objuse);
++
++int kmem_obj_objuse(void *obj)
++{
++	return kmem_cache_objuse(virt_to_head_page(obj)->slab);
++}
++
++EXPORT_SYMBOL(kmem_obj_objuse);
++
++int kmem_dname_objuse(void *obj)
++{
++	struct kmem_cache *s;
++
++	/*
++	 * Allocations larger than PAGE_SIZE/2 go directly through
++	 * __get_free_pages() and aren't associated with any cache.
++	 */
++	s = virt_to_head_page(obj)->slab;
++	if (!s)
++		return PAGE_SIZE;
++	return kmem_cache_objuse(s);
++}
++
++#define page_ubs(pg)	(pg->bc.slub_ubs)
++
++struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj)
++{
++	struct page *pg;
++
++	BUG_ON(!(s->flags & SLAB_UBC));
++	pg = virt_to_head_page(obj);
++	return page_ubs(pg) + slab_index(obj, s, page_address(pg));
++}
++
++EXPORT_SYMBOL(ub_slab_ptr);
++
++struct user_beancounter *slab_ub(void *obj)
++{
++	struct page *pg;
++
++	pg = virt_to_head_page(obj);
++	BUG_ON(!(pg->slab->flags & SLAB_UBC));
++	return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))];
++}
++
++EXPORT_SYMBOL(slab_ub);
++
++void kmem_mark_nocharge(struct kmem_cache *cachep)
++{
++	cachep->flags |= SLAB_NO_CHARGE;
++}
++#else
++static inline void inc_cache_grown(struct kmem_cache *s)
++{
++}
++
++static inline void dec_cache_grown(struct kmem_cache *s)
++{
++}
++#endif
++
++void show_slab_info(void)
++{
++	/* FIXME - show it */
++}
++
+ #ifdef CONFIG_SLUB_DEBUG
+ /*
+  * Debug settings:
+@@ -1106,6 +1199,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+ 	struct page *page;
+ 	struct kmem_cache_order_objects oo = s->oo;
+ 
++	flags &= ~__GFP_UBC;
+ 	flags |= s->allocflags;
+ 
+ 	page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
+@@ -1128,9 +1222,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+ 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ 		1 << oo_order(oo));
+ 
++	inc_cache_grown(s);
+ 	return page;
+ }
+ 
++static void __free_slab(struct kmem_cache *s, struct page *page);
++
+ static void setup_object(struct kmem_cache *s, struct page *page,
+ 				void *object)
+ {
+@@ -1153,6 +1250,18 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+ 	if (!page)
+ 		goto out;
+ 
++#ifdef CONFIG_BEANCOUNTERS
++	if (s->flags & SLAB_UBC) {
++		BUG_ON(page_ubs(page) != NULL);
++		page_ubs(page) = kzalloc(page->objects * sizeof(void *),
++				flags & ~__GFP_UBC);
++		if (page_ubs(page) == NULL) {
++			__free_slab(s, page);
++			page = NULL;
++			goto out;
++		}
++	}
++#endif
+ 	inc_slabs_node(s, page_to_nid(page), page->objects);
+ 	page->slab = s;
+ 	page->flags |= 1 << PG_slab;
+@@ -1202,6 +1311,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
+ 
+ 	__ClearPageSlab(page);
+ 	reset_page_mapcount(page);
++#ifdef CONFIG_BEANCOUNTERS
++	if (page_ubs(page) != NULL) {
++		BUG_ON(!(s->flags & SLAB_UBC));
++		kfree(page_ubs(page));
++		page_ubs(page) = NULL;
++	}
++#endif
+ 	__free_pages(page, order);
+ }
+ 
+@@ -1224,6 +1340,8 @@ static void free_slab(struct kmem_cache *s, struct page *page)
+ 		call_rcu(head, rcu_free_slab);
+ 	} else
+ 		__free_slab(s, page);
++
++	dec_cache_grown(s);
+ }
+ 
+ static void discard_slab(struct kmem_cache *s, struct page *page)
+@@ -1642,6 +1760,13 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
+ 		c->freelist = object[c->offset];
+ 		stat(c, ALLOC_FASTPATH);
+ 	}
++
++	if (object && should_charge(s, gfpflags) &&
++			ub_slab_charge(s, object, gfpflags)) {
++		kmem_cache_free(s, object);
++		object = NULL;
++	}
++
+ 	local_irq_restore(flags);
+ 
+ 	if (unlikely((gfpflags & __GFP_ZERO) && object))
+@@ -1752,6 +1877,9 @@ static __always_inline void slab_free(struct kmem_cache *s,
+ 	local_irq_save(flags);
+ 	c = get_cpu_slab(s, smp_processor_id());
+ 	debug_check_no_locks_freed(object, c->objsize);
++
++	if (should_uncharge(s))
++		ub_slab_uncharge(s, x);
+ 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ 		debug_check_no_obj_freed(object, s->objsize);
+ 	if (likely(page == c->page && c->node >= 0)) {
+@@ -2342,6 +2470,9 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+ #ifdef CONFIG_NUMA
+ 	s->remote_node_defrag_ratio = 100;
+ #endif
++#ifdef CONFIG_BEANCOUNTERS
++	s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo));
++#endif
+ 	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+ 		goto error;
+ 
+@@ -2496,6 +2627,10 @@ EXPORT_SYMBOL(kmem_cache_destroy);
+ 
+ struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
+ EXPORT_SYMBOL(kmalloc_caches);
++#ifdef CONFIG_BEANCOUNTERS
++struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
++EXPORT_SYMBOL(ub_kmalloc_caches);
++#endif
+ 
+ static int __init setup_slub_min_order(char *str)
+ {
+@@ -2537,6 +2672,11 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
+ {
+ 	unsigned int flags = 0;
+ 
++	if (gfp_flags & __GFP_UBC) {
++		flags = SLAB_UBC | SLAB_NO_CHARGE;
++		gfp_flags &= ~__GFP_UBC;
++	}
++
+ 	if (gfp_flags & SLUB_DMA)
+ 		flags = SLAB_CACHE_DMA;
+ 
+@@ -2666,11 +2806,14 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+ 		index = fls(size - 1);
+ 
+ #ifdef CONFIG_ZONE_DMA
+-	if (unlikely((flags & SLUB_DMA)))
++	if (unlikely((flags & SLUB_DMA))) {
++		BUG_ON(flags & __GFP_UBC);
+ 		return dma_kmalloc_cache(index, flags);
++	}
+ 
+ #endif
+-	return &kmalloc_caches[index];
++
++	return __kmalloc_cache(flags, index);
+ }
+ 
+ void *__kmalloc(size_t size, gfp_t flags)
+@@ -2984,6 +3127,11 @@ void __init kmem_cache_init(void)
+ 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
+ 		sizeof(struct kmem_cache_node), GFP_KERNEL);
+ 	kmalloc_caches[0].refcount = -1;
++#ifdef CONFIG_BEANCOUNTERS
++	create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc",
++		sizeof(struct kmem_cache_node), GFP_KERNEL_UBC);
++	ub_kmalloc_caches[0].refcount = -1;
++#endif
+ 	caches++;
+ 
+ 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+@@ -2996,15 +3144,27 @@ void __init kmem_cache_init(void)
+ 	if (KMALLOC_MIN_SIZE <= 64) {
+ 		create_kmalloc_cache(&kmalloc_caches[1],
+ 				"kmalloc-96", 96, GFP_KERNEL);
++#ifdef CONFIG_BEANCOUNTERS
++		create_kmalloc_cache(&ub_kmalloc_caches[1],
++				"kmalloc-96-ubc", 96, GFP_KERNEL_UBC);
++#endif
+ 		caches++;
+ 		create_kmalloc_cache(&kmalloc_caches[2],
+ 				"kmalloc-192", 192, GFP_KERNEL);
++#ifdef CONFIG_BEANCOUNTERS
++		create_kmalloc_cache(&ub_kmalloc_caches[2],
++				"kmalloc-192-ubc", 192, GFP_KERNEL_UBC);
++#endif
+ 		caches++;
+ 	}
+ 
+ 	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
+ 		create_kmalloc_cache(&kmalloc_caches[i],
+ 			"kmalloc", 1 << i, GFP_KERNEL);
++#ifdef CONFIG_BEANCOUNTERS
++		create_kmalloc_cache(&ub_kmalloc_caches[i],
++			"kmalloc-ubc", 1 << i, GFP_KERNEL_UBC);
++#endif
+ 		caches++;
+ 	}
+ 
+@@ -3039,9 +3199,14 @@ void __init kmem_cache_init(void)
+ 	slab_state = UP;
+ 
+ 	/* Provide the correct kmalloc names now that the caches are up */
+-	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
++	for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
+ 		kmalloc_caches[i]. name =
+ 			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
++#ifdef CONFIG_BEANCOUNTERS
++		ub_kmalloc_caches[i].name =
++			kasprintf(GFP_KERNEL, "kmalloc-%d-ubc", 1 << i);
++#endif
++	}
+ 
+ #ifdef CONFIG_SMP
+ 	register_cpu_notifier(&slab_notifier);
+@@ -4308,6 +4473,8 @@ static char *create_unique_id(struct kmem_cache *s)
+ 		*p++ = 'a';
+ 	if (s->flags & SLAB_DEBUG_FREE)
+ 		*p++ = 'F';
++	if (s->flags & SLAB_UBC)
++		*p++ = 'b';
+ 	if (p != name + 1)
+ 		*p++ = '-';
+ 	p += sprintf(p, "%07d", s->size);
+diff --git a/mm/swap.c b/mm/swap.c
+index 45c9f25..8160a2e 100644
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -209,6 +209,7 @@ void lru_cache_add_active(struct page *page)
+ 		__pagevec_lru_add_active(pvec);
+ 	put_cpu_var(lru_add_active_pvecs);
+ }
++EXPORT_SYMBOL(lru_cache_add_active);
+ 
+ /*
+  * Drain pages out of the cpu's pagevecs.
+@@ -244,6 +245,8 @@ void lru_add_drain(void)
+ 	put_cpu();
+ }
+ 
++EXPORT_SYMBOL(lru_add_drain);
++
+ #ifdef CONFIG_NUMA
+ static void lru_add_drain_per_cpu(struct work_struct *dummy)
+ {
+diff --git a/mm/swap_state.c b/mm/swap_state.c
+index d8aadaf..46cb3df 100644
+--- a/mm/swap_state.c
++++ b/mm/swap_state.c
+@@ -20,6 +20,9 @@
+ 
+ #include <asm/pgtable.h>
+ 
++#include <bc/vmpages.h>
++#include <bc/io_acct.h>
++
+ /*
+  * swapper_space is a fiction, retained to simplify the path through
+  * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
+@@ -44,6 +47,7 @@ struct address_space swapper_space = {
+ 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
+ 	.backing_dev_info = &swap_backing_dev_info,
+ };
++EXPORT_SYMBOL(swapper_space);
+ 
+ #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
+ 
+@@ -93,6 +97,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+ 	return error;
+ }
+ 
++EXPORT_SYMBOL(add_to_swap_cache);
++
+ /*
+  * This must be called only on pages that have
+  * been verified to be in the swap cache.
+@@ -129,7 +135,14 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
+ 	BUG_ON(!PageUptodate(page));
+ 
+ 	for (;;) {
+-		entry = get_swap_page();
++		struct user_beancounter *ub;
++
++		ub = pb_grab_page_ub(page);
++		if (IS_ERR(ub))
++			return 0;
++
++		entry = get_swap_page(ub);
++		put_beancounter(ub);
+ 		if (!entry.val)
+ 			return 0;
+ 
+@@ -313,6 +326,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ 	return found_page;
+ }
+ 
++EXPORT_SYMBOL(read_swap_cache_async);
++
+ /**
+  * swapin_readahead - swap in pages in hope we need them soon
+  * @entry: swap entry of this memory
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index bd1bb59..019db42 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -33,6 +33,8 @@
+ #include <asm/tlbflush.h>
+ #include <linux/swapops.h>
+ 
++#include <bc/vmpages.h>
++
+ DEFINE_SPINLOCK(swap_lock);
+ unsigned int nr_swapfiles;
+ long total_swap_pages;
+@@ -44,8 +46,12 @@ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+ 
+ struct swap_list_t swap_list = {-1, -1};
++struct swap_info_struct swap_info[MAX_SWAPFILES];
+ 
+-static struct swap_info_struct swap_info[MAX_SWAPFILES];
++EXPORT_SYMBOL(total_swap_pages);
++EXPORT_SYMBOL(swap_lock);
++EXPORT_SYMBOL(swap_list);
++EXPORT_SYMBOL(swap_info);
+ 
+ static DEFINE_MUTEX(swapon_mutex);
+ 
+@@ -172,7 +178,7 @@ no_page:
+ 	return 0;
+ }
+ 
+-swp_entry_t get_swap_page(void)
++swp_entry_t get_swap_page(struct user_beancounter *ub)
+ {
+ 	struct swap_info_struct *si;
+ 	pgoff_t offset;
+@@ -193,6 +199,8 @@ swp_entry_t get_swap_page(void)
+ 			wrapped++;
+ 		}
+ 
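++		/* Read-only swap devices accept no new swap entries. */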
++		if (si->flags & SWP_READONLY)
++			continue;
+ 		if (!si->highest_bit)
+ 			continue;
+ 		if (!(si->flags & SWP_WRITEOK))
+@@ -202,6 +210,7 @@ swp_entry_t get_swap_page(void)
+ 		offset = scan_swap_map(si);
+ 		if (offset) {
+ 			spin_unlock(&swap_lock);
++			ub_swapentry_inc(si, offset, ub);
+ 			return swp_entry(type, offset);
+ 		}
+ 		next = swap_list.next;
+@@ -213,6 +222,8 @@ noswap:
+ 	return (swp_entry_t) {0};
+ }
+ 
++EXPORT_SYMBOL(get_swap_page);
++
+ swp_entry_t get_swap_page_of_type(int type)
+ {
+ 	struct swap_info_struct *si;
+@@ -220,7 +231,7 @@ swp_entry_t get_swap_page_of_type(int type)
+ 
+ 	spin_lock(&swap_lock);
+ 	si = swap_info + type;
+-	if (si->flags & SWP_WRITEOK) {
++	if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) {
+ 		nr_swap_pages--;
+ 		offset = scan_swap_map(si);
+ 		if (offset) {
+@@ -277,6 +288,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+ 		count--;
+ 		p->swap_map[offset] = count;
+ 		if (!count) {
++			ub_swapentry_dec(p, offset);
+ 			if (offset < p->lowest_bit)
+ 				p->lowest_bit = offset;
+ 			if (offset > p->highest_bit)
+@@ -305,6 +317,8 @@ void swap_free(swp_entry_t entry)
+ 	}
+ }
+ 
++EXPORT_SYMBOL(swap_free);
++
+ /*
+  * How many references to page are currently swapped out?
+  */
+@@ -386,6 +400,55 @@ int remove_exclusive_swap_page(struct page *page)
+ 	return retval;
+ }
+ 
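++/*
++ * A relaxed variant of remove_exclusive_swap_page(): drop the swap cache
++ * entry and re-dirty the page, but only when swap is nearly full or the
++ * entry lives on a swap device marked SWP_READONLY.
++ */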
++int try_to_remove_exclusive_swap_page(struct page *page)
++{
++	int retval;
++	struct swap_info_struct * p;
++	swp_entry_t entry;
++
++	BUG_ON(PagePrivate(page));
++	BUG_ON(!PageLocked(page));
++
++	if (!PageSwapCache(page))
++		return 0;
++	if (PageWriteback(page))
++		return 0;
++	if (page_count(page) != 2) /* 2: us + cache */
++		return 0;
++
++	entry.val = page->private;
++	p = swap_info_get(entry);
++	if (!p)
++		return 0;
++
++	if (!vm_swap_full() &&
++			(p->flags & (SWP_ACTIVE|SWP_READONLY)) == SWP_ACTIVE) {
++		spin_unlock(&swap_lock);
++		return 0;
++	}
++
++	/* Is the only swap cache user the cache itself? */
++	retval = 0;
++	if (p->swap_map[swp_offset(entry)] == 1) {
++		/* Recheck the page count with the swapcache lock held.. */
++		write_lock_irq(&swapper_space.tree_lock);
++		if ((page_count(page) == 2) && !PageWriteback(page)) {
++			__delete_from_swap_cache(page);
++			SetPageDirty(page);
++			retval = 1;
++		}
++		write_unlock_irq(&swapper_space.tree_lock);
++	}
++	spin_unlock(&swap_lock);
++
++	if (retval) {
++		swap_free(entry);
++		page_cache_release(page);
++	}
++
++	return retval;
++}
++
+ /*
+  * Free the swap entry like above, but also try to
+  * free the page cache entry if it is the last user.
+@@ -425,6 +488,7 @@ void free_swap_and_cache(swp_entry_t entry)
+ 		page_cache_release(page);
+ 	}
+ }
++EXPORT_SYMBOL(free_swap_and_cache);
+ 
+ #ifdef CONFIG_HIBERNATION
+ /*
+@@ -508,11 +572,13 @@ unsigned int count_swap_pages(int type, int free)
+  * force COW, vm_page_prot omits write permission from any private vma.
+  */
+ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+-		unsigned long addr, swp_entry_t entry, struct page *page)
++		unsigned long addr, swp_entry_t entry, struct page *page,
++		struct page_beancounter **pb)
+ {
+ 	spinlock_t *ptl;
+ 	pte_t *pte;
+ 	int ret = 1;
++	struct mm_struct *mm = vma->vm_mm;
+ 
+ 	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ 		ret = -ENOMEM;
+@@ -525,9 +591,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+ 		goto out;
+ 	}
+ 
+-	inc_mm_counter(vma->vm_mm, anon_rss);
++	inc_mm_counter(mm, anon_rss);
++	ub_unused_privvm_dec(mm, vma);
++	pb_add_ref(page, mm, pb);
+ 	get_page(page);
+-	set_pte_at(vma->vm_mm, addr, pte,
++	set_pte_at(mm, addr, pte,
+ 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ 	page_add_anon_rmap(page, vma, addr);
+ 	swap_free(entry);
+@@ -543,7 +611,8 @@ out:
+ 
+ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ 				unsigned long addr, unsigned long end,
+-				swp_entry_t entry, struct page *page)
++				swp_entry_t entry, struct page *page,
++				struct page_beancounter **pb)
+ {
+ 	pte_t swp_pte = swp_entry_to_pte(entry);
+ 	pte_t *pte;
+@@ -566,7 +635,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ 		 */
+ 		if (unlikely(pte_same(*pte, swp_pte))) {
+ 			pte_unmap(pte);
+-			ret = unuse_pte(vma, pmd, addr, entry, page);
++			ret = unuse_pte(vma, pmd, addr, entry, page, pb);
+ 			if (ret)
+ 				goto out;
+ 			pte = pte_offset_map(pmd, addr);
+@@ -579,7 +648,8 @@ out:
+ 
+ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ 				unsigned long addr, unsigned long end,
+-				swp_entry_t entry, struct page *page)
++				swp_entry_t entry, struct page *page,
++				struct page_beancounter **pb)
+ {
+ 	pmd_t *pmd;
+ 	unsigned long next;
+@@ -590,7 +660,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ 		next = pmd_addr_end(addr, end);
+ 		if (pmd_none_or_clear_bad(pmd))
+ 			continue;
+-		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
++		ret = unuse_pte_range(vma, pmd, addr, next, entry, page, pb);
+ 		if (ret)
+ 			return ret;
+ 	} while (pmd++, addr = next, addr != end);
+@@ -599,7 +669,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ 
+ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ 				unsigned long addr, unsigned long end,
+-				swp_entry_t entry, struct page *page)
++				swp_entry_t entry, struct page *page,
++				struct page_beancounter **pb)
+ {
+ 	pud_t *pud;
+ 	unsigned long next;
+@@ -610,7 +681,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ 		next = pud_addr_end(addr, end);
+ 		if (pud_none_or_clear_bad(pud))
+ 			continue;
+-		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
++		ret = unuse_pmd_range(vma, pud, addr, next, entry, page, pb);
+ 		if (ret)
+ 			return ret;
+ 	} while (pud++, addr = next, addr != end);
+@@ -618,7 +689,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ }
+ 
+ static int unuse_vma(struct vm_area_struct *vma,
+-				swp_entry_t entry, struct page *page)
++				swp_entry_t entry, struct page *page,
++				struct page_beancounter **pb)
+ {
+ 	pgd_t *pgd;
+ 	unsigned long addr, end, next;
+@@ -640,7 +712,7 @@ static int unuse_vma(struct vm_area_struct *vma,
+ 		next = pgd_addr_end(addr, end);
+ 		if (pgd_none_or_clear_bad(pgd))
+ 			continue;
+-		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
++		ret = unuse_pud_range(vma, pgd, addr, next, entry, page, pb);
+ 		if (ret)
+ 			return ret;
+ 	} while (pgd++, addr = next, addr != end);
+@@ -648,7 +720,8 @@ static int unuse_vma(struct vm_area_struct *vma,
+ }
+ 
+ static int unuse_mm(struct mm_struct *mm,
+-				swp_entry_t entry, struct page *page)
++				swp_entry_t entry, struct page *page,
++				struct page_beancounter **pb)
+ {
+ 	struct vm_area_struct *vma;
+ 	int ret = 0;
+@@ -664,7 +737,7 @@ static int unuse_mm(struct mm_struct *mm,
+ 		lock_page(page);
+ 	}
+ 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+-		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
++		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page, pb)))
+ 			break;
+ 	}
+ 	up_read(&mm->mmap_sem);
+@@ -726,6 +799,7 @@ static int try_to_unuse(unsigned int type)
+ 	int retval = 0;
+ 	int reset_overflow = 0;
+ 	int shmem;
++	struct page_beancounter *pb;
+ 
+ 	/*
+ 	 * When searching mms for an entry, a good strategy is to
+@@ -778,6 +852,13 @@ static int try_to_unuse(unsigned int type)
+ 			break;
+ 		}
+ 
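++		/* Pre-allocate page beancounter references (presumably one
++		 * per mm this page may be mapped back into), so the unuse
++		 * path below cannot fail on allocation. */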
++		pb = NULL;
++		if (pb_alloc_all(&pb)) {
++			page_cache_release(page);
++			retval = -ENOMEM;
++			break;
++		}
++
+ 		/*
+ 		 * Don't hold on to start_mm if it looks like exiting.
+ 		 */
+@@ -800,6 +881,20 @@ static int try_to_unuse(unsigned int type)
+ 		lock_page(page);
+ 		wait_on_page_writeback(page);
+ 
++		/* If the read failed, we cannot map a not-uptodate page to
++		 * user space. Actually, we are in serious trouble: we do not
++		 * even know which process to kill. So the only remaining
++		 * option is to stop swapoff() and let someone kill processes
++		 * to zap the invalid pages.
++		 */
++		if (unlikely(!PageUptodate(page))) {
++			pb_free_list(&pb);
++			unlock_page(page);
++			page_cache_release(page);
++			retval = -EIO;
++			break;
++		}
++
+ 		/*
+ 		 * Remove all references to entry.
+ 		 * Whenever we reach init_mm, there's no address space
+@@ -811,7 +906,7 @@ static int try_to_unuse(unsigned int type)
+ 			if (start_mm == &init_mm)
+ 				shmem = shmem_unuse(entry, page);
+ 			else
+-				retval = unuse_mm(start_mm, entry, page);
++				retval = unuse_mm(start_mm, entry, page, &pb);
+ 		}
+ 		if (*swap_map > 1) {
+ 			int set_start_mm = (*swap_map >= swcount);
+@@ -841,7 +936,7 @@ static int try_to_unuse(unsigned int type)
+ 					set_start_mm = 1;
+ 					shmem = shmem_unuse(entry, page);
+ 				} else
+-					retval = unuse_mm(mm, entry, page);
++					retval = unuse_mm(mm, entry, page, &pb);
+ 				if (set_start_mm && *swap_map < swcount) {
+ 					mmput(new_start_mm);
+ 					atomic_inc(&mm->mm_users);
+@@ -862,6 +957,8 @@ static int try_to_unuse(unsigned int type)
+ 			retval = shmem;
+ 			break;
+ 		}
++
++		pb_free_list(&pb);
+ 		if (retval) {
+ 			unlock_page(page);
+ 			page_cache_release(page);
+@@ -1214,6 +1311,10 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
+ 	int i, type, prev;
+ 	int err;
+ 	
++	/* The VE admin check is just to be on the safe side: the admin may
++	 * affect swaps only if he has access to the special file, i.e. if he
++	 * has been granted access to the block device or if the swap file
++	 * lies in an area visible to him. */
+ 	if (!capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
+ 
+@@ -1313,6 +1414,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
+ 	spin_unlock(&swap_lock);
+ 	mutex_unlock(&swapon_mutex);
+ 	vfree(swap_map);
++	ub_swap_fini(p);
+ 	inode = mapping->host;
+ 	if (S_ISBLK(inode->i_mode)) {
+ 		struct block_device *bdev = I_BDEV(inode);
+@@ -1332,6 +1434,8 @@ out:
+ 	return err;
+ }
+ 
++EXPORT_SYMBOL(sys_swapoff);
++
+ #ifdef CONFIG_PROC_FS
+ /* iterator */
+ static void *swap_start(struct seq_file *swap, loff_t *pos)
+@@ -1426,7 +1530,7 @@ static const struct file_operations proc_swaps_operations = {
+ 
+ static int __init procswaps_init(void)
+ {
+-	proc_create("swaps", 0, NULL, &proc_swaps_operations);
++	proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations);
+ 	return 0;
+ }
+ __initcall(procswaps_init);
+@@ -1670,9 +1774,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+ 		goto bad_swap;
+ 	}
+ 
++	if (ub_swap_init(p, maxpages)) {
++		error = -ENOMEM;
++		goto bad_swap;
++	}
++
+ 	mutex_lock(&swapon_mutex);
+ 	spin_lock(&swap_lock);
+ 	p->flags = SWP_ACTIVE;
++	if (swap_flags & SWAP_FLAG_READONLY)
++		p->flags |= SWP_READONLY;
+ 	nr_swap_pages += nr_good_pages;
+ 	total_swap_pages += nr_good_pages;
+ 
+@@ -1732,6 +1843,8 @@ out:
+ 	return error;
+ }
+ 
++EXPORT_SYMBOL(sys_swapon);
++
+ void si_swapinfo(struct sysinfo *val)
+ {
+ 	unsigned int i;
+@@ -1791,6 +1904,8 @@ bad_file:
+ 	goto out;
+ }
+ 
++EXPORT_SYMBOL(swap_duplicate);
++
+ struct swap_info_struct *
+ get_swap_info_struct(unsigned type)
+ {
+diff --git a/mm/truncate.c b/mm/truncate.c
+index b8961cb..d2f3e40 100644
+--- a/mm/truncate.c
++++ b/mm/truncate.c
+@@ -77,6 +77,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
+ 					BDI_RECLAIMABLE);
+ 			if (account_size)
+ 				task_io_account_cancelled_write(account_size);
++			ub_io_release_context(page, account_size);
+ 		}
+ 	}
+ }
+diff --git a/mm/vmalloc.c b/mm/vmalloc.c
+index 6e45b0f..555f735 100644
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -22,6 +22,9 @@
+ #include <asm/uaccess.h>
+ #include <asm/tlbflush.h>
+ 
++#include <bc/kmem.h>
++#include <bc/debug.h>
++
+ 
+ DEFINE_RWLOCK(vmlist_lock);
+ struct vm_struct *vmlist;
+@@ -334,6 +337,70 @@ static struct vm_struct *__find_vm_area(const void *addr)
+ 	return tmp;
+ }
+ 
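++/*
++ * Best-fit variant of get_vm_area(): walk the address-sorted vmlist and
++ * pick the smallest free gap that still fits the request (plus the usual
++ * one-page guard gap), placing the area at the end of that gap.  This
++ * appears intended to reduce fragmentation of the vmalloc arena.
++ */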
++struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags)
++{
++	unsigned long addr, best_addr, delta, best_delta;
++	struct vm_struct **p, **best_p, *tmp, *area;
++
++	area = kmalloc(sizeof(*area), GFP_KERNEL);
++	if (!area)
++		return NULL;
++
++	size += PAGE_SIZE; /* one-page gap at the end */
++	addr = VMALLOC_START;
++	best_addr = 0UL;
++	best_p = NULL;
++	best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START;
++
++	write_lock(&vmlist_lock);
++	for (p = &vmlist; (tmp = *p) &&
++			(tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END));
++			p = &tmp->next) {
++		if ((unsigned long)tmp->addr < addr)
++			continue;
++		if ((size + addr) < addr)
++			break;
++		delta = (unsigned long) tmp->addr - (size + addr);
++		if (delta < best_delta) {
++			best_delta = delta;
++			best_addr = addr;
++			best_p = p;
++		}
++		addr = tmp->size + (unsigned long)tmp->addr;
++		if (addr > VMALLOC_END-size)
++			break;
++	}
++
++	if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) {
++		/* check free area after list end */
++		delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr);
++		if (delta < best_delta) {
++			best_delta = delta;
++			best_addr = addr;
++			best_p = p;
++		}
++	}
++	if (best_addr) {
++		area->flags = flags;
++		/* allocate at the end of this area */
++		area->addr = (void *)(best_addr + best_delta);
++		area->size = size;
++		area->next = *best_p;
++		area->pages = NULL;
++		area->nr_pages = 0;
++		area->phys_addr = 0;
++		*best_p = area;
++		/* check like in __vunmap */
++		WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr);
++	} else {
++		kfree(area);
++		area = NULL;
++	}
++	write_unlock(&vmlist_lock);
++
++	return area;
++}
++
+ /* Caller must hold vmlist_lock */
+ static struct vm_struct *__remove_vm_area(const void *addr)
+ {
+@@ -373,7 +440,7 @@ struct vm_struct *remove_vm_area(const void *addr)
+ 	return v;
+ }
+ 
+-static void __vunmap(const void *addr, int deallocate_pages)
++static void __vunmap(const void *addr, int deallocate_pages, int uncharge)
+ {
+ 	struct vm_struct *area;
+ 
+@@ -400,6 +467,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
+ 	if (deallocate_pages) {
+ 		int i;
+ 
++		if (uncharge)
++			dec_vmalloc_charged(area);
+ 		for (i = 0; i < area->nr_pages; i++) {
+ 			struct page *page = area->pages[i];
+ 
+@@ -430,7 +499,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
+ void vfree(const void *addr)
+ {
+ 	BUG_ON(in_interrupt());
+-	__vunmap(addr, 1);
++	__vunmap(addr, 1, 1);
+ }
+ EXPORT_SYMBOL(vfree);
+ 
+@@ -446,7 +515,7 @@ EXPORT_SYMBOL(vfree);
+ void vunmap(const void *addr)
+ {
+ 	BUG_ON(in_interrupt());
+-	__vunmap(addr, 0);
++	__vunmap(addr, 0, 0);
+ }
+ EXPORT_SYMBOL(vunmap);
+ 
+@@ -528,10 +597,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ 
+ 	if (map_vm_area(area, prot, &pages))
+ 		goto fail;
++
++	inc_vmalloc_charged(area, gfp_mask);
+ 	return area->addr;
+ 
+ fail:
+-	vfree(area->addr);
++	__vunmap(area->addr, 1, 0);
+ 	return NULL;
+ }
+ 
+@@ -578,6 +649,22 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+ }
+ EXPORT_SYMBOL(__vmalloc);
+ 
++static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot,
++		void *caller)
++{
++	struct vm_struct *area;
++
++	size = PAGE_ALIGN(size);
++	if (!size || (size >> PAGE_SHIFT) > num_physpages)
++		return NULL;
++
++	area = get_vm_area_best(size, VM_ALLOC);
++	if (!area)
++		return NULL;
++
++	return __vmalloc_area_node(area, mask, prot, -1, caller);
++}
++
+ /**
+  *	vmalloc  -  allocate virtually contiguous memory
+  *	@size:		allocation size
+@@ -594,6 +681,28 @@ void *vmalloc(unsigned long size)
+ }
+ EXPORT_SYMBOL(vmalloc);
+ 
++void *ub_vmalloc(unsigned long size)
++{
++	return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL);
++}
++EXPORT_SYMBOL(ub_vmalloc);
++
++void *vmalloc_best(unsigned long size)
++{
++	return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
++			__builtin_return_address(0));
++}
++
++EXPORT_SYMBOL(vmalloc_best);
++
++void *ub_vmalloc_best(unsigned long size)
++{
++	return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL,
++			__builtin_return_address(0));
++}
++
++EXPORT_SYMBOL(ub_vmalloc_best);
++
+ /**
+  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+  * @size: allocation size
+@@ -635,6 +744,13 @@ void *vmalloc_node(unsigned long size, int node)
+ }
+ EXPORT_SYMBOL(vmalloc_node);
+ 
++void *ub_vmalloc_node(unsigned long size, int node)
++{
++	return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL,
++					node, __builtin_return_address(0));
++}
++EXPORT_SYMBOL(ub_vmalloc_node);
++
+ #ifndef PAGE_KERNEL_EXEC
+ # define PAGE_KERNEL_EXEC PAGE_KERNEL
+ #endif
+@@ -898,6 +1014,39 @@ void free_vm_area(struct vm_struct *area)
+ }
+ EXPORT_SYMBOL_GPL(free_vm_area);
+ 
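++/*
++ * Debugging aid: print overall vmalloc arena usage, the number of mapped
++ * areas and the largest remaining free gap to the kernel log.
++ */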
++void vprintstat(void)
++{
++	struct vm_struct *p, *last_p = NULL;
++	unsigned long addr, size, free_size, max_free_size;
++	int num;
++
++	addr = VMALLOC_START;
++	size = max_free_size = 0;
++	num = 0;
++
++	read_lock(&vmlist_lock);
++	for (p = vmlist; p; p = p->next) {
++		free_size = (unsigned long)p->addr - addr;
++		if (free_size > max_free_size)
++			max_free_size = free_size;
++		addr = (unsigned long)p->addr + p->size;
++		size += p->size;
++		++num;
++		last_p = p;
++	}
++	if (last_p) {
++		free_size = VMALLOC_END -
++			((unsigned long)last_p->addr + last_p->size);
++		if (free_size > max_free_size)
++			max_free_size = free_size;
++	}
++	read_unlock(&vmlist_lock);
++
++	printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n"
++		"    Max_Free: %luKB Start: %lx End: %lx\n",
++		size/1024, (VMALLOC_END - VMALLOC_START)/1024, num,
++		max_free_size/1024, VMALLOC_START, VMALLOC_END);
++}
+ 
+ #ifdef CONFIG_PROC_FS
+ static void *s_start(struct seq_file *m, loff_t *pos)
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 967d30c..8e05cd3 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -39,10 +39,14 @@
+ #include <linux/freezer.h>
+ #include <linux/memcontrol.h>
+ 
++#include <bc/oom_kill.h>
++#include <bc/io_acct.h>
++
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+ 
+ #include <linux/swapops.h>
++#include <linux/vzstat.h>
+ 
+ #include "internal.h"
+ 
+@@ -177,6 +181,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ 	if (scanned == 0)
+ 		scanned = SWAP_CLUSTER_MAX;
+ 
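++	/* A task already picked by the OOM killer should exit and release
++	 * memory rather than spend time shrinking caches. */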
++	if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
++		return 1;
++
+ 	if (!down_read_trylock(&shrinker_rwsem))
+ 		return 1;	/* Assume we'll be able to shrink next time */
+ 
+@@ -211,6 +218,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ 			int shrink_ret;
+ 			int nr_before;
+ 
++			if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
++				goto done;
++
+ 			nr_before = (*shrinker->shrink)(0, gfp_mask);
+ 			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+ 			if (shrink_ret == -1)
+@@ -225,6 +235,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ 
+ 		shrinker->nr += total_scan;
+ 	}
++done:
+ 	up_read(&shrinker_rwsem);
+ 	return ret;
+ }
+@@ -338,6 +349,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
+ 		 */
+ 		if (PagePrivate(page)) {
+ 			if (try_to_free_buffers(page)) {
++				ub_io_release_context(page, 0);
+ 				ClearPageDirty(page);
+ 				printk("%s: orphaned page\n", __func__);
+ 				return PAGE_CLEAN;
+@@ -1073,6 +1085,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ 	if (sc->may_swap)
+ 		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
+ 
++	{KSTAT_PERF_ENTER(refill_inact)
+ 	lru_add_drain();
+ 	spin_lock_irq(&zone->lru_lock);
+ 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+@@ -1162,6 +1175,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ 	spin_unlock_irq(&zone->lru_lock);
+ 
+ 	pagevec_release(&pvec);
++	KSTAT_PERF_LEAVE(refill_inact)}
+ }
+ 
+ /*
+@@ -1214,6 +1228,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
+ 			nr_to_scan = min(nr_active,
+ 					(unsigned long)sc->swap_cluster_max);
+ 			nr_active -= nr_to_scan;
++			if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
++				goto done;
+ 			shrink_active_list(nr_to_scan, zone, sc, priority);
+ 		}
+ 
+@@ -1221,12 +1237,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
+ 			nr_to_scan = min(nr_inactive,
+ 					(unsigned long)sc->swap_cluster_max);
+ 			nr_inactive -= nr_to_scan;
++			if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
++				goto done;
+ 			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+ 								sc);
+ 		}
+ 	}
+ 
+ 	throttle_vm_writeout(sc->gfp_mask);
++done:
+ 	return nr_reclaimed;
+ }
+ 
+@@ -1282,6 +1301,9 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+ 		}
+ 
+ 		nr_reclaimed += shrink_zone(priority, zone, sc);
++
++		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
++			break;
+ 	}
+ 
+ 	return nr_reclaimed;
+@@ -1316,8 +1338,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ 	struct zone *zone;
+ 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+ 
++	KSTAT_PERF_ENTER(ttfp);
+ 	if (scan_global_lru(sc))
+ 		count_vm_event(ALLOCSTALL);
++
++	ub_oom_start();
+ 	/*
+ 	 * mem_cgroup will not do shrink_slab.
+ 	 */
+@@ -1367,6 +1392,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ 			sc->may_writepage = 1;
+ 		}
+ 
++		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) {
++			ret = 1;
++			goto out;
++		}
++
+ 		/* Take a nap, wait for some writeback to complete */
+ 		if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
+ 			congestion_wait(WRITE, HZ/10);
+@@ -1396,6 +1426,7 @@ out:
+ 	} else
+ 		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+ 
++	KSTAT_PERF_LEAVE(ttfp);
+ 	return ret;
+ }
+ 
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index db9eabb..fa80228 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -14,6 +14,40 @@
+ #include <linux/module.h>
+ #include <linux/cpu.h>
+ #include <linux/sched.h>
++#include <linux/virtinfo.h>
++
++void __get_zone_counts(unsigned long *active, unsigned long *inactive,
++			unsigned long *free, struct pglist_data *pgdat)
++{
++	struct zone *zones = pgdat->node_zones;
++	int i;
++
++	*active = 0;
++	*inactive = 0;
++	*free = 0;
++	for (i = 0; i < MAX_NR_ZONES; i++) {
++		*active += zone_page_state(&zones[i], NR_ACTIVE);
++		*inactive += zone_page_state(&zones[i], NR_INACTIVE);
++		*free += zone_page_state(&zones[i], NR_FREE_PAGES);
++	}
++}
++
++void get_zone_counts(unsigned long *active,
++		unsigned long *inactive, unsigned long *free)
++{
++	struct pglist_data *pgdat;
++
++	*active = 0;
++	*inactive = 0;
++	*free = 0;
++	for_each_online_pgdat(pgdat) {
++		unsigned long l, m, n;
++		__get_zone_counts(&l, &m, &n, pgdat);
++		*active += l;
++		*inactive += m;
++		*free += n;
++	}
++}
+ 
+ #ifdef CONFIG_VM_EVENT_COUNTERS
+ DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+@@ -34,6 +68,20 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+ 	}
+ }
+ 
++unsigned long vm_events(enum vm_event_item i)
++{
++	int cpu;
++	unsigned long sum;
++	struct vm_event_state *st;
++
++	sum = 0;
++	for_each_online_cpu(cpu) {
++		st = &per_cpu(vm_event_states, cpu);
++		sum += st->event[i];
++	}
++
++	return sum; /* sum is unsigned long, so it cannot be negative */
++}
+ /*
+  * Accumulate the vm event counters across all CPUs.
+  * The result is unavoidably approximate - it can change
+@@ -745,30 +793,40 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
+ 	unsigned long *v;
+ #ifdef CONFIG_VM_EVENT_COUNTERS
+ 	unsigned long *e;
++#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \
++				sizeof(struct vm_event_state))
++#else
++#define VMSTAT_BUFSIZE	(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long))
+ #endif
+ 	int i;
+ 
+ 	if (*pos >= ARRAY_SIZE(vmstat_text))
+ 		return NULL;
+ 
+-#ifdef CONFIG_VM_EVENT_COUNTERS
+-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
+-			+ sizeof(struct vm_event_state), GFP_KERNEL);
+-#else
+-	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
+-			GFP_KERNEL);
+-#endif
++	v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL);
+ 	m->private = v;
+ 	if (!v)
+ 		return ERR_PTR(-ENOMEM);
+-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+-		v[i] = global_page_state(i);
++
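++	/* Only the host environment (VE0) sees the global counters; inside a
++	 * container the buffer starts zeroed and the virtinfo notifier below
++	 * may fill in per-VE values. */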
++	if (ve_is_super(get_exec_env())) {
++		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
++			v[i] = global_page_state(i);
+ #ifdef CONFIG_VM_EVENT_COUNTERS
+-	e = v + NR_VM_ZONE_STAT_ITEMS;
+-	all_vm_events(e);
+-	e[PGPGIN] /= 2;		/* sectors -> kbytes */
+-	e[PGPGOUT] /= 2;
++		e = v + NR_VM_ZONE_STAT_ITEMS;
++		all_vm_events(e);
++		e[PGPGIN] /= 2;		/* sectors -> kbytes */
++		e[PGPGOUT] /= 2;
+ #endif
++	} else
++		memset(v, 0, VMSTAT_BUFSIZE);
++
++	if (virtinfo_notifier_call(VITYPE_GENERAL,
++				VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
++		kfree(v);
++		m->private = NULL;
++		return ERR_PTR(-ENOMSG);
++	}
++
+ 	return v + *pos;
+ }
+ 
+diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
+index ab2225d..27de02f 100644
+--- a/net/8021q/vlan.c
++++ b/net/8021q/vlan.c
+@@ -107,7 +107,7 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
+ {
+ 	struct vlan_group *grp;
+ 
+-	grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL);
++	grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC);
+ 	if (!grp)
+ 		return NULL;
+ 
+@@ -129,7 +129,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, int vid)
+ 		return 0;
+ 
+ 	size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
+-	array = kzalloc(size, GFP_KERNEL);
++	array = kzalloc(size, GFP_KERNEL_UBC);
+ 	if (array == NULL)
+ 		return -ENOBUFS;
+ 
+@@ -148,6 +148,7 @@ void unregister_vlan_dev(struct net_device *dev)
+ 	struct net_device *real_dev = vlan->real_dev;
+ 	struct vlan_group *grp;
+ 	unsigned short vlan_id = vlan->vlan_id;
++	struct ve_struct *env;
+ 
+ 	ASSERT_RTNL();
+ 
+@@ -179,7 +180,9 @@ void unregister_vlan_dev(struct net_device *dev)
+ 	/* Get rid of the vlan's reference to real_dev */
+ 	dev_put(real_dev);
+ 
++	env = set_exec_env(dev->owner_env);
+ 	unregister_netdevice(dev);
++	set_exec_env(env);
+ }
+ 
+ static void vlan_transfer_operstate(const struct net_device *dev,
+@@ -527,6 +530,17 @@ static struct notifier_block vlan_notifier_block __read_mostly = {
+ 	.notifier_call = vlan_device_event,
+ };
+ 
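++/* VLAN management is allowed to the host admin (CAP_NET_ADMIN) and, with
++ * CONFIG_VE, to a container admin holding CAP_VE_NET_ADMIN. */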
++static inline int vlan_check_caps(void)
++{
++	if (capable(CAP_NET_ADMIN))
++		return 1;
++#ifdef CONFIG_VE
++	if (capable(CAP_VE_NET_ADMIN))
++		return 1;
++#endif
++	return 0;
++}
++
+ /*
+  *	VLAN IOCTL handler.
+  *	o execute requested action or pass command to the device driver
+@@ -570,7 +584,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
+ 	switch (args.cmd) {
+ 	case SET_VLAN_INGRESS_PRIORITY_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		vlan_dev_set_ingress_priority(dev,
+ 					      args.u.skb_priority,
+@@ -580,7 +594,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
+ 
+ 	case SET_VLAN_EGRESS_PRIORITY_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		err = vlan_dev_set_egress_priority(dev,
+ 						   args.u.skb_priority,
+@@ -589,7 +603,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
+ 
+ 	case SET_VLAN_FLAG_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		err = vlan_dev_set_vlan_flag(dev,
+ 					     args.u.flag,
+@@ -598,7 +612,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
+ 
+ 	case SET_VLAN_NAME_TYPE_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		if ((args.u.name_type >= 0) &&
+ 		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+@@ -614,14 +628,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
+ 
+ 	case ADD_VLAN_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		err = register_vlan_device(dev, args.u.VID);
+ 		break;
+ 
+ 	case DEL_VLAN_CMD:
+ 		err = -EPERM;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!vlan_check_caps())
+ 			break;
+ 		unregister_vlan_dev(dev);
+ 		err = 0;
+diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
+index 5d055c2..a05a47e 100644
+--- a/net/8021q/vlan_dev.c
++++ b/net/8021q/vlan_dev.c
+@@ -361,6 +361,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
+ 
+ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
++	struct ve_struct *env;
+ 	struct net_device_stats *stats = &dev->stats;
+ 	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
+ 
+@@ -413,7 +414,10 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ 	stats->tx_bytes += skb->len;
+ 
+ 	skb->dev = vlan_dev_info(dev)->real_dev;
++	skb->owner_env = skb->dev->owner_env;
++	env = set_exec_env(skb->owner_env);
+ 	dev_queue_xmit(skb);
++	set_exec_env(env);
+ 
+ 	return 0;
+ }
+@@ -421,6 +425,7 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
+ 					    struct net_device *dev)
+ {
++	struct ve_struct *env;
+ 	struct net_device_stats *stats = &dev->stats;
+ 	unsigned short veth_TCI;
+ 
+@@ -438,7 +443,10 @@ static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
+ 	stats->tx_bytes += skb->len;
+ 
+ 	skb->dev = vlan_dev_info(dev)->real_dev;
++	skb->owner_env = skb->dev->owner_env;
++	env = set_exec_env(skb->owner_env);
+ 	dev_queue_xmit(skb);
++	set_exec_env(env);
+ 
+ 	return 0;
+ }
+@@ -725,4 +733,6 @@ void vlan_setup(struct net_device *dev)
+ 	dev->destructor		= free_netdev;
+ 
+ 	memset(dev->broadcast, 0, ETH_ALEN);
++	if (!ve_is_super(get_exec_env()))
++		dev->features |= NETIF_F_VIRTUAL;
+ }
+diff --git a/net/Kconfig b/net/Kconfig
+index acbf7c6..9aad03b 100644
+--- a/net/Kconfig
++++ b/net/Kconfig
+@@ -30,7 +30,7 @@ menu "Networking options"
+ config NET_NS
+ 	bool "Network namespace support"
+ 	default n
+-	depends on EXPERIMENTAL && !SYSFS && NAMESPACES
++	depends on EXPERIMENTAL && NAMESPACES
+ 	help
+ 	  Allow user space to create what appear to be multiple instances
+ 	  of the network stack.
+diff --git a/net/bridge/br.c b/net/bridge/br.c
+index 8f3c58e..8e51412 100644
+--- a/net/bridge/br.c
++++ b/net/bridge/br.c
+@@ -55,6 +55,7 @@ static int __init br_init(void)
+ 
+ 	brioctl_set(br_ioctl_deviceless_stub);
+ 	br_handle_frame_hook = br_handle_frame;
++	br_hard_xmit_hook = br_xmit;
+ 
+ 	br_fdb_get_hook = br_fdb_get;
+ 	br_fdb_put_hook = br_fdb_put;
+@@ -89,6 +90,7 @@ static void __exit br_deinit(void)
+ 	br_fdb_put_hook = NULL;
+ 
+ 	br_handle_frame_hook = NULL;
++	br_hard_xmit_hook = NULL;
+ 	br_fdb_fini();
+ }
+ 
+diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
+index bf77873..38f3441 100644
+--- a/net/bridge/br_device.c
++++ b/net/bridge/br_device.c
+@@ -40,16 +40,47 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+ 	skb_reset_mac_header(skb);
+ 	skb_pull(skb, ETH_HLEN);
+ 
++	skb->brmark = BR_ALREADY_SEEN;
++
+ 	if (dest[0] & 1)
+ 		br_flood_deliver(br, skb);
+ 	else if ((dst = __br_fdb_get(br, dest)) != NULL)
+-		br_deliver(dst->dst, skb);
++		br_deliver(dst->dst, skb, 1);
+ 	else
+ 		br_flood_deliver(br, skb);
+ 
+ 	return 0;
+ }
+ 
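++/*
++ * Transmit hook for via_phys_dev mode: a frame sent through a bridge
++ * port's physical device is also delivered to the other ports, using
++ * clones so that the original skb still goes out on the device itself.
++ */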
++int br_xmit(struct sk_buff *skb, struct net_bridge_port *port)
++{
++	struct net_bridge *br = port->br;
++	const unsigned char *dest = skb->data;
++	struct net_bridge_fdb_entry *dst;
++
++	if (!br->via_phys_dev)
++		return 0;
++
++	br->statistics.tx_packets++;
++	br->statistics.tx_bytes += skb->len;
++
++	skb_reset_mac_header(skb);
++	skb_pull(skb, ETH_HLEN);
++
++	skb->brmark = BR_ALREADY_SEEN;
++
++	if (dest[0] & 1)
++		br_xmit_deliver(br, port, skb);
++	else if ((dst = __br_fdb_get(br, dest)) != NULL)
++		br_deliver(dst->dst, skb, 0);
++	else
++		br_xmit_deliver(br, port, skb);
++
++	skb_push(skb, ETH_HLEN);
++
++	return 0;
++}
++
+ static int br_dev_open(struct net_device *dev)
+ {
+ 	struct net_bridge *br = netdev_priv(dev);
+diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
+index bdd7c35..2517cd4 100644
+--- a/net/bridge/br_forward.c
++++ b/net/bridge/br_forward.c
+@@ -78,14 +78,24 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+ }
+ 
+ /* called with rcu_read_lock */
+-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
++void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free)
+ {
+ 	if (should_deliver(to, skb)) {
++		if (!free) {
++			struct sk_buff *skb2;
++
++			if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
++				to->br->statistics.tx_dropped++;
++				return;
++			}
++			skb = skb2;
++		}
+ 		__br_deliver(to, skb);
+ 		return;
+ 	}
+ 
+-	kfree_skb(skb);
++	if (free)
++		kfree_skb(skb);
+ }
+ 
+ /* called with rcu_read_lock */
+@@ -101,6 +111,7 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+ 
+ /* called under bridge lock */
+ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
++	int free,
+ 	void (*__packet_hook)(const struct net_bridge_port *p,
+ 			      struct sk_buff *skb))
+ {
+@@ -132,18 +143,41 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ 		return;
+ 	}
+ 
+-	kfree_skb(skb);
++	if (free)
++		kfree_skb(skb);
+ }
+ 
+ 
+ /* called with rcu_read_lock */
+ void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
+ {
+-	br_flood(br, skb, __br_deliver);
++	br_flood(br, skb, 1, __br_deliver);
++}
++
++/* called with rcu_read_lock */
++void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port,
++						struct sk_buff *skb)
++{
++	struct net_bridge_port *p;
++
++	list_for_each_entry_rcu(p, &br->port_list, list) {
++		if (p == port)
++			continue;
++		if (should_deliver(p, skb)) {
++			struct sk_buff *skb2;
++
++			if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
++				br->statistics.tx_dropped++;
++				return;
++			}
++			__br_deliver(p, skb2);
++		}
++	}
+ }
+ 
+ /* called under bridge lock */
+ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb)
+ {
+-	br_flood(br, skb, __br_forward);
++	skb->brmark = BR_ALREADY_SEEN;
++	br_flood(br, skb, 1, __br_forward);
+ }
+diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
+index f38cc53..3dac8fc 100644
+--- a/net/bridge/br_if.c
++++ b/net/bridge/br_if.c
+@@ -14,6 +14,7 @@
+  */
+ 
+ #include <linux/kernel.h>
++#include <linux/nsproxy.h>
+ #include <linux/netdevice.h>
+ #include <linux/ethtool.h>
+ #include <linux/if_arp.h>
+@@ -160,6 +161,11 @@ static void del_br(struct net_bridge *br)
+ {
+ 	struct net_bridge_port *p, *n;
+ 
++	if (br->master_dev) {
++		dev_put(br->master_dev);
++		br->master_dev = NULL;
++	}
++
+ 	list_for_each_entry_safe(p, n, &br->port_list, list) {
+ 		del_nbp(p);
+ 	}
+@@ -299,7 +305,7 @@ int br_del_bridge(const char *name)
+ 	int ret = 0;
+ 
+ 	rtnl_lock();
+-	dev = __dev_get_by_name(&init_net, name);
++	dev = __dev_get_by_name(current->nsproxy->net_ns, name);
+ 	if (dev == NULL)
+ 		ret =  -ENXIO; 	/* Could not find device */
+ 
+@@ -400,6 +406,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
+ 	if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
+ 	    (br->dev->flags & IFF_UP))
+ 		br_stp_enable_port(p);
++	if (!(dev->features & NETIF_F_VIRTUAL)) {
++		dev_hold(dev);
++		br->master_dev = dev;
++	}
+ 	spin_unlock_bh(&br->lock);
+ 
+ 	br_ifinfo_notify(RTM_NEWLINK, p);
+@@ -435,6 +445,16 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
+ 	spin_lock_bh(&br->lock);
+ 	br_stp_recalculate_bridge_id(br);
+ 	br_features_recompute(br);
++	if (br->master_dev == dev) {
++		br->master_dev = NULL;
++		dev_put(dev);
++		list_for_each_entry(p, &br->port_list, list)
++			if (!(p->dev->features & NETIF_F_VIRTUAL)) {
++				dev_hold(p->dev);
++				br->master_dev = p->dev;
++				break;
++			}
++	}
+ 	spin_unlock_bh(&br->lock);
+ 
+ 	return 0;
+@@ -446,7 +466,7 @@ void __exit br_cleanup_bridges(void)
+ 
+ 	rtnl_lock();
+ restart:
+-	for_each_netdev(&init_net, dev) {
++	for_each_netdev(current->nsproxy->net_ns, dev) {
+ 		if (dev->priv_flags & IFF_EBRIDGE) {
+ 			del_br(dev->priv);
+ 			goto restart;
+diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
+index 255c00f..8809156 100644
+--- a/net/bridge/br_input.c
++++ b/net/bridge/br_input.c
+@@ -24,13 +24,20 @@ const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+ 
+ static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
+ {
+-	struct net_device *indev;
++	struct net_device *indev, *outdev;
+ 
+ 	br->statistics.rx_packets++;
+ 	br->statistics.rx_bytes += skb->len;
+ 
+ 	indev = skb->dev;
+-	skb->dev = br->dev;
++	if (!br->via_phys_dev)
++		skb->dev = br->dev;
++	else {
++		skb->brmark = BR_ALREADY_SEEN;
++		outdev = br->master_dev;
++		if (outdev)
++			skb->dev = outdev;
++	}
+ 
+ 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
+ 		netif_receive_skb);
+@@ -58,7 +65,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
+ 	/* The packet skb2 goes to the local host (NULL to skip). */
+ 	skb2 = NULL;
+ 
+-	if (br->dev->flags & IFF_PROMISC)
++	if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev)
+ 		skb2 = skb;
+ 
+ 	dst = NULL;
+@@ -147,6 +154,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+ 	}
+ 
+ 	switch (p->state) {
++		struct net_device *out;
++
+ 	case BR_STATE_FORWARDING:
+ 		rhook = rcu_dereference(br_should_route_hook);
+ 		if (rhook != NULL) {
+@@ -156,7 +165,12 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+ 		}
+ 		/* fall through */
+ 	case BR_STATE_LEARNING:
+-		if (!compare_ether_addr(p->br->dev->dev_addr, dest))
++		if (skb->brmark == BR_ALREADY_SEEN)
++			return 0;
++
++		out = p->br->via_phys_dev ? p->br->master_dev : p->br->dev;
++
++		if (out && !compare_ether_addr(p->br->dev->dev_addr, dest))
+ 			skb->pkt_type = PACKET_HOST;
+ 
+ 		NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
+diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
+index 0655a5f..be53554 100644
+--- a/net/bridge/br_ioctl.c
++++ b/net/bridge/br_ioctl.c
+@@ -17,6 +17,7 @@
+ #include <linux/kernel.h>
+ #include <linux/if_bridge.h>
+ #include <linux/netdevice.h>
++#include <linux/nsproxy.h>
+ #include <linux/times.h>
+ #include <net/net_namespace.h>
+ #include <asm/uaccess.h>
+@@ -28,7 +29,7 @@ static int get_bridge_ifindices(int *indices, int num)
+ 	struct net_device *dev;
+ 	int i = 0;
+ 
+-	for_each_netdev(&init_net, dev) {
++	for_each_netdev(current->nsproxy->net_ns, dev) {
+ 		if (i >= num)
+ 			break;
+ 		if (dev->priv_flags & IFF_EBRIDGE)
+@@ -91,7 +92,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
+ 	if (!capable(CAP_NET_ADMIN))
+ 		return -EPERM;
+ 
+-	dev = dev_get_by_index(&init_net, ifindex);
++	dev = dev_get_by_index(current->nsproxy->net_ns, ifindex);
+ 	if (dev == NULL)
+ 		return -EINVAL;
+ 
+@@ -142,6 +143,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+ 		b.root_port = br->root_port;
+ 
+ 		b.stp_enabled = (br->stp_enabled != BR_NO_STP);
++		b.via_phys_dev = br->via_phys_dev;
+ 		b.ageing_time = jiffies_to_clock_t(br->ageing_time);
+ 		b.hello_timer_value = br_timer_value(&br->hello_timer);
+ 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
+@@ -258,6 +260,13 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+ 		br_stp_set_enabled(br, args[1]);
+ 		return 0;
+ 
++	case BRCTL_SET_VIA_ORIG_DEV:
++		if (!capable(CAP_NET_ADMIN))
++			return -EPERM;
++
++		br->via_phys_dev = args[1] ? 1 : 0;
++		return 0;
++
+ 	case BRCTL_SET_BRIDGE_PRIORITY:
+ 		if (!capable(CAP_NET_ADMIN))
+ 			return -EPERM;
+diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
+index f155e6c..e7a1b78 100644
+--- a/net/bridge/br_netlink.c
++++ b/net/bridge/br_netlink.c
+@@ -11,6 +11,7 @@
+  */
+ 
+ #include <linux/kernel.h>
++#include <linux/nsproxy.h>
+ #include <net/rtnetlink.h>
+ #include <net/net_namespace.h>
+ #include <net/sock.h>
+@@ -97,10 +98,11 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
+ 		kfree_skb(skb);
+ 		goto errout;
+ 	}
+-	err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
++	err = rtnl_notify(skb, dev_net(port->dev),0, RTNLGRP_LINK,
++			NULL, GFP_ATOMIC);
+ errout:
+ 	if (err < 0)
+-		rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
++		rtnl_set_sk_err(dev_net(port->dev), RTNLGRP_LINK, err);
+ }
+ 
+ /*
+@@ -112,11 +114,8 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+ 	struct net_device *dev;
+ 	int idx;
+ 
+-	if (net != &init_net)
+-		return 0;
+-
+ 	idx = 0;
+-	for_each_netdev(&init_net, dev) {
++	for_each_netdev(net, dev) {
+ 		/* not a bridge port */
+ 		if (dev->br_port == NULL || idx < cb->args[0])
+ 			goto skip;
+@@ -165,7 +164,7 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
+ 	if (new_state > BR_STATE_BLOCKING)
+ 		return -EINVAL;
+ 
+-	dev = __dev_get_by_index(&init_net, ifm->ifi_index);
++	dev = __dev_get_by_index(current->nsproxy->net_ns, ifm->ifi_index);
+ 	if (!dev)
+ 		return -ENODEV;
+ 
+diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
+index 00644a5..7484a56 100644
+--- a/net/bridge/br_notify.c
++++ b/net/bridge/br_notify.c
+@@ -37,9 +37,6 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
+ 	struct net_bridge_port *p = dev->br_port;
+ 	struct net_bridge *br;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	/* not a port of a bridge */
+ 	if (p == NULL)
+ 		return NOTIFY_DONE;
+diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
+index c11b554..5acb2e4 100644
+--- a/net/bridge/br_private.h
++++ b/net/bridge/br_private.h
+@@ -90,6 +90,8 @@ struct net_bridge
+ 	spinlock_t			lock;
+ 	struct list_head		port_list;
+ 	struct net_device		*dev;
++	struct net_device		*master_dev;
++	unsigned char			via_phys_dev;
+ 	struct net_device_stats		statistics;
+ 	spinlock_t			hash_lock;
+ 	struct hlist_head		hash[BR_HASH_SIZE];
+@@ -139,6 +141,7 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
+ /* br_device.c */
+ extern void br_dev_setup(struct net_device *dev);
+ extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
++extern int br_xmit(struct sk_buff *skb, struct net_bridge_port *port);
+ 
+ /* br_fdb.c */
+ extern int br_fdb_init(void);
+@@ -165,12 +168,13 @@ extern void br_fdb_update(struct net_bridge *br,
+ 
+ /* br_forward.c */
+ extern void br_deliver(const struct net_bridge_port *to,
+-		struct sk_buff *skb);
++		struct sk_buff *skb, int free);
+ extern int br_dev_queue_push_xmit(struct sk_buff *skb);
+ extern void br_forward(const struct net_bridge_port *to,
+ 		struct sk_buff *skb);
+ extern int br_forward_finish(struct sk_buff *skb);
+ extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
++extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb);
+ extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
+ 
+ /* br_if.c */
+diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
+index ddeb6e5..e9f6b7e 100644
+--- a/net/bridge/br_stp_bpdu.c
++++ b/net/bridge/br_stp_bpdu.c
+@@ -142,9 +142,6 @@ int br_stp_rcv(struct sk_buff *skb, struct net_device *dev,
+ 	struct net_bridge *br;
+ 	const unsigned char *buf;
+ 
+-	if (dev_net(dev) != &init_net)
+-		goto err;
+-
+ 	if (!p)
+ 		goto err;
+ 
+diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
+index 27d6a51..0661fd0 100644
+--- a/net/bridge/br_sysfs_br.c
++++ b/net/bridge/br_sysfs_br.c
+@@ -172,6 +172,27 @@ static ssize_t store_stp_state(struct device *d,
+ static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
+ 		   store_stp_state);
+ 
++static ssize_t show_via_phys_dev_state(struct device *cd,
++				struct device_attribute *attr, char *buf)
++{
++	struct net_bridge *br = to_bridge(cd);
++	return sprintf(buf, "%d\n", br->via_phys_dev);
++}
++
++static void set_via_phys_dev_state(struct net_bridge *br, unsigned long val)
++{
++	br->via_phys_dev = val;
++}
++
++static ssize_t store_via_phys_dev_state(struct device *cd,
++		struct device_attribute *attr, const char *buf, size_t len)
++{
++	return store_bridge_parm(cd, buf, len, set_via_phys_dev_state);
++}
++
++static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state,
++			 store_via_phys_dev_state);
++
+ static ssize_t show_priority(struct device *d, struct device_attribute *attr,
+ 			     char *buf)
+ {
+@@ -340,6 +361,7 @@ static struct attribute *bridge_attrs[] = {
+ 	&dev_attr_max_age.attr,
+ 	&dev_attr_ageing_time.attr,
+ 	&dev_attr_stp_state.attr,
++	&dev_attr_via_phys_dev.attr,
+ 	&dev_attr_priority.attr,
+ 	&dev_attr_bridge_id.attr,
+ 	&dev_attr_root_id.attr,
+diff --git a/net/core/datagram.c b/net/core/datagram.c
+index 8a28fc9..d9e576c 100644
+--- a/net/core/datagram.c
++++ b/net/core/datagram.c
+@@ -56,6 +56,8 @@
+ #include <net/sock.h>
+ #include <net/tcp_states.h>
+ 
++#include <bc/net.h>
++
+ /*
+  *	Is a socket 'connection oriented' ?
+  */
+@@ -522,6 +524,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
+ {
+ 	struct sock *sk = sock->sk;
+ 	unsigned int mask;
++	int no_ubc_space;
+ 
+ 	poll_wait(file, sk->sk_sleep, wait);
+ 	mask = 0;
+@@ -531,8 +534,14 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
+ 		mask |= POLLERR;
+ 	if (sk->sk_shutdown & RCV_SHUTDOWN)
+ 		mask |= POLLRDHUP;
+-	if (sk->sk_shutdown == SHUTDOWN_MASK)
++	if (sk->sk_shutdown == SHUTDOWN_MASK) {
++		no_ubc_space = 0;
+ 		mask |= POLLHUP;
++	} else {
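++		/* With beancounters, the socket is writable only if its UB
++		 * still has send space; otherwise queue it to be woken up
++		 * when space becomes available. */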
++		no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++		if (no_ubc_space)
++			ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++	}
+ 
+ 	/* readable? */
+ 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+@@ -549,7 +558,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
+ 	}
+ 
+ 	/* writable? */
+-	if (sock_writeable(sk))
++	if (!no_ubc_space && sock_writeable(sk))
+ 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ 	else
+ 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+diff --git a/net/core/dev.c b/net/core/dev.c
+index fca23a3..26b529f 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -123,6 +123,9 @@
+ 
+ #include "net-sysfs.h"
+ 
++#include <bc/beancounter.h>
++#include <bc/kmem.h>
++
+ /*
+  *	The list of packet types we will receive (as opposed to discard)
+  *	and the routines to invoke.
+@@ -200,20 +203,6 @@ DEFINE_RWLOCK(dev_base_lock);
+ 
+ EXPORT_SYMBOL(dev_base_lock);
+ 
+-#define NETDEV_HASHBITS	8
+-#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
+-
+-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+-{
+-	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+-	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+-}
+-
+-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+-{
+-	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+-}
+-
+ /* Device list insertion */
+ static int list_netdevice(struct net_device *dev)
+ {
+@@ -1566,6 +1555,23 @@ static int dev_gso_segment(struct sk_buff *skb)
+ 	return 0;
+ }
+ 
++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
++int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
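++
++/* Pass frames transmitted directly through a bridge port's physical
++ * device to the bridge as well, unless the bridge already saw this skb. */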
++static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb,
++						struct net_device *dev)
++{
++	struct net_bridge_port *port;
++
++	if (((port = rcu_dereference(dev->br_port)) == NULL) ||
++		(skb->brmark == BR_ALREADY_SEEN))
++		return 0;
++
++	return br_hard_xmit_hook(skb, port);
++}
++#else
++#define bridge_hard_start_xmit(skb, dev)	(0)
++#endif
++
+ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ 	if (likely(!skb->next)) {
+@@ -1579,6 +1585,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ 				goto gso;
+ 		}
+ 
++		bridge_hard_start_xmit(skb, dev);
++
+ 		return dev->hard_start_xmit(skb, dev);
+ 	}
+ 
+@@ -1589,6 +1597,9 @@ gso:
+ 
+ 		skb->next = nskb->next;
+ 		nskb->next = NULL;
++
++		bridge_hard_start_xmit(skb, dev);
++
+ 		rc = dev->hard_start_xmit(nskb, dev);
+ 		if (unlikely(rc)) {
+ 			nskb->next = skb->next;
+@@ -2051,6 +2062,7 @@ int netif_receive_skb(struct sk_buff *skb)
+ 	struct net_device *orig_dev;
+ 	int ret = NET_RX_DROP;
+ 	__be16 type;
++	struct ve_struct *old_ve;
+ 
+ 	/* if we've gotten here through NAPI, check netpoll */
+ 	if (netpoll_receive_skb(skb))
+@@ -2073,6 +2085,16 @@ int netif_receive_skb(struct sk_buff *skb)
+ 	skb_reset_transport_header(skb);
+ 	skb->mac_len = skb->network_header - skb->mac_header;
+ 
++#ifdef CONFIG_VE
++	/*
++	 * Skb might be alloced in another VE context, than its device works.
++	 * So, set the correct owner_env.
++	 */
++	skb->owner_env = skb->dev->owner_env;
++	BUG_ON(skb->owner_env == NULL);
++#endif
++	old_ve = set_exec_env(skb->owner_env);
++
+ 	pt_prev = NULL;
+ 
+ 	rcu_read_lock();
+@@ -2133,6 +2155,7 @@ ncls:
+ 
+ out:
+ 	rcu_read_unlock();
++	(void)set_exec_env(old_ve);
+ 	return ret;
+ }
+ 
+@@ -2779,8 +2802,11 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc)
+ 		dev->flags &= ~IFF_PROMISC;
+ 	else
+ 		dev->flags |= IFF_PROMISC;
++	/* Promiscuous mode on these devices does not mean anything */
++	if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
++		return;
+ 	if (dev->flags != old_flags) {
+-		printk(KERN_INFO "device %s %s promiscuous mode\n",
++		ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n",
+ 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
+ 							       "left");
+ 		if (audit_enabled)
+@@ -3543,11 +3569,20 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+ 		 *	- require strict serialization.
+ 		 *	- do not return a value
+ 		 */
++		case SIOCSIFMTU:
++		case SIOCSIFHWADDR:
+ 		case SIOCSIFFLAGS:
++			if (!capable(CAP_NET_ADMIN) &&
++			    !capable(CAP_VE_NET_ADMIN))
++				return -EPERM;
++			dev_load(net, ifr.ifr_name);
++			rtnl_lock();
++			ret = dev_ifsioc(net, &ifr, cmd);
++			rtnl_unlock();
++			return ret;
++
+ 		case SIOCSIFMETRIC:
+-		case SIOCSIFMTU:
+ 		case SIOCSIFMAP:
+-		case SIOCSIFHWADDR:
+ 		case SIOCSIFSLAVE:
+ 		case SIOCADDMULTI:
+ 		case SIOCDELMULTI:
+@@ -3614,12 +3649,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+  */
+ static int dev_new_index(struct net *net)
+ {
+-	static int ifindex;
+ 	for (;;) {
+-		if (++ifindex <= 0)
+-			ifindex = 1;
+-		if (!__dev_get_by_index(net, ifindex))
+-			return ifindex;
++		if (++net->ifindex <= 0)
++			net->ifindex = 1;
++		if (!__dev_get_by_index(net, net->ifindex))
++			return net->ifindex;
+ 	}
+ }
+ 
+@@ -3722,6 +3756,10 @@ int register_netdevice(struct net_device *dev)
+ 	BUG_ON(!dev_net(dev));
+ 	net = dev_net(dev);
+ 
++	ret = -EPERM;
++	if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev))
++		goto out;
++
+ 	spin_lock_init(&dev->queue_lock);
+ 	spin_lock_init(&dev->_xmit_lock);
+ 	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
+@@ -3819,6 +3857,10 @@ int register_netdevice(struct net_device *dev)
+ 
+ 	set_bit(__LINK_STATE_PRESENT, &dev->state);
+ 
++	dev->owner_env = get_exec_env();
++	netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub());
++	netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub());
++
+ 	dev_init_scheduler(dev);
+ 	dev_hold(dev);
+ 	list_netdevice(dev);
+@@ -3952,6 +3994,7 @@ static DEFINE_MUTEX(net_todo_run_mutex);
+ void netdev_run_todo(void)
+ {
+ 	struct list_head list;
++	struct ve_struct *old_ve;
+ 
+ 	/* Need to guard against multiple cpu's getting out of order. */
+ 	mutex_lock(&net_todo_run_mutex);
+@@ -3969,6 +4012,7 @@ void netdev_run_todo(void)
+ 	list_replace_init(&net_todo_list, &list);
+ 	spin_unlock(&net_todo_list_lock);
+ 
++	old_ve = get_exec_env();
+ 	while (!list_empty(&list)) {
+ 		struct net_device *dev
+ 			= list_entry(list.next, struct net_device, todo_list);
+@@ -3981,6 +4025,7 @@ void netdev_run_todo(void)
+ 			continue;
+ 		}
+ 
++		(void)set_exec_env(dev->owner_env);
+ 		dev->reg_state = NETREG_UNREGISTERED;
+ 
+ 		netdev_wait_allrefs(dev);
+@@ -3991,13 +4036,21 @@ void netdev_run_todo(void)
+ 		BUG_TRAP(!dev->ip6_ptr);
+ 		BUG_TRAP(!dev->dn_ptr);
+ 
++		put_beancounter(netdev_bc(dev)->exec_ub);
++		put_beancounter(netdev_bc(dev)->owner_ub);
++		netdev_bc(dev)->exec_ub = NULL;
++		netdev_bc(dev)->owner_ub = NULL;
++
++		/* This must be the very last action;
++		 * after this, 'dev' may point to freed memory.
++		 */
+ 		if (dev->destructor)
+ 			dev->destructor(dev);
+ 
+ 		/* Free network device */
+ 		kobject_put(&dev->dev.kobj);
+ 	}
+-
++	(void)set_exec_env(old_ve);
+ out:
+ 	mutex_unlock(&net_todo_run_mutex);
+ }
+@@ -4037,7 +4090,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ 	/* ensure 32-byte alignment of whole construct */
+ 	alloc_size += NETDEV_ALIGN_CONST;
+ 
+-	p = kzalloc(alloc_size, GFP_KERNEL);
++	p = kzalloc(alloc_size, GFP_KERNEL_UBC);
+ 	if (!p) {
+ 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
+ 		return NULL;
+@@ -4153,11 +4206,15 @@ EXPORT_SYMBOL(unregister_netdev);
+  *	Callers must hold the rtnl semaphore.
+  */
+ 
+-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
++int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat,
++		struct ve_struct *src_ve, struct ve_struct *dst_ve,
++		struct user_beancounter *exec_ub)
+ {
+ 	char buf[IFNAMSIZ];
+ 	const char *destname;
+ 	int err;
++	struct ve_struct *cur_ve;
++	struct user_beancounter *tmp_ub;
+ 
+ 	ASSERT_RTNL();
+ 
+@@ -4208,6 +4265,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
+ 	err = -ENODEV;
+ 	unlist_netdevice(dev);
+ 
++	dev->owner_env = dst_ve;
++	tmp_ub = netdev_bc(dev)->exec_ub;
++	netdev_bc(dev)->exec_ub = get_beancounter(exec_ub);
++	put_beancounter(tmp_ub);
++
+ 	synchronize_net();
+ 
+ 	/* Shutdown queueing discipline. */
+@@ -4216,7 +4278,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
+ 	/* Notify protocols, that we are about to destroy
+ 	   this device. They should clean all the things.
+ 	*/
++	cur_ve = set_exec_env(src_ve);
+ 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
++	(void)set_exec_env(cur_ve);
+ 
+ 	/*
+ 	 *	Flush the unicast and multicast chains
+@@ -4247,7 +4311,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
+ 	list_netdevice(dev);
+ 
+ 	/* Notify protocols, that a new device appeared. */
++	cur_ve = set_exec_env(dst_ve);
+ 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
++	(void)set_exec_env(cur_ve);
+ 
+ 	synchronize_net();
+ 	err = 0;
+@@ -4255,6 +4321,14 @@ out:
+ 	return err;
+ }
+ 
++int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
++{
++	struct ve_struct *ve = get_exec_env();
++	struct user_beancounter *ub = get_exec_ub();
++
++	return __dev_change_net_namespace(dev, net, pat, ve, ve, ub);
++}
++
+ static int dev_cpu_callback(struct notifier_block *nfb,
+ 			    unsigned long action,
+ 			    void *ocpu)
+@@ -4460,7 +4534,7 @@ static struct hlist_head *netdev_create_hash(void)
+ 	int i;
+ 	struct hlist_head *hash;
+ 
+-	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
++	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC);
+ 	if (hash != NULL)
+ 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
+ 			INIT_HLIST_HEAD(&hash[i]);
+@@ -4604,6 +4678,7 @@ EXPORT_SYMBOL(__dev_remove_pack);
+ EXPORT_SYMBOL(dev_valid_name);
+ EXPORT_SYMBOL(dev_add_pack);
+ EXPORT_SYMBOL(dev_alloc_name);
++EXPORT_SYMBOL(__dev_change_net_namespace);
+ EXPORT_SYMBOL(dev_close);
+ EXPORT_SYMBOL(dev_get_by_flags);
+ EXPORT_SYMBOL(dev_get_by_index);
+@@ -4635,6 +4710,7 @@ EXPORT_SYMBOL(dev_get_flags);
+ 
+ #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+ EXPORT_SYMBOL(br_handle_frame_hook);
++EXPORT_SYMBOL(br_hard_xmit_hook);
+ EXPORT_SYMBOL(br_fdb_get_hook);
+ EXPORT_SYMBOL(br_fdb_put_hook);
+ #endif
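
A note on the idiom the dev.c hunks lean on: set_exec_env() installs a VE as
the current execution environment and hands back the previous one, so callers
bracket work with old = set_exec_env(new); ...; set_exec_env(old). A minimal
userspace model of that contract (per-task state in the kernel; the global
below is only a stand-in), with set_exec_ub() in later hunks following the
same swap-and-return shape:

    #include <stdio.h>

    struct ve_struct { int veid; };

    static struct ve_struct ve0 = { 0 };
    static struct ve_struct *exec_env = &ve0;   /* per-task in the kernel */

    /* Install new_env and hand back the previous environment. */
    static struct ve_struct *set_exec_env(struct ve_struct *new_env)
    {
            struct ve_struct *old = exec_env;
            exec_env = new_env;
            return old;
    }

    static void deliver_skb(void)
    {
            printf("delivering in VE %d\n", exec_env->veid);
    }

    int main(void)
    {
            struct ve_struct ve101 = { 101 };
            struct ve_struct *old_ve;

            old_ve = set_exec_env(&ve101);    /* as in netif_receive_skb() */
            deliver_skb();
            (void)set_exec_env(old_ve);       /* restore before returning */
            return 0;
    }
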
+diff --git a/net/core/dst.c b/net/core/dst.c
+index fe03266..ce92751 100644
+--- a/net/core/dst.c
++++ b/net/core/dst.c
+@@ -308,6 +308,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void
+ 	switch (event) {
+ 	case NETDEV_UNREGISTER:
+ 	case NETDEV_DOWN:
++		dst_gc_task(NULL);
+ 		mutex_lock(&dst_gc_mutex);
+ 		for (dst = dst_busy_list; dst; dst = dst->next) {
+ 			last = dst;
+diff --git a/net/core/ethtool.c b/net/core/ethtool.c
+index 0133b5e..770e607 100644
+--- a/net/core/ethtool.c
++++ b/net/core/ethtool.c
+@@ -828,7 +828,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
+ 	case ETHTOOL_GPFLAGS:
+ 		break;
+ 	default:
+-		if (!capable(CAP_NET_ADMIN))
++		if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ 			return -EPERM;
+ 	}
+ 
+diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
+index 277a230..ee4499f 100644
+--- a/net/core/fib_rules.c
++++ b/net/core/fib_rules.c
+@@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
+ {
+ 	struct fib_rule *r;
+ 
+-	r = kzalloc(ops->rule_size, GFP_KERNEL);
++	r = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
+ 	if (r == NULL)
+ 		return -ENOMEM;
+ 
+@@ -69,7 +69,7 @@ static void rules_ops_put(struct fib_rules_ops *ops)
+ static void flush_route_cache(struct fib_rules_ops *ops)
+ {
+ 	if (ops->flush_cache)
+-		ops->flush_cache();
++		ops->flush_cache(ops);
+ }
+ 
+ int fib_rules_register(struct fib_rules_ops *ops)
+@@ -238,7 +238,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+ 	if (err < 0)
+ 		goto errout;
+ 
+-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
++	rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
+ 	if (rule == NULL) {
+ 		err = -ENOMEM;
+ 		goto errout;
+diff --git a/net/core/filter.c b/net/core/filter.c
+index df37443..8a4933c 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -478,7 +478,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+ 	if (fprog->filter == NULL)
+ 		return -EINVAL;
+ 
+-	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
++	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC);
+ 	if (!fp)
+ 		return -ENOMEM;
+ 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+diff --git a/net/core/neighbour.c b/net/core/neighbour.c
+index 65f01f7..a423038 100644
+--- a/net/core/neighbour.c
++++ b/net/core/neighbour.c
+@@ -21,6 +21,7 @@
+ #include <linux/socket.h>
+ #include <linux/netdevice.h>
+ #include <linux/proc_fs.h>
++#include <linux/ve.h>
+ #ifdef CONFIG_SYSCTL
+ #include <linux/sysctl.h>
+ #endif
+@@ -35,6 +36,7 @@
+ #include <linux/random.h>
+ #include <linux/string.h>
+ #include <linux/log2.h>
++#include <bc/beancounter.h>
+ 
+ #define NEIGH_DEBUG 1
+ 
+@@ -264,6 +266,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+ 	int entries;
+ 
+ 	entries = atomic_inc_return(&tbl->entries) - 1;
++	n = ERR_PTR(-ENOBUFS);
+ 	if (entries >= tbl->gc_thresh3 ||
+ 	    (entries >= tbl->gc_thresh2 &&
+ 	     time_after(now, tbl->last_flush + 5 * HZ))) {
+@@ -274,7 +277,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+ 
+ 	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+ 	if (!n)
+-		goto out_entries;
++		goto out_nomem;
+ 
+ 	skb_queue_head_init(&n->arp_queue);
+ 	rwlock_init(&n->lock);
+@@ -291,6 +294,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+ out:
+ 	return n;
+ 
++out_nomem:
++	n = ERR_PTR(-ENOMEM);
+ out_entries:
+ 	atomic_dec(&tbl->entries);
+ 	goto out;
+@@ -409,12 +414,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
+ 	u32 hash_val;
+ 	int key_len = tbl->key_len;
+ 	int error;
+-	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
++	struct neighbour *n1, *rc, *n;
+ 
+-	if (!n) {
+-		rc = ERR_PTR(-ENOBUFS);
++	rc = n = neigh_alloc(tbl);
++	if (IS_ERR(n))
+ 		goto out;
+-	}
+ 
+ 	memcpy(n->primary_key, pkey, key_len);
+ 	n->dev = dev;
+@@ -736,10 +740,21 @@ static void neigh_periodic_timer(unsigned long arg)
+ 		if (atomic_read(&n->refcnt) == 1 &&
+ 		    (state == NUD_FAILED ||
+ 		     time_after(now, n->used + n->parms->gc_staletime))) {
++			struct net_device *dev = n->dev;
++			struct ve_struct *ve;
++			struct user_beancounter *ub;
++
+ 			*np = n->next;
+ 			n->dead = 1;
+ 			write_unlock(&n->lock);
++
++			ve = set_exec_env(dev->owner_env);
++			ub = set_exec_ub(netdev_bc(dev)->owner_ub);
++
+ 			neigh_cleanup_and_release(n);
++
++			set_exec_ub(ub);
++			set_exec_env(ve);
+ 			continue;
+ 		}
+ 		write_unlock(&n->lock);
+@@ -781,6 +796,11 @@ static void neigh_timer_handler(unsigned long arg)
+ 	struct neighbour *neigh = (struct neighbour *)arg;
+ 	unsigned state;
+ 	int notify = 0;
++	struct ve_struct *env;
++	struct user_beancounter *ub;
++
++	env = set_exec_env(neigh->dev->owner_env);
++	ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub);
+ 
+ 	write_lock(&neigh->lock);
+ 
+@@ -884,6 +904,8 @@ out:
+ 		neigh_update_notify(neigh);
+ 
+ 	neigh_release(neigh);
++	(void)set_exec_ub(ub);
++	(void)set_exec_env(env);
+ }
+ 
+ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+@@ -1272,9 +1294,16 @@ static void neigh_proxy_process(unsigned long arg)
+ 		if (tdif <= 0) {
+ 			struct net_device *dev = back->dev;
+ 			__skb_unlink(back, &tbl->proxy_queue);
+-			if (tbl->proxy_redo && netif_running(dev))
++			if (tbl->proxy_redo && netif_running(dev)) {
++				struct ve_struct *ve;
++				struct user_beancounter *ub;
++
++				ve = set_exec_env(dev->owner_env);
++				ub = set_exec_ub(netdev_bc(dev)->owner_ub);
+ 				tbl->proxy_redo(back);
+-			else
++				set_exec_ub(ub);
++				set_exec_env(ve);
++			} else
+ 				kfree_skb(back);
+ 
+ 			dev_put(dev);
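
The neigh_alloc() change above is the standard ERR_PTR conversion: instead of
collapsing every failure into NULL, the returned pointer encodes an errno, so
neigh_create() can report -ENOBUFS (table over the GC threshold) and -ENOMEM
(allocator failure) distinctly. ERR_PTR/IS_ERR/PTR_ERR come from linux/err.h;
a self-contained model of the encoding:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    /* Same trick as linux/err.h: the top MAX_ERRNO addresses are never
     * valid pointers, so an errno fits inside the pointer value. */
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *neigh_alloc_model(int entries, int gc_thresh)
    {
            void *n;

            if (entries >= gc_thresh)
                    return ERR_PTR(-ENOBUFS);   /* over threshold */
            n = malloc(64);
            if (!n)
                    return ERR_PTR(-ENOMEM);    /* allocator failure */
            return n;
    }

    int main(void)
    {
            void *n = neigh_alloc_model(10, 8);

            if (IS_ERR(n))
                    printf("neigh_alloc failed: %ld\n", PTR_ERR(n));
            else
                    free(n);
            return 0;
    }
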
+diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
+index 90e2177..1c484ee 100644
+--- a/net/core/net-sysfs.c
++++ b/net/core/net-sysfs.c
+@@ -229,6 +229,27 @@ static struct device_attribute net_class_attributes[] = {
+ 	{}
+ };
+ 
++#ifdef CONFIG_VE
++struct device_attribute ve_net_class_attributes[] = {
++	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
++	__ATTR(iflink, S_IRUGO, show_iflink, NULL),
++	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
++	__ATTR(features, S_IRUGO, show_features, NULL),
++	__ATTR(type, S_IRUGO, show_type, NULL),
++	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
++	__ATTR(address, S_IRUGO, show_address, NULL),
++	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
++	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
++	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
++	__ATTR(operstate, S_IRUGO, show_operstate, NULL),
++	__ATTR(mtu, S_IRUGO, show_mtu, NULL),
++	__ATTR(flags, S_IRUGO, show_flags, NULL),
++	__ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL),
++	{}
++};
++EXPORT_SYMBOL(ve_net_class_attributes);
++#endif
++
+ /* Show a given an attribute in the statistics group */
+ static ssize_t netstat_show(const struct device *d,
+ 			    struct device_attribute *attr, char *buf,
+@@ -421,7 +442,7 @@ static void netdev_release(struct device *d)
+ 	kfree((char *)dev - dev->padded);
+ }
+ 
+-static struct class net_class = {
++struct class net_class = {
+ 	.name = "net",
+ 	.dev_release = netdev_release,
+ #ifdef CONFIG_SYSFS
+@@ -431,6 +452,13 @@ static struct class net_class = {
+ 	.dev_uevent = netdev_uevent,
+ #endif
+ };
++EXPORT_SYMBOL(net_class);
++
++#ifndef CONFIG_VE
++#define visible_net_class net_class
++#else
++#define visible_net_class (*get_exec_env()->net_class)
++#endif
+ 
+ /* Delete sysfs entries but hold kobject reference until after all
+  * netdev references are gone.
+@@ -449,7 +477,7 @@ int netdev_register_kobject(struct net_device *net)
+ 	struct device *dev = &(net->dev);
+ 	struct attribute_group **groups = net->sysfs_groups;
+ 
+-	dev->class = &net_class;
++	dev->class = &visible_net_class;
+ 	dev->platform_data = net;
+ 	dev->groups = groups;
+ 
+@@ -475,7 +503,15 @@ void netdev_initialize_kobject(struct net_device *net)
+ 	device_initialize(device);
+ }
+ 
++void prepare_sysfs_netdev(void)
++{
++#ifdef CONFIG_VE
++	get_ve0()->net_class = &net_class;
++#endif
++}
++
+ int netdev_kobject_init(void)
+ {
++	prepare_sysfs_netdev();
+ 	return class_register(&net_class);
+ }
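
visible_net_class is the pivot of the net-sysfs.c change: on !CONFIG_VE
kernels it is the global class, on VE kernels it dereferences the
per-container class pointer, so netdev_register_kobject() itself stays free
of #ifdefs. A sketch of the indirection in isolation (here the container
simply reuses the global class; the patch sets up per-VE classes elsewhere):

    #include <stdio.h>

    #define CONFIG_VE

    struct class { const char *name; };

    static struct class net_class = { "net" };

    #ifdef CONFIG_VE
    struct ve_struct { struct class *net_class; };
    static struct ve_struct ve0 = { &net_class };
    static struct ve_struct *get_exec_env(void) { return &ve0; }
    #define visible_net_class (*get_exec_env()->net_class)
    #else
    #define visible_net_class net_class
    #endif

    int main(void)
    {
            /* netdev_register_kobject(): dev->class = &visible_net_class; */
            printf("registering against class '%s'\n",
                   (&visible_net_class)->name);
            return 0;
    }
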
+diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
+index 7c52fe2..e1aeb0e 100644
+--- a/net/core/net_namespace.c
++++ b/net/core/net_namespace.c
+@@ -1,6 +1,7 @@
+ #include <linux/workqueue.h>
+ #include <linux/rtnetlink.h>
+ #include <linux/cache.h>
++#include <linux/proc_fs.h>
+ #include <linux/slab.h>
+ #include <linux/list.h>
+ #include <linux/delay.h>
+@@ -34,6 +35,10 @@ static __net_init int setup_net(struct net *net)
+ 	int error;
+ 	struct net_generic *ng;
+ 
++#ifdef CONFIG_VE
++	net->owner_ve = get_exec_env();
++#endif
++
+ 	atomic_set(&net->count, 1);
+ #ifdef NETNS_REFCNT_DEBUG
+ 	atomic_set(&net->use_count, 0);
+@@ -85,6 +90,8 @@ static struct net *net_alloc(void)
+ 
+ static void net_free(struct net *net)
+ {
++	struct completion *sysfs_completion;
++
+ 	if (!net)
+ 		return;
+ 
+@@ -96,7 +103,10 @@ static void net_free(struct net *net)
+ 	}
+ #endif
+ 
++	sysfs_completion = net->sysfs_completion;
+ 	kmem_cache_free(net_cachep, net);
++	if (sysfs_completion)
++		complete(sysfs_completion);
+ }
+ 
+ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+@@ -139,6 +149,7 @@ static void cleanup_net(struct work_struct *work)
+ {
+ 	struct pernet_operations *ops;
+ 	struct net *net;
++	struct ve_struct *old_ve;
+ 
+ 	/* Be very certain incoming network packets will not find us */
+ 	rcu_barrier();
+@@ -152,11 +163,13 @@ static void cleanup_net(struct work_struct *work)
+ 	list_del(&net->list);
+ 	rtnl_unlock();
+ 
++	old_ve = set_exec_env(net->owner_ve);
+ 	/* Run all of the network namespace exit methods */
+ 	list_for_each_entry_reverse(ops, &pernet_list, list) {
+ 		if (ops->exit)
+ 			ops->exit(net);
+ 	}
++	(void)set_exec_env(old_ve);
+ 
+ 	mutex_unlock(&net_mutex);
+ 
+diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
+index a9a7721..7e10b49 100644
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -1205,6 +1205,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+ 		if (rtnl_msg_handlers[idx] == NULL ||
+ 		    rtnl_msg_handlers[idx][type].dumpit == NULL)
+ 			continue;
++		if (vz_security_family_check(idx))
++			continue;
+ 		if (idx > s_idx)
+ 			memset(&cb->args[0], 0, sizeof(cb->args));
+ 		if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
+@@ -1265,13 +1267,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+ 		return 0;
+ 
+ 	family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+-	if (family >= NPROTO)
++	if (family >= NPROTO || vz_security_family_check(family))
+ 		return -EAFNOSUPPORT;
+ 
+ 	sz_idx = type>>2;
+ 	kind = type&3;
+ 
+-	if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+diff --git a/net/core/scm.c b/net/core/scm.c
+index 10f5c65..65e0983 100644
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -36,6 +36,7 @@
+ #include <net/compat.h>
+ #include <net/scm.h>
+ 
++#include <bc/kmem.h>
+ 
+ /*
+  *	Only allow a user to send credentials, that they could set with
+@@ -44,7 +45,9 @@
+ 
+ static __inline__ int scm_check_creds(struct ucred *creds)
+ {
+-	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
++	if ((creds->pid == task_tgid_vnr(current) ||
++	     creds->pid == current->tgid ||
++	     capable(CAP_VE_SYS_ADMIN)) &&
+ 	    ((creds->uid == current->uid || creds->uid == current->euid ||
+ 	      creds->uid == current->suid) || capable(CAP_SETUID)) &&
+ 	    ((creds->gid == current->gid || creds->gid == current->egid ||
+@@ -71,7 +74,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+ 
+ 	if (!fpl)
+ 	{
+-		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
++		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC);
+ 		if (!fpl)
+ 			return -ENOMEM;
+ 		*fplp = fpl;
+@@ -282,7 +285,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+ 	if (!fpl)
+ 		return NULL;
+ 
+-	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
++	new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL_UBC);
+ 	if (new_fpl) {
+ 		for (i=fpl->count-1; i>=0; i--)
+ 			get_file(fpl->fp[i]);
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index 3666216..b82442c 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -67,6 +67,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+ 
++#include <bc/net.h>
++
+ #include "kmap_skb.h"
+ 
+ static struct kmem_cache *skbuff_head_cache __read_mostly;
+@@ -193,6 +195,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ 	if (!skb)
+ 		goto out;
+ 
++	if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
++		goto nobc;
++
++	/* Get the data buffer. */
+ 	size = SKB_DATA_ALIGN(size);
+ 	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+ 			gfp_mask, node);
+@@ -211,6 +217,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ 	skb->data = data;
+ 	skb_reset_tail_pointer(skb);
+ 	skb->end = skb->tail + size;
++	skb->owner_env = get_exec_env();
+ 	/* make sure we initialize shinfo sequentially */
+ 	shinfo = skb_shinfo(skb);
+ 	atomic_set(&shinfo->dataref, 1);
+@@ -233,6 +240,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ out:
+ 	return skb;
+ nodata:
++	ub_skb_free_bc(skb);
++nobc:
+ 	kmem_cache_free(cache, skb);
+ 	skb = NULL;
+ 	goto out;
+@@ -339,6 +348,7 @@ static void kfree_skbmem(struct sk_buff *skb)
+ 	struct sk_buff *other;
+ 	atomic_t *fclone_ref;
+ 
++	ub_skb_free_bc(skb);
+ 	switch (skb->fclone) {
+ 	case SKB_FCLONE_UNAVAILABLE:
+ 		kmem_cache_free(skbuff_head_cache, skb);
+@@ -372,6 +382,7 @@ static void skb_release_all(struct sk_buff *skb)
+ #ifdef CONFIG_XFRM
+ 	secpath_put(skb->sp);
+ #endif
++	ub_skb_uncharge(skb);
+ 	if (skb->destructor) {
+ 		WARN_ON(in_irq());
+ 		skb->destructor(skb);
+@@ -461,6 +472,11 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+ 	new->tc_verd		= old->tc_verd;
+ #endif
+ #endif
++#ifdef CONFIG_VE
++	new->accounted = old->accounted;
++	new->redirected = old->redirected;
++#endif
++	skb_copy_brmark(new, old);
+ 	skb_copy_secmark(new, old);
+ }
+ 
+@@ -478,6 +494,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+ 	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ 	n->cloned = 1;
+ 	n->nohdr = 0;
++	C(owner_env);
++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
++	C(brmark);
++#endif
+ 	n->destructor = NULL;
+ 	C(iif);
+ 	C(tail);
+@@ -487,6 +507,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+ 	C(truesize);
+ 	atomic_set(&n->users, 1);
+ 
++#ifdef CONFIG_VE
++	C(accounted);
++	C(redirected);
++#endif
++
+ 	atomic_inc(&(skb_shinfo(skb)->dataref));
+ 	skb->cloned = 1;
+ 
+@@ -542,6 +567,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+ 		n->fclone = SKB_FCLONE_UNAVAILABLE;
+ 	}
+ 
++	if (ub_skb_alloc_bc(n, gfp_mask)) {
++		kmem_cache_free(skbuff_head_cache, n);
++		return NULL;
++	}
+ 	return __skb_clone(n, skb);
+ }
+ 
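
In __alloc_skb() the beancounter charge now sits between the head allocation
and the data allocation, so the unwind labels must run in strict reverse
order: a data failure drops the charge (nodata) and falls through to free the
head (nobc). A standalone model of that fall-through unwind, with charge()
and uncharge() standing in for ub_skb_alloc_bc() and ub_skb_free_bc():

    #include <stdlib.h>

    struct skb { void *data; };

    static int  charge(void)   { return 0; }  /* ub_skb_alloc_bc() stand-in */
    static void uncharge(void) { }            /* ub_skb_free_bc() stand-in  */

    static struct skb *alloc_skb_model(size_t size)
    {
            struct skb *skb = malloc(sizeof(*skb));

            if (!skb)
                    goto out;
            if (charge())              /* charged before the data alloc */
                    goto nobc;
            skb->data = malloc(size);
            if (!skb->data)
                    goto nodata;
            return skb;

    nodata:
            uncharge();                /* undo the charge ...           */
    nobc:
            free(skb);                 /* ... then free the head        */
    out:
            return NULL;
    }

    int main(void)
    {
            struct skb *skb = alloc_skb_model(128);

            if (skb) {
                    uncharge();        /* kfree_skbmem() pairs the charge */
                    free(skb->data);
                    free(skb);
            }
            return 0;
    }
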
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 88094cb..bb59e40 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -125,6 +125,9 @@
+ #include <net/xfrm.h>
+ #include <linux/ipsec.h>
+ 
++#include <bc/net.h>
++#include <bc/beancounter.h>
++
+ #include <linux/filter.h>
+ 
+ #ifdef CONFIG_INET
+@@ -250,7 +253,7 @@ static void sock_warn_obsolete_bsdism(const char *name)
+ 	static char warncomm[TASK_COMM_LEN];
+ 	if (strcmp(warncomm, current->comm) && warned < 5) {
+ 		strcpy(warncomm,  current->comm);
+-		printk(KERN_WARNING "process `%s' is using obsolete "
++		ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
+ 		       "%s SO_BSDCOMPAT\n", warncomm, name);
+ 		warned++;
+ 	}
+@@ -283,7 +286,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+ 	if (err)
+ 		goto out;
+ 
+-	if (!sk_rmem_schedule(sk, skb->truesize)) {
++	if (!sk_rmem_schedule(sk, skb)) {
+ 		err = -ENOBUFS;
+ 		goto out;
+ 	}
+@@ -921,6 +924,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
+ 	slab = prot->slab;
+ 
+ 	security_sk_free(sk);
++	ub_sock_uncharge(sk);
+ 	if (slab != NULL)
+ 		kmem_cache_free(slab, sk);
+ 	else
+@@ -949,6 +953,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+ 		 */
+ 		sk->sk_prot = sk->sk_prot_creator = prot;
+ 		sock_lock_init(sk);
++		sk->owner_env = get_exec_env();
+ 		sock_net_set(sk, get_net(net));
+ 	}
+ 
+@@ -1043,14 +1048,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+ 		if (filter != NULL)
+ 			sk_filter_charge(newsk, filter);
+ 
+-		if (unlikely(xfrm_sk_clone_policy(newsk))) {
+-			/* It is still raw copy of parent, so invalidate
+-			 * destructor and make plain sk_free() */
+-			newsk->sk_destruct = NULL;
+-			sk_free(newsk);
+-			newsk = NULL;
+-			goto out;
+-		}
++		if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0)
++			goto out_err;
++
++		if (unlikely(xfrm_sk_clone_policy(newsk)))
++			 goto out_err;
+ 
+ 		newsk->sk_err	   = 0;
+ 		newsk->sk_priority = 0;
+@@ -1074,14 +1076,23 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+ 		if (newsk->sk_prot->sockets_allocated)
+ 			atomic_inc(newsk->sk_prot->sockets_allocated);
+ 	}
+-out:
+ 	return newsk;
++
++out_err:
++	/* It is still a raw copy of the parent, so invalidate
++	 * the destructor and do a plain sk_free() */
++	sock_reset_flag(newsk, SOCK_TIMESTAMP);
++	newsk->sk_destruct = NULL;
++	sk_free(newsk);
++	return NULL;
+ }
+ 
+ EXPORT_SYMBOL_GPL(sk_clone);
+ 
+ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+ {
++	extern int sysctl_tcp_use_sg;
++
+ 	__sk_dst_set(sk, dst);
+ 	sk->sk_route_caps = dst->dev->features;
+ 	if (sk->sk_route_caps & NETIF_F_GSO)
+@@ -1094,6 +1105,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+ 			sk->sk_gso_max_size = dst->dev->gso_max_size;
+ 		}
+ 	}
++	if (!sysctl_tcp_use_sg)
++		sk->sk_route_caps &= ~NETIF_F_SG;
+ }
+ EXPORT_SYMBOL_GPL(sk_setup_caps);
+ 
+@@ -1254,11 +1267,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
+ /*
+  *	Generic send/receive buffer handlers
+  */
+-
+-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
+-					    unsigned long header_len,
+-					    unsigned long data_len,
+-					    int noblock, int *errcode)
++struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size,
++				     unsigned long size2, int noblock,
++				     int *errcode)
+ {
+ 	struct sk_buff *skb;
+ 	gfp_t gfp_mask;
+@@ -1279,46 +1290,35 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
+ 		if (sk->sk_shutdown & SEND_SHUTDOWN)
+ 			goto failure;
+ 
+-		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+-			skb = alloc_skb(header_len, gfp_mask);
+-			if (skb) {
+-				int npages;
+-				int i;
+-
+-				/* No pages, we're done... */
+-				if (!data_len)
+-					break;
+-
+-				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+-				skb->truesize += data_len;
+-				skb_shinfo(skb)->nr_frags = npages;
+-				for (i = 0; i < npages; i++) {
+-					struct page *page;
+-					skb_frag_t *frag;
+-
+-					page = alloc_pages(sk->sk_allocation, 0);
+-					if (!page) {
+-						err = -ENOBUFS;
+-						skb_shinfo(skb)->nr_frags = i;
+-						kfree_skb(skb);
+-						goto failure;
+-					}
+-
+-					frag = &skb_shinfo(skb)->frags[i];
+-					frag->page = page;
+-					frag->page_offset = 0;
+-					frag->size = (data_len >= PAGE_SIZE ?
+-						      PAGE_SIZE :
+-						      data_len);
+-					data_len -= PAGE_SIZE;
+-				}
++		if (ub_sock_getwres_other(sk, skb_charge_size(size))) {
++			if (size2 < size) {
++				size = size2;
++				continue;
++			}
++			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
++			err = -EAGAIN;
++			if (!timeo)
++				goto failure;
++			if (signal_pending(current))
++				goto interrupted;
++			timeo = ub_sock_wait_for_space(sk, timeo,
++					skb_charge_size(size));
++			continue;
++		}
+ 
++		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
++			skb = alloc_skb(size, gfp_mask);
++			if (skb)
+ 				/* Full success... */
+ 				break;
+-			}
++			ub_sock_retwres_other(sk, skb_charge_size(size),
++					SOCK_MIN_UBCSPACE_CH);
+ 			err = -ENOBUFS;
+ 			goto failure;
+ 		}
++		ub_sock_retwres_other(sk,
++				skb_charge_size(size),
++				SOCK_MIN_UBCSPACE_CH);
+ 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ 		err = -EAGAIN;
+@@ -1329,6 +1329,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
+ 		timeo = sock_wait_for_wmem(sk, timeo);
+ 	}
+ 
++	ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF);
+ 	skb_set_owner_w(skb, sk);
+ 	return skb;
+ 
+@@ -1338,11 +1339,12 @@ failure:
+ 	*errcode = err;
+ 	return NULL;
+ }
++EXPORT_SYMBOL(sock_alloc_send_skb2);
+ 
+ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ 				    int noblock, int *errcode)
+ {
+-	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
++	return sock_alloc_send_skb2(sk, size, size, noblock, errcode);
+ }
+ 
+ static void __lock_sock(struct sock *sk)
+@@ -1752,10 +1754,12 @@ void lock_sock_nested(struct sock *sk, int subclass)
+ 		__lock_sock(sk);
+ 	sk->sk_lock.owned = 1;
+ 	spin_unlock(&sk->sk_lock.slock);
++#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
+ 	/*
+ 	 * The sk_lock has mutex_lock() semantics here:
+ 	 */
+ 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
++#endif
+ 	local_bh_enable();
+ }
+ 
+@@ -1763,11 +1767,12 @@ EXPORT_SYMBOL(lock_sock_nested);
+ 
+ void release_sock(struct sock *sk)
+ {
++#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
+ 	/*
+ 	 * The sk_lock has mutex_unlock() semantics:
+ 	 */
+ 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+-
++#endif
+ 	spin_lock_bh(&sk->sk_lock.slock);
+ 	if (sk->sk_backlog.tail)
+ 		__release_sock(sk);
+@@ -2041,7 +2046,7 @@ int proto_register(struct proto *prot, int alloc_slab)
+ 
+ 	if (alloc_slab) {
+ 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
+-					       SLAB_HWCACHE_ALIGN, NULL);
++					       SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
+ 
+ 		if (prot->slab == NULL) {
+ 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
+@@ -2059,7 +2064,7 @@ int proto_register(struct proto *prot, int alloc_slab)
+ 			sprintf(request_sock_slab_name, mask, prot->name);
+ 			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+ 								 prot->rsk_prot->obj_size, 0,
+-								 SLAB_HWCACHE_ALIGN, NULL);
++								 SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
+ 
+ 			if (prot->rsk_prot->slab == NULL) {
+ 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+@@ -2080,7 +2085,7 @@ int proto_register(struct proto *prot, int alloc_slab)
+ 			prot->twsk_prot->twsk_slab =
+ 				kmem_cache_create(timewait_sock_slab_name,
+ 						  prot->twsk_prot->twsk_obj_size,
+-						  0, SLAB_HWCACHE_ALIGN,
++						  0, SLAB_HWCACHE_ALIGN|SLAB_UBC,
+ 						  NULL);
+ 			if (prot->twsk_prot->twsk_slab == NULL)
+ 				goto out_free_timewait_sock_slab_name;
+@@ -2237,10 +2242,26 @@ static const struct file_operations proto_seq_fops = {
+ 	.release	= seq_release,
+ };
+ 
++static int proto_net_init(struct net *net)
++{
++	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
++		return -ENOBUFS;
++	return 0;
++}
++
++static void proto_net_exit(struct net *net)
++{
++	proc_net_remove(net, "protocols");
++}
++
++static struct pernet_operations proto_net_ops = {
++	.init = proto_net_init,
++	.exit = proto_net_exit,
++};
++
+ static int __init proto_init(void)
+ {
+-	/* register /proc/net/protocols */
+-	return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
++	return register_pernet_subsys(&proto_net_ops);
+ }
+ 
+ subsys_initcall(proto_init);
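
The proto_init() rewrite is the stock pernet_operations conversion: rather
than creating /proc/net/protocols once in init_net, an init/exit pair is
registered and the namespace core runs it for every struct net as it appears
and disappears. The contract, modeled standalone (register_pernet_subsys()
is the real registration API; the loops in main() play the namespace core):

    #include <stdio.h>

    struct net { const char *name; };

    struct pernet_operations {
            int  (*init)(struct net *net);
            void (*exit)(struct net *net);
    };

    static int proto_net_init(struct net *net)
    {
            printf("create /proc/net/protocols in %s\n", net->name);
            return 0;
    }

    static void proto_net_exit(struct net *net)
    {
            printf("remove /proc/net/protocols from %s\n", net->name);
    }

    static struct pernet_operations proto_net_ops = {
            .init = proto_net_init,
            .exit = proto_net_exit,
    };

    int main(void)
    {
            /* register_pernet_subsys() runs ->init for each live
             * namespace and ->exit on teardown; two nets modeled. */
            struct net nets[] = { { "init_net" }, { "ve101" } };
            int i;

            for (i = 0; i < 2; i++)
                    proto_net_ops.init(&nets[i]);
            for (i = 1; i >= 0; i--)
                    proto_net_ops.exit(&nets[i]);
            return 0;
    }
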
+diff --git a/net/core/stream.c b/net/core/stream.c
+index 4a0ad15..5c39418 100644
+--- a/net/core/stream.c
++++ b/net/core/stream.c
+@@ -111,8 +111,10 @@ EXPORT_SYMBOL(sk_stream_wait_close);
+  * sk_stream_wait_memory - Wait for more memory for a socket
+  * @sk: socket to wait for memory
+  * @timeo_p: for how long
++ * @amount: amount of memory to wait for (in UB space)
+  */
+-int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
++int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
++		unsigned long amount)
+ {
+ 	int err = 0;
+ 	long vm_wait = 0;
+@@ -134,8 +136,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+ 		if (signal_pending(current))
+ 			goto do_interrupted;
+ 		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+-		if (sk_stream_memory_free(sk) && !vm_wait)
+-			break;
++		if (amount == 0) {
++			if (sk_stream_memory_free(sk) && !vm_wait)
++				break;
++		} else
++			ub_sock_sndqueueadd_tcp(sk, amount);
+ 
+ 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ 		sk->sk_write_pending++;
+@@ -144,6 +149,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+ 						  sk_stream_memory_free(sk) &&
+ 						  vm_wait);
+ 		sk->sk_write_pending--;
++		if (amount > 0)
++			ub_sock_sndqueuedel(sk);
+ 
+ 		if (vm_wait) {
+ 			vm_wait -= current_timeo;
+@@ -170,6 +177,10 @@ do_interrupted:
+ 	goto out;
+ }
+ 
++int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
++{
++	return __sk_stream_wait_memory(sk, timeo_p, 0);
++}
+ EXPORT_SYMBOL(sk_stream_wait_memory);
+ 
+ int sk_stream_error(struct sock *sk, int flags, int err)
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index f7fe2a5..024413f 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -567,6 +567,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+ 	__ip6_dst_store(newsk, dst, NULL, NULL);
+ 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
+ 						      NETIF_F_TSO);
++	if (!sysctl_tcp_use_sg)
++		newsk->sk_route_caps &= ~NETIF_F_SG;
+ 	newdp6 = (struct dccp6_sock *)newsk;
+ 	newinet = inet_sk(newsk);
+ 	newinet->pinet6 = &newdp6->inet6;
+diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
+index 66dca5b..e7802a4 100644
+--- a/net/dccp/minisocks.c
++++ b/net/dccp/minisocks.c
+@@ -19,6 +19,8 @@
+ #include <net/xfrm.h>
+ #include <net/inet_timewait_sock.h>
+ 
++#include <bc/sock_orphan.h>
++
+ #include "ackvec.h"
+ #include "ccid.h"
+ #include "dccp.h"
+@@ -56,7 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
+ {
+ 	struct inet_timewait_sock *tw = NULL;
+ 
+-	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
++	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets &&
++			ub_timewait_check(sk, &dccp_death_row))
+ 		tw = inet_twsk_alloc(sk, state);
+ 
+ 	if (tw != NULL) {
+diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
+index 5b7539b..14fbca5 100644
+--- a/net/decnet/dn_rules.c
++++ b/net/decnet/dn_rules.c
+@@ -229,7 +229,7 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
+ 	return 0;
+ }
+ 
+-static void dn_fib_rule_flush_cache(void)
++static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
+ {
+ 	dn_rt_cache_flush(-1);
+ }
+diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
+index 6d2bd32..45567e3 100644
+--- a/net/decnet/netfilter/dn_rtmsg.c
++++ b/net/decnet/netfilter/dn_rtmsg.c
+@@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
+ 	if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ 		return;
+ 
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		RCV_SKB_FAIL(-EPERM);
+ 
+ 	/* Eventually we might send routing messages too */
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index 24eca23..dcaebf0 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -115,6 +115,7 @@
+ #ifdef CONFIG_IP_MROUTE
+ #include <linux/mroute.h>
+ #endif
++#include <bc/net.h>
+ 
+ DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
+ 
+@@ -330,6 +331,10 @@ lookup_protocol:
+ 			goto out_rcu_unlock;
+ 	}
+ 
++	err = vz_security_protocol_check(answer->protocol);
++	if (err < 0)
++		goto out_rcu_unlock;
++
+ 	err = -EPERM;
+ 	if (answer->capability > 0 && !capable(answer->capability))
+ 		goto out_rcu_unlock;
+@@ -351,6 +356,13 @@ lookup_protocol:
+ 	if (sk == NULL)
+ 		goto out;
+ 
++	err = -ENOBUFS;
++	if (ub_sock_charge(sk, PF_INET, sock->type))
++		goto out_sk_free;
++	/* If the charge was successful, sock_init_data() MUST be called to
++	 * set sk->sk_type; otherwise sk is uncharged against the wrong resource.
++	 */
++
+ 	err = 0;
+ 	sk->sk_no_check = answer_no_check;
+ 	if (INET_PROTOSW_REUSE & answer_flags)
+@@ -408,6 +420,9 @@ out:
+ out_rcu_unlock:
+ 	rcu_read_unlock();
+ 	goto out;
++out_sk_free:
++	sk_free(sk);
++	return err;
+ }
+ 
+ 
+@@ -422,6 +437,9 @@ int inet_release(struct socket *sock)
+ 
+ 	if (sk) {
+ 		long timeout;
++		struct ve_struct *saved_env;
++
++		saved_env = set_exec_env(sk->owner_env);
+ 
+ 		/* Applications forget to leave groups before exiting */
+ 		ip_mc_drop_socket(sk);
+@@ -439,6 +457,8 @@ int inet_release(struct socket *sock)
+ 			timeout = sk->sk_lingertime;
+ 		sock->sk = NULL;
+ 		sk->sk_prot->close(sk, timeout);
++
++		(void)set_exec_env(saved_env);
+ 	}
+ 	return 0;
+ }
+@@ -1341,27 +1361,27 @@ static struct net_protocol icmp_protocol = {
+ 	.netns_ok =	1,
+ };
+ 
+-static int __init init_ipv4_mibs(void)
++int init_ipv4_mibs(void)
+ {
+-	if (snmp_mib_init((void **)net_statistics,
++	if (snmp_mib_init((void **)ve_net_statistics,
+ 			  sizeof(struct linux_mib)) < 0)
+ 		goto err_net_mib;
+-	if (snmp_mib_init((void **)ip_statistics,
++	if (snmp_mib_init((void **)ve_ip_statistics,
+ 			  sizeof(struct ipstats_mib)) < 0)
+ 		goto err_ip_mib;
+-	if (snmp_mib_init((void **)icmp_statistics,
++	if (snmp_mib_init((void **)ve_icmp_statistics,
+ 			  sizeof(struct icmp_mib)) < 0)
+ 		goto err_icmp_mib;
+-	if (snmp_mib_init((void **)icmpmsg_statistics,
++	if (snmp_mib_init((void **)ve_icmpmsg_statistics,
+ 			  sizeof(struct icmpmsg_mib)) < 0)
+ 		goto err_icmpmsg_mib;
+-	if (snmp_mib_init((void **)tcp_statistics,
++	if (snmp_mib_init((void **)ve_tcp_statistics,
+ 			  sizeof(struct tcp_mib)) < 0)
+ 		goto err_tcp_mib;
+-	if (snmp_mib_init((void **)udp_statistics,
++	if (snmp_mib_init((void **)ve_udp_statistics,
+ 			  sizeof(struct udp_mib)) < 0)
+ 		goto err_udp_mib;
+-	if (snmp_mib_init((void **)udplite_statistics,
++	if (snmp_mib_init((void **)ve_udplite_statistics,
+ 			  sizeof(struct udp_mib)) < 0)
+ 		goto err_udplite_mib;
+ 
+@@ -1370,20 +1390,33 @@ static int __init init_ipv4_mibs(void)
+ 	return 0;
+ 
+ err_udplite_mib:
+-	snmp_mib_free((void **)udp_statistics);
++	snmp_mib_free((void **)ve_udp_statistics);
+ err_udp_mib:
+-	snmp_mib_free((void **)tcp_statistics);
++	snmp_mib_free((void **)ve_tcp_statistics);
+ err_tcp_mib:
+-	snmp_mib_free((void **)icmpmsg_statistics);
++	snmp_mib_free((void **)ve_icmpmsg_statistics);
+ err_icmpmsg_mib:
+-	snmp_mib_free((void **)icmp_statistics);
++	snmp_mib_free((void **)ve_icmp_statistics);
+ err_icmp_mib:
+-	snmp_mib_free((void **)ip_statistics);
++	snmp_mib_free((void **)ve_ip_statistics);
+ err_ip_mib:
+-	snmp_mib_free((void **)net_statistics);
++	snmp_mib_free((void **)ve_net_statistics);
+ err_net_mib:
+ 	return -ENOMEM;
+ }
++EXPORT_SYMBOL(init_ipv4_mibs);
++
++void cleanup_ipv4_mibs(void)
++{
++	snmp_mib_free((void **)ve_udplite_statistics);
++	snmp_mib_free((void **)ve_udp_statistics);
++	snmp_mib_free((void **)ve_tcp_statistics);
++	snmp_mib_free((void **)ve_icmpmsg_statistics);
++	snmp_mib_free((void **)ve_icmp_statistics);
++	snmp_mib_free((void **)ve_ip_statistics);
++	snmp_mib_free((void **)ve_net_statistics);
++}
++EXPORT_SYMBOL(cleanup_ipv4_mibs);
+ 
+ static int ipv4_proc_init(void);
+ 
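
init_ipv4_mibs() and cleanup_ipv4_mibs() above form the classic symmetric
pair: the error ladder frees in reverse allocation order, and the cleanup
function is just the longest error path with every stage included, which is
why keeping the two adjacent keeps them in sync. A compressed model with
three stages standing in for the seven MIB tables:

    #include <stdio.h>

    static int  alloc_stage(const char *s) { printf("alloc %s\n", s); return 0; }
    static void free_stage(const char *s)  { printf("free %s\n", s); }

    static int init_mibs(void)
    {
            if (alloc_stage("net"))
                    goto err_net;
            if (alloc_stage("ip"))
                    goto err_ip;
            if (alloc_stage("tcp"))
                    goto err_tcp;
            return 0;

    err_tcp:
            free_stage("ip");     /* unwind in reverse allocation order */
    err_ip:
            free_stage("net");
    err_net:
            return -1;
    }

    static void cleanup_mibs(void)
    {
            /* The full cleanup is the longest error path, every stage. */
            free_stage("tcp");
            free_stage("ip");
            free_stage("net");
    }

    int main(void)
    {
            if (!init_mibs())
                    cleanup_mibs();
            return 0;
    }
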
+diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
+index 9b539fa..afd5dfb 100644
+--- a/net/ipv4/arp.c
++++ b/net/ipv4/arp.c
+@@ -1137,7 +1137,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+ 	switch (cmd) {
+ 		case SIOCDARP:
+ 		case SIOCSARP:
+-			if (!capable(CAP_NET_ADMIN))
++			if (!capable(CAP_NET_ADMIN) &&
++					!capable(CAP_VE_NET_ADMIN))
+ 				return -EPERM;
+ 		case SIOCGARP:
+ 			err = copy_from_user(&r, arg, sizeof(struct arpreq));
+@@ -1199,7 +1200,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo
+ 	switch (event) {
+ 	case NETDEV_CHANGEADDR:
+ 		neigh_changeaddr(&arp_tbl, dev);
+-		rt_cache_flush(0);
++		rt_cache_flush(dev_net(dev), 0);
+ 		break;
+ 	default:
+ 		break;
+diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
+index 79a7ef6..46a3f5c 100644
+--- a/net/ipv4/devinet.c
++++ b/net/ipv4/devinet.c
+@@ -112,9 +112,9 @@ static inline void devinet_sysctl_unregister(struct in_device *idev)
+ 
+ /* Locks all the inet devices. */
+ 
+-static struct in_ifaddr *inet_alloc_ifa(void)
++struct in_ifaddr *inet_alloc_ifa(void)
+ {
+-	struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
++	struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_UBC);
+ 
+ 	if (ifa) {
+ 		INIT_RCU_HEAD(&ifa->rcu_head);
+@@ -122,6 +122,7 @@ static struct in_ifaddr *inet_alloc_ifa(void)
+ 
+ 	return ifa;
+ }
++EXPORT_SYMBOL_GPL(inet_alloc_ifa);
+ 
+ static void inet_rcu_free_ifa(struct rcu_head *head)
+ {
+@@ -154,7 +155,7 @@ void in_dev_finish_destroy(struct in_device *idev)
+ 	}
+ }
+ 
+-static struct in_device *inetdev_init(struct net_device *dev)
++struct in_device *inetdev_init(struct net_device *dev)
+ {
+ 	struct in_device *in_dev;
+ 
+@@ -189,6 +190,7 @@ out_kfree:
+ 	in_dev = NULL;
+ 	goto out;
+ }
++EXPORT_SYMBOL_GPL(inetdev_init);
+ 
+ static void in_dev_rcu_put(struct rcu_head *head)
+ {
+@@ -382,7 +384,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ 	return 0;
+ }
+ 
+-static int inet_insert_ifa(struct in_ifaddr *ifa)
++int inet_insert_ifa(struct in_ifaddr *ifa)
+ {
+ 	return __inet_insert_ifa(ifa, NULL, 0);
+ }
+@@ -433,6 +435,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+ 	} endfor_ifa(in_dev);
+ 	return NULL;
+ }
++EXPORT_SYMBOL_GPL(inet_insert_ifa);
+ 
+ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+ {
+@@ -633,7 +636,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+ 
+ 	case SIOCSIFFLAGS:
+ 		ret = -EACCES;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!capable(CAP_VE_NET_ADMIN))
+ 			goto out;
+ 		break;
+ 	case SIOCSIFADDR:	/* Set interface address (and family) */
+@@ -641,7 +644,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+ 	case SIOCSIFDSTADDR:	/* Set the destination address */
+ 	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+ 		ret = -EACCES;
+-		if (!capable(CAP_NET_ADMIN))
++		if (!capable(CAP_VE_NET_ADMIN))
+ 			goto out;
+ 		ret = -EINVAL;
+ 		if (sin->sin_family != AF_INET)
+@@ -1249,7 +1252,7 @@ static void inet_forward_change(struct net *net)
+ 	}
+ 	read_unlock(&dev_base_lock);
+ 
+-	rt_cache_flush(0);
++	rt_cache_flush(net, 0);
+ }
+ 
+ static int devinet_conf_proc(ctl_table *ctl, int write,
+@@ -1338,7 +1341,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
+ 		if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING))
+ 			inet_forward_change(net);
+ 		else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING))
+-			rt_cache_flush(0);
++			rt_cache_flush(net, 0);
+ 	}
+ 
+ 	return ret;
+@@ -1351,9 +1354,10 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
+ 	int *valp = ctl->data;
+ 	int val = *valp;
+ 	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
++	struct net *net = ctl->extra2;
+ 
+ 	if (write && *valp != val)
+-		rt_cache_flush(0);
++		rt_cache_flush(net, 0);
+ 
+ 	return ret;
+ }
+@@ -1364,9 +1368,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
+ {
+ 	int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp,
+ 				      newval, newlen);
++	struct net *net = table->extra2;
+ 
+ 	if (ret == 1)
+-		rt_cache_flush(0);
++		rt_cache_flush(net, 0);
+ 
+ 	return ret;
+ }
+diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
+index 0b2ac6a..2dcec65 100644
+--- a/net/ipv4/fib_frontend.c
++++ b/net/ipv4/fib_frontend.c
+@@ -146,7 +146,7 @@ static void fib_flush(struct net *net)
+ 	}
+ 
+ 	if (flushed)
+-		rt_cache_flush(-1);
++		rt_cache_flush(net, -1);
+ }
+ 
+ /*
+@@ -260,7 +260,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
+ 	net = dev_net(dev);
+ 	if (fib_lookup(net, &fl, &res))
+ 		goto last_resort;
+-	if (res.type != RTN_UNICAST)
++	if (res.type != RTN_UNICAST &&
++		(!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL))
+ 		goto e_inval_res;
+ 	*spec_dst = FIB_RES_PREFSRC(res);
+ 	fib_combine_itag(itag, &res);
+@@ -462,7 +463,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+ 	switch (cmd) {
+ 	case SIOCADDRT:		/* Add a route */
+ 	case SIOCDELRT:		/* Delete a route */
+-		if (!capable(CAP_NET_ADMIN))
++		if (!capable(CAP_VE_NET_ADMIN))
+ 			return -EPERM;
+ 
+ 		if (copy_from_user(&rt, arg, sizeof(rt)))
+@@ -899,21 +900,22 @@ static void fib_disable_ip(struct net_device *dev, int force)
+ {
+ 	if (fib_sync_down_dev(dev, force))
+ 		fib_flush(dev_net(dev));
+-	rt_cache_flush(0);
++	rt_cache_flush(dev_net(dev), 0);
+ 	arp_ifdown(dev);
+ }
+ 
+ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+ {
+ 	struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
++	struct net_device *dev = ifa->ifa_dev->dev;
+ 
+ 	switch (event) {
+ 	case NETDEV_UP:
+ 		fib_add_ifaddr(ifa);
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH
+-		fib_sync_up(ifa->ifa_dev->dev);
++		fib_sync_up(dev);
+ #endif
+-		rt_cache_flush(-1);
++		rt_cache_flush(dev_net(dev), -1);
+ 		break;
+ 	case NETDEV_DOWN:
+ 		fib_del_ifaddr(ifa);
+@@ -921,9 +923,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
+ 			/* Last address was deleted from this interface.
+ 			   Disable IP.
+ 			 */
+-			fib_disable_ip(ifa->ifa_dev->dev, 1);
++			fib_disable_ip(dev, 1);
+ 		} else {
+-			rt_cache_flush(-1);
++			rt_cache_flush(dev_net(dev), -1);
+ 		}
+ 		break;
+ 	}
+@@ -951,14 +953,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH
+ 		fib_sync_up(dev);
+ #endif
+-		rt_cache_flush(-1);
++		rt_cache_flush(dev_net(dev), -1);
+ 		break;
+ 	case NETDEV_DOWN:
+ 		fib_disable_ip(dev, 0);
+ 		break;
+ 	case NETDEV_CHANGEMTU:
+ 	case NETDEV_CHANGE:
+-		rt_cache_flush(0);
++		rt_cache_flush(dev_net(dev), 0);
+ 		break;
+ 	}
+ 	return NOTIFY_DONE;
+diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
+index 2e2fc33..ccb3830 100644
+--- a/net/ipv4/fib_hash.c
++++ b/net/ipv4/fib_hash.c
+@@ -474,7 +474,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
+ 
+ 			fib_release_info(fi_drop);
+ 			if (state & FA_S_ACCESSED)
+-				rt_cache_flush(-1);
++				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 			rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
+ 				  &cfg->fc_nlinfo, NLM_F_REPLACE);
+ 			return 0;
+@@ -534,7 +534,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
+ 
+ 	if (new_f)
+ 		fz->fz_nent++;
+-	rt_cache_flush(-1);
++	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 
+ 	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
+ 		  &cfg->fc_nlinfo, 0);
+@@ -616,7 +616,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
+ 		write_unlock_bh(&fib_hash_lock);
+ 
+ 		if (fa->fa_state & FA_S_ACCESSED)
+-			rt_cache_flush(-1);
++			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 		fn_free_alias(fa, f);
+ 		if (kill_fn) {
+ 			fn_free_node(f);
+@@ -772,10 +772,10 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
+ void __init fib_hash_init(void)
+ {
+ 	fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
+-					 0, SLAB_PANIC, NULL);
++					 0, SLAB_PANIC | SLAB_UBC, NULL);
+ 
+ 	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
+-					  0, SLAB_PANIC, NULL);
++					  0, SLAB_PANIC | SLAB_UBC, NULL);
+ 
+ }
+ 
+diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
+index 1fb5687..6080d71 100644
+--- a/net/ipv4/fib_rules.c
++++ b/net/ipv4/fib_rules.c
+@@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+ 	       + nla_total_size(4); /* flow */
+ }
+ 
+-static void fib4_rule_flush_cache(void)
++static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+ {
+-	rt_cache_flush(-1);
++	rt_cache_flush(ops->fro_net, -1);
+ }
+ 
+ static struct fib_rules_ops fib4_rules_ops_template = {
+diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
+index e1600ad..be16529 100644
+--- a/net/ipv4/fib_trie.c
++++ b/net/ipv4/fib_trie.c
+@@ -1273,7 +1273,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+ 
+ 			fib_release_info(fi_drop);
+ 			if (state & FA_S_ACCESSED)
+-				rt_cache_flush(-1);
++				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ 
+@@ -1318,7 +1318,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+ 	list_add_tail_rcu(&new_fa->fa_list,
+ 			  (fa ? &fa->fa_list : fa_head));
+ 
+-	rt_cache_flush(-1);
++	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+ 		  &cfg->fc_nlinfo, 0);
+ succeeded:
+@@ -1661,7 +1661,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
+ 		trie_leaf_remove(t, l);
+ 
+ 	if (fa->fa_state & FA_S_ACCESSED)
+-		rt_cache_flush(-1);
++		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+ 
+ 	fib_release_info(fa->fa_info);
+ 	alias_free_mem_rcu(fa);
+diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
+index 2769dc4..03a7004 100644
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -83,6 +83,7 @@
+ #include <linux/sockios.h>
+ #include <linux/in.h>
+ #include <linux/inet.h>
++#include <linux/nsproxy.h>
+ #include <linux/netdevice.h>
+ #include <linux/skbuff.h>
+ #include <linux/inetdevice.h>
+@@ -2317,7 +2318,7 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+ 	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ 
+ 	state->in_dev = NULL;
+-	for_each_netdev(&init_net, state->dev) {
++	for_each_netdev(get_exec_env()->ve_netns, state->dev) {
+ 		struct in_device *in_dev;
+ 		in_dev = in_dev_get(state->dev);
+ 		if (!in_dev)
+@@ -2466,7 +2467,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+ 
+ 	state->idev = NULL;
+ 	state->im = NULL;
+-	for_each_netdev(&init_net, state->dev) {
++	for_each_netdev(get_exec_env()->ve_netns, state->dev) {
+ 		struct in_device *idev;
+ 		idev = in_dev_get(state->dev);
+ 		if (unlikely(idev == NULL))
+@@ -2609,11 +2610,34 @@ static const struct file_operations igmp_mcf_seq_fops = {
+ 	.release	=	seq_release_private,
+ };
+ 
+-int __init igmp_mc_proc_init(void)
++static int igmp_net_init(struct net *net)
+ {
+-	proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
+-	proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
++	if (!proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops))
++		goto out_igmp;
++	if (!proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops))
++		goto out_mcfilter;
+ 	return 0;
++
++out_mcfilter:
++	proc_net_remove(net, "igmp");
++out_igmp:
++	return -ENOMEM;
++}
++
++static void igmp_net_exit(struct net *net)
++{
++	proc_net_remove(net, "igmp");
++	proc_net_remove(net, "mcfilter");
++}
++
++static struct pernet_operations igmp_net_ops = {
++	.init = igmp_net_init,
++	.exit = igmp_net_exit,
++};
++
++int __init igmp_mc_proc_init(void)
++{
++	return register_pernet_subsys(&igmp_net_ops);
+ }
+ #endif
+ 
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index ec83448..5bfa408 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -24,6 +24,9 @@
+ #include <net/tcp_states.h>
+ #include <net/xfrm.h>
+ 
++#include <bc/net.h>
++#include <bc/sock_orphan.h>
++
+ #ifdef INET_CSK_DEBUG
+ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+@@ -93,6 +96,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
+ 	struct inet_bind_bucket *tb;
+ 	int ret;
+ 	struct net *net = sock_net(sk);
++	struct ve_struct *env = sk->owner_env;
+ 
+ 	local_bh_disable();
+ 	if (!snum) {
+@@ -103,7 +107,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
+ 		rover = net_random() % remaining + low;
+ 
+ 		do {
+-			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
++			head = &hashinfo->bhash[inet_bhashfn(rover,
++					hashinfo->bhash_size, VEID(env))];
+ 			spin_lock(&head->lock);
+ 			inet_bind_bucket_for_each(tb, node, &head->chain)
+ 				if (tb->ib_net == net && tb->port == rover)
+@@ -130,7 +135,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
+ 		 */
+ 		snum = rover;
+ 	} else {
+-		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
++		head = &hashinfo->bhash[inet_bhashfn(snum,
++				hashinfo->bhash_size, VEID(env))];
+ 		spin_lock(&head->lock);
+ 		inet_bind_bucket_for_each(tb, node, &head->chain)
+ 			if (tb->ib_net == net && tb->port == snum)
+@@ -152,7 +158,7 @@ tb_found:
+ tb_not_found:
+ 	ret = 1;
+ 	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+-					net, head, snum)) == NULL)
++					net, head, snum, env)) == NULL)
+ 		goto fail_unlock;
+ 	if (hlist_empty(&tb->owners)) {
+ 		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+@@ -553,7 +559,7 @@ void inet_csk_destroy_sock(struct sock *sk)
+ 
+ 	sk_refcnt_debug_release(sk);
+ 
+-	atomic_dec(sk->sk_prot->orphan_count);
++	ub_dec_orphan_count(sk);
+ 	sock_put(sk);
+ }
+ 
+@@ -633,7 +639,7 @@ void inet_csk_listen_stop(struct sock *sk)
+ 
+ 		sock_orphan(child);
+ 
+-		atomic_inc(sk->sk_prot->orphan_count);
++		ub_inc_orphan_count(sk);
+ 
+ 		inet_csk_destroy_sock(child);
+ 
+diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
+index da97695..58408eb 100644
+--- a/net/ipv4/inet_diag.c
++++ b/net/ipv4/inet_diag.c
+@@ -708,6 +708,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+ 	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ 	const struct inet_diag_handler *handler;
+ 	struct inet_hashinfo *hashinfo;
++	struct ve_struct *ve = get_exec_env();
+ 
+ 	handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
+ 	if (IS_ERR(handler))
+@@ -731,6 +732,8 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+ 			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ 				struct inet_sock *inet = inet_sk(sk);
+ 
++				if (!ve_accessible(sk->owner_env, ve))
++					continue;
+ 				if (num < s_num) {
+ 					num++;
+ 					continue;
+@@ -792,6 +795,8 @@ skip_listen_ht:
+ 		sk_for_each(sk, node, &head->chain) {
+ 			struct inet_sock *inet = inet_sk(sk);
+ 
++			if (!ve_accessible(sk->owner_env, ve))
++				continue;
+ 			if (num < s_num)
+ 				goto next_normal;
+ 			if (!(r->idiag_states & (1 << sk->sk_state)))
+@@ -816,6 +821,8 @@ next_normal:
+ 			inet_twsk_for_each(tw, node,
+ 				    &head->twchain) {
+ 
++				if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve)))
++					continue;
+ 				if (num < s_num)
+ 					goto next_dying;
+ 				if (r->id.idiag_sport != tw->tw_sport &&
+diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
+index 0546a0b..a967588 100644
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -249,6 +249,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+ 	spin_lock_init(&q->lock);
+ 	atomic_set(&q->refcnt, 1);
+ 	q->net = nf;
++#ifdef CONFIG_VE
++	q->owner_ve = get_exec_env();
++#endif
+ 
+ 	return q;
+ }
+diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
+index 2023d37..af2a58b 100644
+--- a/net/ipv4/inet_hashtables.c
++++ b/net/ipv4/inet_hashtables.c
+@@ -30,7 +30,8 @@
+ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ 						 struct net *net,
+ 						 struct inet_bind_hashbucket *head,
+-						 const unsigned short snum)
++						 const unsigned short snum,
++						 struct ve_struct *ve)
+ {
+ 	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+ 
+@@ -39,6 +40,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ 		tb->port      = snum;
+ 		tb->fastreuse = 0;
+ 		INIT_HLIST_HEAD(&tb->owners);
++		tb->owner_env = ve;
+ 		hlist_add_head(&tb->node, &head->chain);
+ 	}
+ 	return tb;
+@@ -70,10 +72,13 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ static void __inet_put_port(struct sock *sk)
+ {
+ 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+-	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+-	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
++	int bhash;
++	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+ 
++	bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size,
++			VEID(sk->owner_env));
++	head = &hashinfo->bhash[bhash];
+ 	spin_lock(&head->lock);
+ 	tb = inet_csk(sk)->icsk_bind_hash;
+ 	__sk_del_bind_node(sk);
+@@ -95,7 +100,8 @@ EXPORT_SYMBOL(inet_put_port);
+ void __inet_inherit_port(struct sock *sk, struct sock *child)
+ {
+ 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+-	const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
++	const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size,
++					VEID(child->owner_env));
+ 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
+ 	struct inet_bind_bucket *tb;
+ 
+@@ -190,9 +196,11 @@ struct sock *__inet_lookup_listener(struct net *net,
+ {
+ 	struct sock *sk = NULL;
+ 	const struct hlist_head *head;
++	struct ve_struct *env;
+ 
++	env = get_exec_env();
+ 	read_lock(&hashinfo->lhash_lock);
+-	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
++	head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))];
+ 	if (!hlist_empty(head)) {
+ 		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+ 
+@@ -225,7 +233,8 @@ struct sock * __inet_lookup_established(struct net *net,
+ 	/* Optimize here for direct hit, only listening connections can
+ 	 * have wildcards anyways.
+ 	 */
+-	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
++	struct ve_struct *ve = get_exec_env();
++	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(ve));
+ 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ 	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+ 
+@@ -256,7 +265,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
+ /* called with local bh disabled */
+ static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ 				    struct sock *sk, __u16 lport,
+-				    struct inet_timewait_sock **twp)
++				    struct inet_timewait_sock **twp,
++				    struct ve_struct *ve)
+ {
+ 	struct inet_hashinfo *hinfo = death_row->hashinfo;
+ 	struct inet_sock *inet = inet_sk(sk);
+@@ -265,7 +275,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ 	int dif = sk->sk_bound_dev_if;
+ 	INET_ADDR_COOKIE(acookie, saddr, daddr)
+ 	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
++	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve));
+ 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ 	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
+ 	struct sock *sk2;
+@@ -415,7 +425,8 @@ EXPORT_SYMBOL_GPL(inet_unhash);
+ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 		struct sock *sk, u32 port_offset,
+ 		int (*check_established)(struct inet_timewait_death_row *,
+-			struct sock *, __u16, struct inet_timewait_sock **),
++			struct sock *, __u16, struct inet_timewait_sock **,
++			struct ve_struct *),
+ 		void (*hash)(struct sock *sk))
+ {
+ 	struct inet_hashinfo *hinfo = death_row->hashinfo;
+@@ -424,6 +435,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 	struct inet_bind_bucket *tb;
+ 	int ret;
+ 	struct net *net = sock_net(sk);
++	struct ve_struct *ve = sk->owner_env;
+ 
+ 	if (!snum) {
+ 		int i, remaining, low, high, port;
+@@ -438,7 +450,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 		local_bh_disable();
+ 		for (i = 1; i <= remaining; i++) {
+ 			port = low + (i + offset) % remaining;
+-			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
++			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size, VEID(ve))];
+ 			spin_lock(&head->lock);
+ 
+ 			/* Does not bother with rcv_saddr checks,
+@@ -451,14 +463,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!check_established(death_row, sk,
+-								port, &tw))
++								port, &tw, ve))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+ 
+ 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+-					net, head, port);
++					net, head, port, ve);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+@@ -493,7 +505,7 @@ ok:
+ 		goto out;
+ 	}
+ 
+-	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
++	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))];
+ 	tb  = inet_csk(sk)->icsk_bind_hash;
+ 	spin_lock_bh(&head->lock);
+ 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+@@ -503,7 +515,7 @@ ok:
+ 	} else {
+ 		spin_unlock(&head->lock);
+ 		/* No definite answer... Walk to established hash table */
+-		ret = check_established(death_row, sk, snum, NULL);
++		ret = check_established(death_row, sk, snum, NULL, ve);
+ out:
+ 		local_bh_enable();
+ 		return ret;
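+
+[Every hash computation above gains a VE identifier, so sockets from
+different containers land in different buckets even for identical ports
+and addresses. The three-argument inet_bhashfn() itself is defined in a
+header outside this excerpt; a minimal sketch consistent with the call
+sites follows — the signature is taken from the callers, the mixing of
+the VEID into the index is an assumption:
+
+	/* Sketch only: body assumed, not the patch's real code.  The
+	 * point is merely that the VEID perturbs the bucket index. */
+	static inline int inet_bhashfn(const __u16 lport,
+				       const int bhash_size,
+				       unsigned int veid)
+	{
+		return (lport + (veid << 8)) & (bhash_size - 1);
+	}
+]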
+diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
+index ce16e9a..bd67750 100644
+--- a/net/ipv4/inet_timewait_sock.c
++++ b/net/ipv4/inet_timewait_sock.c
+@@ -13,6 +13,8 @@
+ #include <net/inet_timewait_sock.h>
+ #include <net/ip.h>
+ 
++#include <bc/sock_orphan.h>
++
+ /* Must be called with locally disabled BHs. */
+ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
+ 			     struct inet_hashinfo *hashinfo)
+@@ -32,7 +34,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
+ 	write_unlock(lock);
+ 
+ 	/* Disassociate with bind bucket. */
+-	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
++	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num,
++			hashinfo->bhash_size, tw->tw_owner_env)];
+ 	spin_lock(&bhead->lock);
+ 	tb = tw->tw_tb;
+ 	__hlist_del(&tw->tw_bind_node);
+@@ -81,7 +84,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ 	   Note, that any socket with inet->num != 0 MUST be bound in
+ 	   binding cache, even if it is closed.
+ 	 */
+-	bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
++	bhead = &hashinfo->bhash[inet_bhashfn(inet->num,
++			hashinfo->bhash_size, tw->tw_owner_env)];
+ 	spin_lock(&bhead->lock);
+ 	tw->tw_tb = icsk->icsk_bind_hash;
+ 	BUG_TRAP(icsk->icsk_bind_hash);
+@@ -105,9 +109,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+ 
+ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+ {
+-	struct inet_timewait_sock *tw =
+-		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+-				 GFP_ATOMIC);
++	struct user_beancounter *ub;
++	struct inet_timewait_sock *tw;
++
++	ub = set_exec_ub(sock_bc(sk)->ub);
++	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
++			GFP_ATOMIC);
++	(void)set_exec_ub(ub);
++
+ 	if (tw != NULL) {
+ 		const struct inet_sock *inet = inet_sk(sk);
+ 
+@@ -156,6 +165,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+ rescan:
+ 	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ 		__inet_twsk_del_dead_node(tw);
++		ub_timewait_dec(tw, twdr);
+ 		spin_unlock(&twdr->death_lock);
+ 		__inet_twsk_kill(tw, twdr->hashinfo);
+ 		inet_twsk_put(tw);
+@@ -252,6 +262,7 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+ {
+ 	spin_lock(&twdr->death_lock);
+ 	if (inet_twsk_del_dead_node(tw)) {
++		ub_timewait_dec(tw, twdr);
+ 		inet_twsk_put(tw);
+ 		if (--twdr->tw_count == 0)
+ 			del_timer(&twdr->tw_timer);
+@@ -298,9 +309,10 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
+ 	spin_lock(&twdr->death_lock);
+ 
+ 	/* Unlink it, if it was scheduled */
+-	if (inet_twsk_del_dead_node(tw))
++	if (inet_twsk_del_dead_node(tw)) {
++		ub_timewait_dec(tw, twdr);
+ 		twdr->tw_count--;
+-	else
++	} else
+ 		atomic_inc(&tw->tw_refcnt);
+ 
+ 	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+@@ -336,6 +348,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
+ 
+ 	hlist_add_head(&tw->tw_death_node, list);
+ 
++	ub_timewait_inc(tw, twdr);
+ 	if (twdr->tw_count++ == 0)
+ 		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+ 	spin_unlock(&twdr->death_lock);
+@@ -370,6 +383,7 @@ void inet_twdr_twcal_tick(unsigned long data)
+ 						       &twdr->twcal_row[slot]) {
+ 				__inet_twsk_del_dead_node(tw);
+ 				__inet_twsk_kill(tw, twdr->hashinfo);
++				ub_timewait_dec(tw, twdr);
+ 				inet_twsk_put(tw);
+ 				killed++;
+ 			}
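+
+[Two beancounter idioms run through this file: inet_twsk_alloc()
+switches the charging context with set_exec_ub() around the slab
+allocation, so the timewait socket is billed to the owning socket's
+beancounter; and every insertion into or removal from a death-row list
+is paired with ub_timewait_inc()/ub_timewait_dec(). Those helpers come
+from <bc/sock_orphan.h> and their bodies are not shown here; a sketch
+of the counting half, with assumed accessor and field names:
+
+	/* Assumed shape of the <bc/sock_orphan.h> helpers.  The
+	 * invariant they maintain: timewait sockets on death-row
+	 * cells are counted per beancounter, so one VE cannot pin
+	 * unbounded TIME-WAIT state. */
+	static inline void ub_timewait_inc(struct inet_timewait_sock *tw,
+					   struct inet_timewait_death_row *twdr)
+	{
+		struct user_beancounter *ub = slab_ub(tw); /* assumed accessor */
+
+		if (ub != NULL)
+			ub->ub_tw_count++;		   /* assumed field */
+	}
+
+	static inline void ub_timewait_dec(struct inet_timewait_sock *tw,
+					   struct inet_timewait_death_row *twdr)
+	{
+		struct user_beancounter *ub = slab_ub(tw); /* assumed accessor */
+
+		if (ub != NULL)
+			ub->ub_tw_count--;		   /* assumed field */
+	}
+]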
+diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
+index 4813c39..d9af146 100644
+--- a/net/ipv4/ip_forward.c
++++ b/net/ipv4/ip_forward.c
+@@ -93,6 +93,24 @@ int ip_forward(struct sk_buff *skb)
+ 		goto drop;
+ 	}
+ 
++	/*
++	 * We try to optimize forwarding of VE packets:
++	 * do not decrement the TTL (and so avoid the skb_cow)
++	 * when forwarding outgoing packets from a VE.
++	 * For incoming packets we still decrement the TTL,
++	 * since such an skb is not cloned and so does not
++	 * require an actual cow. Thus there is at least one
++	 * place in the packet path with a mandatory TTL
++	 * decrement, which is sufficient to prevent routing loops.
++	 */
++	iph = ip_hdr(skb);
++	if (
++#ifdef CONFIG_IP_ROUTE_NAT
++	    (rt->rt_flags & RTCF_NAT) == 0 &&	  /* no NAT mangling expected */
++#endif						  /* and */
++	    (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */
++		goto no_ttl_decr;
++
+ 	/* We are about to mangle packet. Copy it! */
+ 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ 		goto drop;
+@@ -101,6 +119,8 @@ int ip_forward(struct sk_buff *skb)
+ 	/* Decrease ttl after skb cow done */
+ 	ip_decrease_ttl(iph);
+ 
++no_ttl_decr:
++
+ 	/*
+ 	 *	We now generate an ICMP HOST REDIRECT giving the route
+ 	 *	we calculated.
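+
+[Read together with the surrounding context lines, the hunk leaves
+ip_forward() with the following fast path — a condensed fragment of
+the code above, with the CONFIG_IP_ROUTE_NAT guard folded into the
+test:
+
+	/* fragment: condensed control flow after this hunk */
+	iph = ip_hdr(skb);
+	if (!(rt->rt_flags & RTCF_NAT) &&		/* no NAT mangling */
+	    (skb->dev->features & NETIF_F_VENET))	/* src is a VE     */
+		goto no_ttl_decr;			/* skip cow + decr */
+
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev) +
+			 rt->u.dst.header_len))
+		goto drop;
+	iph = ip_hdr(skb);
+	ip_decrease_ttl(iph);
+
+no_ttl_decr:
+	/* ...ICMP redirect generation and transmit follow... */
+]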
+diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
+index 37221f6..b74fe5e 100644
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -188,9 +188,12 @@ static void ip_evictor(struct net *net)
+  */
+ static void ip_expire(unsigned long arg)
+ {
++	struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
+ 	struct ipq *qp;
++	struct ve_struct *old_ve;
+ 
+-	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
++	qp = container_of(q, struct ipq, q);
++	old_ve = set_exec_env(q->owner_ve);
+ 
+ 	spin_lock(&qp->q.lock);
+ 
+@@ -216,6 +219,8 @@ static void ip_expire(unsigned long arg)
+ out:
+ 	spin_unlock(&qp->q.lock);
+ 	ipq_put(qp);
++
++	(void)set_exec_env(old_ve);
+ }
+ 
+ /* Find the correct entry in the "incomplete datagrams" queue for
+@@ -523,6 +528,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+ 		clone->csum = 0;
+ 		clone->ip_summed = head->ip_summed;
+ 		atomic_add(clone->truesize, &qp->q.net->mem);
++		clone->owner_env = head->owner_env;
+ 	}
+ 
+ 	skb_shinfo(head)->frag_list = head->next;
+diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
+index ff77a4a..6408845 100644
+--- a/net/ipv4/ip_input.c
++++ b/net/ipv4/ip_input.c
+@@ -201,6 +201,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
+ {
+ 	struct net *net = dev_net(skb->dev);
+ 
++	if (skb->destructor)
++		skb_orphan(skb);
+ 	__skb_pull(skb, ip_hdrlen(skb));
+ 
+ 	/* Point into the IP datagram, just past the header. */
+diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
+index e527628..adc6892 100644
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -1346,12 +1346,13 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
+ 		char			data[40];
+ 	} replyopts;
+ 	struct ipcm_cookie ipc;
+-	__be32 daddr;
++	__be32 saddr, daddr;
+ 	struct rtable *rt = skb->rtable;
+ 
+ 	if (ip_options_echo(&replyopts.opt, skb))
+ 		return;
+ 
++	saddr = ip_hdr(skb)->daddr;
+ 	daddr = ipc.addr = rt->rt_src;
+ 	ipc.opt = NULL;
+ 
+@@ -1366,7 +1367,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
+ 		struct flowi fl = { .oif = arg->bound_dev_if,
+ 				    .nl_u = { .ip4_u =
+ 					      { .daddr = daddr,
+-						.saddr = rt->rt_spec_dst,
++						.saddr = saddr,
+ 						.tos = RT_TOS(ip_hdr(skb)->tos) } },
+ 				    /* Not quite clean, but right. */
+ 				    .uli_u = { .ports =
+diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
+index ed45037..6fab8a3 100644
+--- a/net/ipv4/ipconfig.c
++++ b/net/ipv4/ipconfig.c
+@@ -189,19 +189,20 @@ static int __init ic_open_devs(void)
+ 	struct ic_device *d, **last;
+ 	struct net_device *dev;
+ 	unsigned short oflags;
++	struct net *net = get_exec_env()->ve_netns;
+ 
+ 	last = &ic_first_dev;
+ 	rtnl_lock();
+ 
+ 	/* bring loopback device up first */
+-	for_each_netdev(&init_net, dev) {
++	for_each_netdev(net, dev) {
+ 		if (!(dev->flags & IFF_LOOPBACK))
+ 			continue;
+ 		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ 			printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ 	}
+ 
+-	for_each_netdev(&init_net, dev) {
++	for_each_netdev(net, dev) {
+ 		if (dev->flags & IFF_LOOPBACK)
+ 			continue;
+ 		if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+@@ -434,9 +435,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
+ 	unsigned char *sha, *tha;		/* s for "source", t for "target" */
+ 	struct ic_device *d;
+ 
+-	if (dev_net(dev) != &init_net)
+-		goto drop;
+-
+ 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ 		return NET_RX_DROP;
+ 
+@@ -854,9 +852,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
+ 	struct ic_device *d;
+ 	int len, ext_len;
+ 
+-	if (dev_net(dev) != &init_net)
+-		goto drop;
+-
+ 	/* Perform verifications before taking the lock.  */
+ 	if (skb->pkt_type == PACKET_OTHERHOST)
+ 		goto drop;
+diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
+index 11700a4..247d1cc 100644
+--- a/net/ipv4/ipmr.c
++++ b/net/ipv4/ipmr.c
+@@ -42,6 +42,7 @@
+ #include <linux/in.h>
+ #include <linux/inet.h>
+ #include <linux/netdevice.h>
++#include <linux/nsproxy.h>
+ #include <linux/inetdevice.h>
+ #include <linux/igmp.h>
+ #include <linux/proc_fs.h>
+@@ -123,9 +124,10 @@ static struct timer_list ipmr_expire_timer;
+ static
+ struct net_device *ipmr_new_tunnel(struct vifctl *v)
+ {
++	struct net *net = get_exec_env()->ve_netns;
+ 	struct net_device  *dev;
+ 
+-	dev = __dev_get_by_name(&init_net, "tunl0");
++	dev = __dev_get_by_name(net, "tunl0");
+ 
+ 	if (dev) {
+ 		int err;
+@@ -149,7 +151,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
+ 
+ 		dev = NULL;
+ 
+-		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
++		if (err == 0 && (dev = __dev_get_by_name(net, p.name)) != NULL) {
+ 			dev->flags |= IFF_MULTICAST;
+ 
+ 			in_dev = __in_dev_get_rtnl(dev);
+@@ -1089,9 +1091,6 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
+ 	struct vif_device *v;
+ 	int ct;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	if (event != NETDEV_UNREGISTER)
+ 		return NOTIFY_DONE;
+ 	v=&vif_table[0];
+diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
+index 65f1ba1..5f223e8 100644
+--- a/net/ipv4/ipvs/ip_vs_conn.c
++++ b/net/ipv4/ipvs/ip_vs_conn.c
+@@ -981,7 +981,7 @@ int ip_vs_conn_init(void)
+ 	/* Allocate ip_vs_conn slab cache */
+ 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ 					      sizeof(struct ip_vs_conn), 0,
+-					      SLAB_HWCACHE_ALIGN, NULL);
++					      SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
+ 	if (!ip_vs_conn_cachep) {
+ 		vfree(ip_vs_conn_tab);
+ 		return -ENOMEM;
+diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
+index eff54ef..f045d56 100644
+--- a/net/ipv4/ipvs/ip_vs_sync.c
++++ b/net/ipv4/ipvs/ip_vs_sync.c
+@@ -23,6 +23,7 @@
+ #include <linux/slab.h>
+ #include <linux/inetdevice.h>
+ #include <linux/net.h>
++#include <linux/nsproxy.h>
+ #include <linux/completion.h>
+ #include <linux/delay.h>
+ #include <linux/skbuff.h>
+@@ -464,7 +465,8 @@ static int set_mcast_if(struct sock *sk, char *ifname)
+ 	struct net_device *dev;
+ 	struct inet_sock *inet = inet_sk(sk);
+ 
+-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
++	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
++	if (!dev)
+ 		return -ENODEV;
+ 
+ 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+@@ -485,11 +487,12 @@ static int set_mcast_if(struct sock *sk, char *ifname)
+  */
+ static int set_sync_mesg_maxlen(int sync_state)
+ {
++	struct net *net = get_exec_env()->ve_netns;
+ 	struct net_device *dev;
+ 	int num;
+ 
+ 	if (sync_state == IP_VS_STATE_MASTER) {
+-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
++		if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL)
+ 			return -ENODEV;
+ 
+ 		num = (dev->mtu - sizeof(struct iphdr) -
+@@ -500,7 +503,7 @@ static int set_sync_mesg_maxlen(int sync_state)
+ 		IP_VS_DBG(7, "setting the maximum length of sync sending "
+ 			  "message %d.\n", sync_send_mesg_maxlen);
+ 	} else if (sync_state == IP_VS_STATE_BACKUP) {
+-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
++		if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL)
+ 			return -ENODEV;
+ 
+ 		sync_recv_mesg_maxlen = dev->mtu -
+@@ -528,7 +531,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+ 	memset(&mreq, 0, sizeof(mreq));
+ 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+ 
+-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
++	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
++	if (!dev)
+ 		return -ENODEV;
+ 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ 		return -EINVAL;
+@@ -549,7 +553,8 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
+ 	__be32 addr;
+ 	struct sockaddr_in sin;
+ 
+-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
++	dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
++	if (!dev)
+ 		return -ENODEV;
+ 
+ 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
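+
+[The same substitution repeats through ipconfig.c, ipmr.c and
+ip_vs_sync.c above: device lookups that were hard-wired to &init_net
+now resolve in the calling VE's network namespace. The idiom, in
+isolation:
+
+	/* per-VE device lookup, as used repeatedly above (sketch) */
+	struct net *net = get_exec_env()->ve_netns;	/* VE's netns     */
+	struct net_device *dev;
+
+	dev = __dev_get_by_name(net, ifname);		/* was: &init_net */
+	if (!dev)
+		return -ENODEV;
+]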
+diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
+index 26a37ce..5ac56af 100644
+--- a/net/ipv4/netfilter/ip_queue.c
++++ b/net/ipv4/netfilter/ip_queue.c
+@@ -439,7 +439,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
+ 	if (type <= IPQM_BASE)
+ 		return;
+ 
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		RCV_SKB_FAIL(-EPERM);
+ 
+ 	write_lock_bh(&queue_lock);
+@@ -469,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
+ static void
+ ipq_rcv_skb(struct sk_buff *skb)
+ {
++	struct ve_struct *old_ve;
++
+ 	mutex_lock(&ipqnl_mutex);
++	old_ve = set_exec_env(skb->owner_env);
+ 	__ipq_rcv_skb(skb);
++	(void)set_exec_env(old_ve);
+ 	mutex_unlock(&ipqnl_mutex);
+ }
+ 
+@@ -480,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block *this,
+ {
+ 	struct net_device *dev = ptr;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	/* Drop any packets associated with the downed device */
+ 	if (event == NETDEV_DOWN)
+ 		ipq_dev_drop(dev->ifindex);
+@@ -502,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
+ 	if (event == NETLINK_URELEASE &&
+ 	    n->protocol == NETLINK_FIREWALL && n->pid) {
+ 		write_lock_bh(&queue_lock);
+-		if ((n->net == &init_net) && (n->pid == peer_pid))
++		if (n->pid == peer_pid)
+ 			__ipq_reset();
+ 		write_unlock_bh(&queue_lock);
+ 	}
+diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
+index 4e7c719..18e2717 100644
+--- a/net/ipv4/netfilter/ip_tables.c
++++ b/net/ipv4/netfilter/ip_tables.c
+@@ -337,6 +337,9 @@ ipt_do_table(struct sk_buff *skb,
+ 	struct ipt_entry *e, *back;
+ 	struct xt_table_info *private;
+ 
++	if (!table)		/* VE is not allowed to have this xtable */
++		return NF_ACCEPT;
++
+ 	/* Initialization */
+ 	ip = ip_hdr(skb);
+ 	datalen = skb->len - ip->ihl * 4;
+@@ -488,8 +491,8 @@ mark_source_chains(struct xt_table_info *newinfo,
+ 			int visited = e->comefrom & (1 << hook);
+ 
+ 			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+-				printk("iptables: loop hook %u pos %u %08X.\n",
+-				       hook, pos, e->comefrom);
++				ve_printk(VE_LOG, "iptables: loop hook %u pos "
++					"%u %08X.\n", hook, pos, e->comefrom);
+ 				return 0;
+ 			}
+ 			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+@@ -932,7 +935,7 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
+ 	   (other than comefrom, which userspace doesn't care
+ 	   about). */
+ 	countersize = sizeof(struct xt_counters) * private->number;
+-	counters = vmalloc_node(countersize, numa_node_id());
++	counters = ub_vmalloc_node(countersize, numa_node_id());
+ 
+ 	if (counters == NULL)
+ 		return ERR_PTR(-ENOMEM);
+@@ -1202,7 +1205,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ 	void *loc_cpu_old_entry;
+ 
+ 	ret = 0;
+-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
++	counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters));
+ 	if (!counters) {
+ 		ret = -ENOMEM;
+ 		goto out;
+@@ -1374,7 +1377,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
+ 	if (len != size + num_counters * sizeof(struct xt_counters))
+ 		return -EINVAL;
+ 
+-	paddc = vmalloc_node(len - size, numa_node_id());
++	paddc = ub_vmalloc_node(len - size, numa_node_id());
+ 	if (!paddc)
+ 		return -ENOMEM;
+ 
+@@ -1841,13 +1844,15 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
+ 	return ret;
+ }
+ 
++static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int);
++
+ static int
+ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
+ 		      unsigned int len)
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -1860,8 +1865,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
+ 		break;
+ 
+ 	default:
+-		duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
+-		ret = -EINVAL;
++		ret = do_ipt_set_ctl(sk, cmd, user, len);
+ 	}
+ 
+ 	return ret;
+@@ -1958,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -1980,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -2005,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -2057,7 +2061,7 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *table,
+ 	int ret;
+ 	struct xt_table_info *newinfo;
+ 	struct xt_table_info bootstrap
+-		= { 0, 0, 0, { 0 }, { 0 }, { } };
++		= { 0, 0, 0, 0, { 0 }, { 0 }, { } };
+ 	void *loc_cpu_entry;
+ 	struct xt_table *new_table;
+ 
+@@ -2216,11 +2220,22 @@ static struct xt_match icmp_matchstruct __read_mostly = {
+ 
+ static int __net_init ip_tables_net_init(struct net *net)
+ {
+-	return xt_proto_init(net, AF_INET);
++	int res;
++
++	if (!net_ipt_module_permitted(net, VE_IP_IPTABLES))
++		return 0;
++
++	res = xt_proto_init(net, AF_INET);
++	if (!res)
++		net_ipt_module_set(net, VE_IP_IPTABLES);
++	return res;
+ }
+ 
+ static void __net_exit ip_tables_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES))
++		return;
++
+ 	xt_proto_fini(net, AF_INET);
+ }
+ 
+diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
+index 1819ad7..25223a8 100644
+--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
++++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
+@@ -20,6 +20,7 @@
+ #include <linux/icmp.h>
+ #include <linux/if_arp.h>
+ #include <linux/seq_file.h>
++#include <linux/nsproxy.h>
+ #include <linux/netfilter_arp.h>
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -388,7 +389,8 @@ clusterip_tg_check(const char *tablename, const void *e_void,
+ 				return false;
+ 			}
+ 
+-			dev = dev_get_by_name(&init_net, e->ip.iniface);
++			dev = dev_get_by_name(get_exec_env()->ve_netns,
++						e->ip.iniface);
+ 			if (!dev) {
+ 				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+ 				return false;
+diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
+index 0af1413..08a4bcd 100644
+--- a/net/ipv4/netfilter/ipt_LOG.c
++++ b/net/ipv4/netfilter/ipt_LOG.c
+@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
+ 
+ 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ 	if (ih == NULL) {
+-		printk("TRUNCATED");
++		ve_printk(VE_LOG, "TRUNCATED");
+ 		return;
+ 	}
+ 
+ 	/* Important fields:
+ 	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ 	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+-	printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
++	ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
+ 	       NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
+ 
+ 	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+-	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
++	ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ 	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ 	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+ 
+ 	/* Max length: 6 "CE DF MF " */
+ 	if (ntohs(ih->frag_off) & IP_CE)
+-		printk("CE ");
++		ve_printk(VE_LOG, "CE ");
+ 	if (ntohs(ih->frag_off) & IP_DF)
+-		printk("DF ");
++		ve_printk(VE_LOG, "DF ");
+ 	if (ntohs(ih->frag_off) & IP_MF)
+-		printk("MF ");
++		ve_printk(VE_LOG, "MF ");
+ 
+ 	/* Max length: 11 "FRAG:65535 " */
+ 	if (ntohs(ih->frag_off) & IP_OFFSET)
+-		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
++		ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+ 
+ 	if ((logflags & IPT_LOG_IPOPT)
+ 	    && ih->ihl * 4 > sizeof(struct iphdr)) {
+@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ 					optsize, _opt);
+ 		if (op == NULL) {
+-			printk("TRUNCATED");
++			ve_printk(VE_LOG, "TRUNCATED");
+ 			return;
+ 		}
+ 
+ 		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+-		printk("OPT (");
++		ve_printk(VE_LOG, "OPT (");
+ 		for (i = 0; i < optsize; i++)
+-			printk("%02X", op[i]);
+-		printk(") ");
++			ve_printk(VE_LOG, "%02X", op[i]);
++		ve_printk(VE_LOG, ") ");
+ 	}
+ 
+ 	switch (ih->protocol) {
+@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		const struct tcphdr *th;
+ 
+ 		/* Max length: 10 "PROTO=TCP " */
+-		printk("PROTO=TCP ");
++		ve_printk(VE_LOG, "PROTO=TCP ");
+ 
+ 		if (ntohs(ih->frag_off) & IP_OFFSET)
+ 			break;
+@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ 					sizeof(_tcph), &_tcph);
+ 		if (th == NULL) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+ 
+ 		/* Max length: 20 "SPT=65535 DPT=65535 " */
+-		printk("SPT=%u DPT=%u ",
++		ve_printk(VE_LOG, "SPT=%u DPT=%u ",
+ 		       ntohs(th->source), ntohs(th->dest));
+ 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ 		if (logflags & IPT_LOG_TCPSEQ)
+-			printk("SEQ=%u ACK=%u ",
++			ve_printk(VE_LOG, "SEQ=%u ACK=%u ",
+ 			       ntohl(th->seq), ntohl(th->ack_seq));
+ 		/* Max length: 13 "WINDOW=65535 " */
+-		printk("WINDOW=%u ", ntohs(th->window));
++		ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window));
+ 		/* Max length: 9 "RES=0x3F " */
+-		printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
++		ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+ 		if (th->cwr)
+-			printk("CWR ");
++			ve_printk(VE_LOG, "CWR ");
+ 		if (th->ece)
+-			printk("ECE ");
++			ve_printk(VE_LOG, "ECE ");
+ 		if (th->urg)
+-			printk("URG ");
++			ve_printk(VE_LOG, "URG ");
+ 		if (th->ack)
+-			printk("ACK ");
++			ve_printk(VE_LOG, "ACK ");
+ 		if (th->psh)
+-			printk("PSH ");
++			ve_printk(VE_LOG, "PSH ");
+ 		if (th->rst)
+-			printk("RST ");
++			ve_printk(VE_LOG, "RST ");
+ 		if (th->syn)
+-			printk("SYN ");
++			ve_printk(VE_LOG, "SYN ");
+ 		if (th->fin)
+-			printk("FIN ");
++			ve_printk(VE_LOG, "FIN ");
+ 		/* Max length: 11 "URGP=65535 " */
+-		printk("URGP=%u ", ntohs(th->urg_ptr));
++		ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr));
+ 
+ 		if ((logflags & IPT_LOG_TCPOPT)
+ 		    && th->doff * 4 > sizeof(struct tcphdr)) {
+@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
+ 						iphoff+ih->ihl*4+sizeof(_tcph),
+ 						optsize, _opt);
+ 			if (op == NULL) {
+-				printk("TRUNCATED");
++				ve_printk(VE_LOG, "TRUNCATED");
+ 				return;
+ 			}
+ 
+ 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
+-			printk("OPT (");
++			ve_printk(VE_LOG, "OPT (");
+ 			for (i = 0; i < optsize; i++)
+-				printk("%02X", op[i]);
+-			printk(") ");
++				ve_printk(VE_LOG, "%02X", op[i]);
++			ve_printk(VE_LOG, ") ");
+ 		}
+ 		break;
+ 	}
+@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
+ 
+ 		if (ih->protocol == IPPROTO_UDP)
+ 			/* Max length: 10 "PROTO=UDP "     */
+-			printk("PROTO=UDP " );
++			ve_printk(VE_LOG, "PROTO=UDP " );
+ 		else	/* Max length: 14 "PROTO=UDPLITE " */
+-			printk("PROTO=UDPLITE ");
++			ve_printk(VE_LOG, "PROTO=UDPLITE ");
+ 
+ 		if (ntohs(ih->frag_off) & IP_OFFSET)
+ 			break;
+@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ 					sizeof(_udph), &_udph);
+ 		if (uh == NULL) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+ 
+ 		/* Max length: 20 "SPT=65535 DPT=65535 " */
+-		printk("SPT=%u DPT=%u LEN=%u ",
++		ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ",
+ 		       ntohs(uh->source), ntohs(uh->dest),
+ 		       ntohs(uh->len));
+ 		break;
+@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
+ 			    [ICMP_ADDRESSREPLY] = 12 };
+ 
+ 		/* Max length: 11 "PROTO=ICMP " */
+-		printk("PROTO=ICMP ");
++		ve_printk(VE_LOG, "PROTO=ICMP ");
+ 
+ 		if (ntohs(ih->frag_off) & IP_OFFSET)
+ 			break;
+@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ 					 sizeof(_icmph), &_icmph);
+ 		if (ich == NULL) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+ 
+ 		/* Max length: 18 "TYPE=255 CODE=255 " */
+-		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
++		ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code);
+ 
+ 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ 		if (ich->type <= NR_ICMP_TYPES
+ 		    && required_len[ich->type]
+ 		    && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+@@ -250,19 +250,19 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		case ICMP_ECHOREPLY:
+ 		case ICMP_ECHO:
+ 			/* Max length: 19 "ID=65535 SEQ=65535 " */
+-			printk("ID=%u SEQ=%u ",
++			ve_printk(VE_LOG, "ID=%u SEQ=%u ",
+ 			       ntohs(ich->un.echo.id),
+ 			       ntohs(ich->un.echo.sequence));
+ 			break;
+ 
+ 		case ICMP_PARAMETERPROB:
+ 			/* Max length: 14 "PARAMETER=255 " */
+-			printk("PARAMETER=%u ",
++			ve_printk(VE_LOG, "PARAMETER=%u ",
+ 			       ntohl(ich->un.gateway) >> 24);
+ 			break;
+ 		case ICMP_REDIRECT:
+ 			/* Max length: 24 "GATEWAY=255.255.255.255 " */
+-			printk("GATEWAY=%u.%u.%u.%u ",
++			ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ",
+ 			       NIPQUAD(ich->un.gateway));
+ 			/* Fall through */
+ 		case ICMP_DEST_UNREACH:
+@@ -270,16 +270,16 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		case ICMP_TIME_EXCEEDED:
+ 			/* Max length: 3+maxlen */
+ 			if (!iphoff) { /* Only recurse once. */
+-				printk("[");
++				ve_printk(VE_LOG, "[");
+ 				dump_packet(info, skb,
+ 					    iphoff + ih->ihl*4+sizeof(_icmph));
+-				printk("] ");
++				ve_printk(VE_LOG, "] ");
+ 			}
+ 
+ 			/* Max length: 10 "MTU=65535 " */
+ 			if (ich->type == ICMP_DEST_UNREACH
+ 			    && ich->code == ICMP_FRAG_NEEDED)
+-				printk("MTU=%u ", ntohs(ich->un.frag.mtu));
++				ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu));
+ 		}
+ 		break;
+ 	}
+@@ -292,19 +292,19 @@ static void dump_packet(const struct nf_loginfo *info,
+ 			break;
+ 
+ 		/* Max length: 9 "PROTO=AH " */
+-		printk("PROTO=AH ");
++		ve_printk(VE_LOG, "PROTO=AH ");
+ 
+ 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ 		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ 					sizeof(_ahdr), &_ahdr);
+ 		if (ah == NULL) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+ 
+ 		/* Length: 15 "SPI=0xF1234567 " */
+-		printk("SPI=0x%x ", ntohl(ah->spi));
++		ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi));
+ 		break;
+ 	}
+ 	case IPPROTO_ESP: {
+@@ -312,7 +312,7 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		const struct ip_esp_hdr *eh;
+ 
+ 		/* Max length: 10 "PROTO=ESP " */
+-		printk("PROTO=ESP ");
++		ve_printk(VE_LOG, "PROTO=ESP ");
+ 
+ 		if (ntohs(ih->frag_off) & IP_OFFSET)
+ 			break;
+@@ -321,25 +321,25 @@ static void dump_packet(const struct nf_loginfo *info,
+ 		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ 					sizeof(_esph), &_esph);
+ 		if (eh == NULL) {
+-			printk("INCOMPLETE [%u bytes] ",
++			ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
+ 			       skb->len - iphoff - ih->ihl*4);
+ 			break;
+ 		}
+ 
+ 		/* Length: 15 "SPI=0xF1234567 " */
+-		printk("SPI=0x%x ", ntohl(eh->spi));
++		ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi));
+ 		break;
+ 	}
+ 	/* Max length: 10 "PROTO 255 " */
+ 	default:
+-		printk("PROTO=%u ", ih->protocol);
++		ve_printk(VE_LOG, "PROTO=%u ", ih->protocol);
+ 	}
+ 
+ 	/* Max length: 15 "UID=4294967295 " */
+ 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ 		read_lock_bh(&skb->sk->sk_callback_lock);
+ 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+-			printk("UID=%u GID=%u ",
++			ve_printk(VE_LOG, "UID=%u GID=%u ",
+ 				skb->sk->sk_socket->file->f_uid,
+ 				skb->sk->sk_socket->file->f_gid);
+ 		read_unlock_bh(&skb->sk->sk_callback_lock);
+@@ -387,7 +387,7 @@ ipt_log_packet(unsigned int pf,
+ 		loginfo = &default_loginfo;
+ 
+ 	spin_lock_bh(&log_lock);
+-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
++	ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ 	       prefix,
+ 	       in ? in->name : "",
+ 	       out ? out->name : "");
+@@ -398,30 +398,30 @@ ipt_log_packet(unsigned int pf,
+ 
+ 		physindev = skb->nf_bridge->physindev;
+ 		if (physindev && in != physindev)
+-			printk("PHYSIN=%s ", physindev->name);
++			ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name);
+ 		physoutdev = skb->nf_bridge->physoutdev;
+ 		if (physoutdev && out != physoutdev)
+-			printk("PHYSOUT=%s ", physoutdev->name);
++			ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name);
+ 	}
+ #endif
+ 
+ 	if (in && !out) {
+ 		/* MAC logging for input chain only. */
+-		printk("MAC=");
++		ve_printk(VE_LOG, "MAC=");
+ 		if (skb->dev && skb->dev->hard_header_len
+ 		    && skb->mac_header != skb->network_header) {
+ 			int i;
+ 			const unsigned char *p = skb_mac_header(skb);
+ 			for (i = 0; i < skb->dev->hard_header_len; i++,p++)
+-				printk("%02x%c", *p,
++				ve_printk(VE_LOG, "%02x%c", *p,
+ 				       i==skb->dev->hard_header_len - 1
+ 				       ? ' ':':');
+ 		} else
+-			printk(" ");
++			ve_printk(VE_LOG, " ");
+ 	}
+ 
+ 	dump_packet(loginfo, skb, 0);
+-	printk("\n");
++	ve_printk(VE_LOG, "\n");
+ 	spin_unlock_bh(&log_lock);
+ }
+ 
+diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
+index 84c26dd..85e4a69 100644
+--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
++++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
+@@ -98,6 +98,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in,
+ 	return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
+ }
+ 
++#if 0
+ static int
+ device_cmp(struct nf_conn *i, void *ifindex)
+ {
+@@ -120,9 +121,6 @@ static int masq_device_event(struct notifier_block *this,
+ {
+ 	const struct net_device *dev = ptr;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	if (event == NETDEV_DOWN) {
+ 		/* Device was downed.  Search entire table for
+ 		   conntracks which were associated with that device,
+@@ -150,6 +148,7 @@ static struct notifier_block masq_dev_notifier = {
+ static struct notifier_block masq_inet_notifier = {
+ 	.notifier_call	= masq_inet_event,
+ };
++#endif
+ 
+ static struct xt_target masquerade_tg_reg __read_mostly = {
+ 	.name		= "MASQUERADE",
+@@ -168,12 +167,16 @@ static int __init masquerade_tg_init(void)
+ 
+ 	ret = xt_register_target(&masquerade_tg_reg);
+ 
++#if 0
++/*	These notifiers are unnecessary and may
++	lead to an oops in virtual environments */
+ 	if (ret == 0) {
+ 		/* Register for device down reports */
+ 		register_netdevice_notifier(&masq_dev_notifier);
+ 		/* Register IP address change reports */
+ 		register_inetaddr_notifier(&masq_inet_notifier);
+ 	}
++#endif
+ 
+ 	return ret;
+ }
+@@ -181,8 +184,8 @@ static int __init masquerade_tg_init(void)
+ static void __exit masquerade_tg_exit(void)
+ {
+ 	xt_unregister_target(&masquerade_tg_reg);
+-	unregister_netdevice_notifier(&masq_dev_notifier);
+-	unregister_inetaddr_notifier(&masq_inet_notifier);
++/*	unregister_netdevice_notifier(&masq_dev_notifier);
++	unregister_inetaddr_notifier(&masq_inet_notifier);*/
+ }
+ 
+ module_init(masquerade_tg_init);
+diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
+index 5c62924..99dfc92 100644
+--- a/net/ipv4/netfilter/ipt_REDIRECT.c
++++ b/net/ipv4/netfilter/ipt_REDIRECT.c
+@@ -72,8 +72,13 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in,
+ 
+ 		rcu_read_lock();
+ 		indev = __in_dev_get_rcu(skb->dev);
+-		if (indev && (ifa = indev->ifa_list))
++		if (indev && (ifa = indev->ifa_list)) {
++			/* Because of venet device specifics, we should
++			 * use the second ifa in the list */
++			if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next)
++				ifa = ifa->ifa_next;
+ 			newdst = ifa->ifa_local;
++		}
+ 		rcu_read_unlock();
+ 
+ 		if (!newdst)
+diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
+index 2639872..6b1fcf8 100644
+--- a/net/ipv4/netfilter/ipt_REJECT.c
++++ b/net/ipv4/netfilter/ipt_REJECT.c
+@@ -186,13 +186,13 @@ reject_tg_check(const char *tablename, const void *e_void,
+ 	const struct ipt_entry *e = e_void;
+ 
+ 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+-		printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
++		ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n");
+ 		return false;
+ 	} else if (rejinfo->with == IPT_TCP_RESET) {
+ 		/* Must specify that it's a TCP packet */
+ 		if (e->ip.proto != IPPROTO_TCP
+ 		    || (e->ip.invflags & XT_INV_PROTO)) {
+-			printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
++			ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n");
+ 			return false;
+ 		}
+ 	}
+diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
+index 21cb053..43d5667 100644
+--- a/net/ipv4/netfilter/ipt_recent.c
++++ b/net/ipv4/netfilter/ipt_recent.c
+@@ -14,6 +14,7 @@
+ #include <linux/init.h>
+ #include <linux/ip.h>
+ #include <linux/moduleparam.h>
++#include <linux/nsproxy.h>
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+ #include <linux/string.h>
+@@ -52,6 +53,19 @@ MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
+ MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
+ MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
+ 
++#include <linux/sched.h>
++
++#if defined(CONFIG_VE_IPTABLES)
++#define tables		(get_exec_env()->_ipt_recent->tables)
++#define proc_dir	(get_exec_env()->_ipt_recent->proc_dir)
++#else
++static LIST_HEAD(tables);
++static struct proc_dir_entry	*proc_dir;
++#endif /* CONFIG_VE_IPTABLES */
++
++static int init_ipt_recent(struct ve_struct *ve);
++static void fini_ipt_recent(struct ve_struct *ve);
++
+ struct recent_entry {
+ 	struct list_head	list;
+ 	struct list_head	lru_list;
+@@ -74,12 +88,10 @@ struct recent_table {
+ 	struct list_head	iphash[0];
+ };
+ 
+-static LIST_HEAD(tables);
+ static DEFINE_SPINLOCK(recent_lock);
+ static DEFINE_MUTEX(recent_mutex);
+ 
+ #ifdef CONFIG_PROC_FS
+-static struct proc_dir_entry	*proc_dir;
+ static const struct file_operations	recent_fops;
+ #endif
+ 
+@@ -258,6 +270,9 @@ recent_mt_check(const char *tablename, const void *ip,
+ 	    strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
+ 		return false;
+ 
++	if (init_ipt_recent(get_exec_env()))
++		return false;
++
+ 	mutex_lock(&recent_mutex);
+ 	t = recent_table_lookup(info->name);
+ 	if (t != NULL) {
+@@ -298,6 +313,13 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
+ {
+ 	const struct ipt_recent_info *info = matchinfo;
+ 	struct recent_table *t;
++	struct ve_struct *ve;
++
++	ve = get_exec_env();
++#ifdef CONFIG_VE_IPTABLES
++	if (!ve->_ipt_recent)
++		return;
++#endif
+ 
+ 	mutex_lock(&recent_mutex);
+ 	t = recent_table_lookup(info->name);
+@@ -312,6 +334,8 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
+ 		kfree(t);
+ 	}
+ 	mutex_unlock(&recent_mutex);
++	if (!ve_is_super(ve) && list_empty(&tables))
++		fini_ipt_recent(ve);
+ }
+ 
+ #ifdef CONFIG_PROC_FS
+@@ -467,6 +491,49 @@ static struct xt_match recent_mt_reg __read_mostly = {
+ 	.me		= THIS_MODULE,
+ };
+ 
++static int init_ipt_recent(struct ve_struct *ve)
++{
++	int err = 0;
++
++#ifdef CONFIG_VE_IPTABLES
++	if (ve->_ipt_recent)
++		return 0;
++
++	ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL);
++	if (!ve->_ipt_recent) {
++		err = -ENOMEM;
++		goto out;
++	}
++
++	INIT_LIST_HEAD(&tables);
++#endif
++#ifdef CONFIG_PROC_FS
++	if (err)
++		return err;
++	proc_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net);
++	if (proc_dir == NULL) {
++		err = -ENOMEM;
++		goto out_mem;
++	}
++#endif
++out:
++	return err;
++out_mem:
++#ifdef CONFIG_VE_IPTABLES
++	kfree(ve->_ipt_recent);
++#endif
++	goto out;
++}
++
++static void fini_ipt_recent(struct ve_struct *ve)
++{
++	remove_proc_entry("ipt_recent", ve->ve_netns->proc_net);
++#ifdef CONFIG_VE_IPTABLES
++	kfree(ve->_ipt_recent);
++	ve->_ipt_recent = NULL;
++#endif
++}
++
+ static int __init recent_mt_init(void)
+ {
+ 	int err;
+@@ -476,25 +543,24 @@ static int __init recent_mt_init(void)
+ 	ip_list_hash_size = 1 << fls(ip_list_tot);
+ 
+ 	err = xt_register_match(&recent_mt_reg);
+-#ifdef CONFIG_PROC_FS
+ 	if (err)
+ 		return err;
+-	proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
+-	if (proc_dir == NULL) {
++
++	err = init_ipt_recent(&ve0);
++	if (err) {
+ 		xt_unregister_match(&recent_mt_reg);
+-		err = -ENOMEM;
++		return err;
+ 	}
+-#endif
+-	return err;
++
++	return 0;
+ }
+ 
+ static void __exit recent_mt_exit(void)
+ {
+ 	BUG_ON(!list_empty(&tables));
++
++	fini_ipt_recent(&ve0);
+ 	xt_unregister_match(&recent_mt_reg);
+-#ifdef CONFIG_PROC_FS
+-	remove_proc_entry("ipt_recent", init_net.proc_net);
+-#endif
+ }
+ 
+ module_init(recent_mt_init);
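+
+[The two #defines near the top of this ipt_recent hunk dereference
+get_exec_env()->_ipt_recent. The structure itself is defined elsewhere
+in the patch, but from the two fields used here and the
+kzalloc(sizeof(struct ve_ipt_recent)) in init_ipt_recent() it must
+contain at least:
+
+	/* Inferred from usage in this hunk; the authoritative
+	 * definition lives elsewhere in the OpenVZ patch. */
+	struct ve_ipt_recent {
+		struct list_head	tables;	  /* per-VE recent tables */
+		struct proc_dir_entry	*proc_dir; /* /proc/net/ipt_recent */
+	};
+]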
+diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
+index 1ea677d..12c4c2b 100644
+--- a/net/ipv4/netfilter/iptable_filter.c
++++ b/net/ipv4/netfilter/iptable_filter.c
+@@ -134,16 +134,24 @@ module_param(forward, bool, 0000);
+ 
+ static int __net_init iptable_filter_net_init(struct net *net)
+ {
++	if (!net_ipt_module_permitted(net, VE_IP_FILTER))
++		return 0;
++
+ 	/* Register table */
+ 	net->ipv4.iptable_filter =
+ 		ipt_register_table(net, &packet_filter, &initial_table.repl);
+ 	if (IS_ERR(net->ipv4.iptable_filter))
+ 		return PTR_ERR(net->ipv4.iptable_filter);
++
++	net_ipt_module_set(net, VE_IP_FILTER);
+ 	return 0;
+ }
+ 
+ static void __net_exit iptable_filter_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_FILTER))
++		return;
++
+ 	ipt_unregister_table(net->ipv4.iptable_filter);
+ }
+ 
+diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
+index da59182..f6343d8 100644
+--- a/net/ipv4/netfilter/iptable_mangle.c
++++ b/net/ipv4/netfilter/iptable_mangle.c
+@@ -203,16 +203,24 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = {
+ 
+ static int __net_init iptable_mangle_net_init(struct net *net)
+ {
++	if (!net_ipt_module_permitted(net, VE_IP_MANGLE))
++		return 0;
++
+ 	/* Register table */
+ 	net->ipv4.iptable_mangle =
+ 		ipt_register_table(net, &packet_mangler, &initial_table.repl);
+ 	if (IS_ERR(net->ipv4.iptable_mangle))
+ 		return PTR_ERR(net->ipv4.iptable_mangle);
++
++	net_ipt_module_set(net, VE_IP_MANGLE);
+ 	return 0;
+ }
+ 
+ static void __net_exit iptable_mangle_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
++		return;
++
+ 	ipt_unregister_table(net->ipv4.iptable_mangle);
+ }
+ 
+diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+index 5a955c4..dca8da7 100644
+--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
++++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+@@ -15,6 +15,7 @@
+ #include <linux/sysctl.h>
+ #include <net/route.h>
+ #include <net/ip.h>
++#include <linux/nfcalls.h>
+ 
+ #include <linux/netfilter_ipv4.h>
+ #include <net/netfilter/nf_conntrack.h>
+@@ -417,66 +418,226 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+ MODULE_ALIAS("ip_conntrack");
+ MODULE_LICENSE("GPL");
+ 
+-static int __init nf_conntrack_l3proto_ipv4_init(void)
++#ifdef CONFIG_VE_IPTABLES
++#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
++static int nf_ct_proto_ipv4_sysctl_init(void)
+ {
+-	int ret = 0;
++	struct nf_conntrack_l3proto *ipv4 = ve_nf_conntrack_l3proto_ipv4;
++	struct ctl_table *ct_table;
++	struct net *net = get_exec_env()->ve_netns;
+ 
+-	need_conntrack();
++	ct_table = ip_ct_sysctl_table;
+ 
+-	ret = nf_register_sockopt(&so_getorigdst);
+-	if (ret < 0) {
+-		printk(KERN_ERR "Unable to register netfilter socket option\n");
+-		return ret;
++	if (net != &init_net) {
++		ct_table = kmemdup(ct_table, sizeof(ip_ct_sysctl_table),
++				   GFP_KERNEL);
++		if (!ct_table)
++			return -ENOMEM;
++	}
++
++	ipv4->ctl_table_header = NULL;
++	ipv4->ctl_table_path = nf_net_ipv4_netfilter_sysctl_path;
++	ipv4->ctl_table = ct_table;
++
++	ipv4->ctl_table[0].data = &ve_nf_conntrack_max;
++	ipv4->ctl_table[1].data = &ve_nf_conntrack_count;
++	ipv4->ctl_table[3].data = &ve_nf_conntrack_checksum;
++	ipv4->ctl_table[4].data = &ve_nf_ct_log_invalid;
++
++	return 0;
++}
++
++static void nf_ct_proto_ipv4_sysctl_cleanup(void)
++{
++	struct net *net = get_exec_env()->ve_netns;
++
++	if (net != &init_net) {
++		kfree(ve_nf_conntrack_l3proto_ipv4->ctl_table);
+ 	}
++}
++#else
++static inline int nf_ct_proto_ipv4_sysctl_init(void)
++{
++	return 0;
++}
++static inline void nf_ct_proto_ipv4_sysctl_cleanup(void)
++{
++}
++#endif /* SYSCTL && NF_CONNTRACK_PROC_COMPAT */
++
++/*
++ *  Functions init/fini_nf_ct_l3proto_ipv4 glue together the distributed
++ *  nf_conntrack virtualization efforts. They are called from two places:
++ *
++ *  1) on loading/unloading the nf_conntrack_ipv4 module, from
++ *     nf_conntrack_l3proto_ipv4_init/fini
++ *  2) on VE start/stop, from do_ve_iptables
++ */
++static int nf_ct_proto_ipv4_init(void)
++{
++	struct nf_conntrack_l3proto *ipv4;
++
++	if (ve_is_super(get_exec_env())) {
++		ipv4 = &nf_conntrack_l3proto_ipv4;
++		goto out;
++	}
++	ipv4 = kmemdup(&nf_conntrack_l3proto_ipv4,
++			sizeof(struct nf_conntrack_l3proto), GFP_KERNEL);
++	if (!ipv4)
++		return -ENOMEM;
++out:
++	ve_nf_conntrack_l3proto_ipv4 = ipv4;
++	return 0;
++}
++
++static void nf_ct_proto_ipv4_fini(void)
++{
++	if (!ve_is_super(get_exec_env()))
++		kfree(ve_nf_conntrack_l3proto_ipv4);
++}
++#endif
++
++int init_nf_ct_l3proto_ipv4(void)
++{
++	int ret = -ENOMEM;
++	int do_hooks = ve_is_super(get_exec_env());
++
++#ifdef CONFIG_VE_IPTABLES
++	if (!ve_is_super(get_exec_env())) 
++		__module_get(THIS_MODULE);
++
++	ret = nf_ct_proto_ipv4_init();
++	if (ret < 0)
++		goto err_out;
++	ret = nf_ct_proto_ipv4_sysctl_init();
++	if (ret < 0)
++		goto no_mem_ipv4;
++	ret = nf_ct_proto_tcp_sysctl_init();
++	if (ret < 0)
++		goto no_mem_tcp;
++	ret = nf_ct_proto_udp_sysctl_init();
++	if (ret < 0)
++		goto no_mem_udp;
++	ret = nf_ct_proto_icmp_sysctl_init();
++	if (ret < 0)
++		goto no_mem_icmp;
++#endif /* CONFIG_VE_IPTABLES */
+ 
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp4);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv4: can't register tcp.\n");
+-		goto cleanup_sockopt;
++		goto cleanup_sys;
+ 	}
+ 
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp4);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv4: can't register udp.\n");
+-		goto cleanup_tcp;
++		goto unreg_tcp;
+ 	}
+ 
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmp);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv4: can't register icmp.\n");
+-		goto cleanup_udp;
++		goto unreg_udp;
+ 	}
+ 
+-	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
++	ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv4);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv4: can't register ipv4\n");
+-		goto cleanup_icmp;
++		goto unreg_icmp;
+ 	}
+ 
+-	ret = nf_register_hooks(ipv4_conntrack_ops,
+-				ARRAY_SIZE(ipv4_conntrack_ops));
+-	if (ret < 0) {
+-		printk("nf_conntrack_ipv4: can't register hooks.\n");
+-		goto cleanup_ipv4;
++	if (do_hooks) {
++		ret = nf_register_hooks(ipv4_conntrack_ops,
++					ARRAY_SIZE(ipv4_conntrack_ops));
++		if (ret < 0) {
++			printk("nf_conntrack_ipv4: can't register hooks.\n");
++			goto unreg_ipv4;
++		}
+ 	}
+-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ 	ret = nf_conntrack_ipv4_compat_init();
+ 	if (ret < 0)
+-		goto cleanup_hooks;
+-#endif
++		goto unreg_hooks;
++	return 0;
++
++unreg_hooks:
++	if (do_hooks)
++		nf_unregister_hooks(ipv4_conntrack_ops,
++				    ARRAY_SIZE(ipv4_conntrack_ops));
++unreg_ipv4:
++	nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4);
++unreg_icmp:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp);
++unreg_udp:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4);
++unreg_tcp:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4);
++cleanup_sys:
++#ifdef CONFIG_VE_IPTABLES
++no_mem_icmp:
++	nf_ct_proto_udp_sysctl_cleanup();
++no_mem_udp:
++	nf_ct_proto_tcp_sysctl_cleanup();
++no_mem_tcp:
++	nf_ct_proto_ipv4_sysctl_cleanup();
++no_mem_ipv4:
++	nf_ct_proto_ipv4_fini();
++err_out:
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++#endif /* CONFIG_VE_IPTABLES */
+ 	return ret;
+-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+- cleanup_hooks:
+-	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+-#endif
+- cleanup_ipv4:
+-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+- cleanup_icmp:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+- cleanup_udp:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+- cleanup_tcp:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
++}
++EXPORT_SYMBOL(init_nf_ct_l3proto_ipv4);
++
++void fini_nf_ct_l3proto_ipv4(void)
++{
++	int do_hooks = ve_is_super(get_exec_env());
++
++	nf_conntrack_ipv4_compat_fini();
++	if (do_hooks)
++		nf_unregister_hooks(ipv4_conntrack_ops,
++				    ARRAY_SIZE(ipv4_conntrack_ops));
++
++	nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4);
++
++#ifdef CONFIG_VE_IPTABLES 
++	nf_ct_proto_icmp_sysctl_cleanup();
++	nf_ct_proto_udp_sysctl_cleanup();
++	nf_ct_proto_tcp_sysctl_cleanup();
++	nf_ct_proto_ipv4_sysctl_cleanup();
++	nf_ct_proto_ipv4_fini();
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++#endif /* CONFIG_VE_IPTABLES */
++}
++EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv4);
++
++static int __init nf_conntrack_l3proto_ipv4_init(void)
++{
++	int ret = 0;
++
++	need_conntrack();
++
++	ret = nf_register_sockopt(&so_getorigdst);
++	if (ret < 0) {
++		printk(KERN_ERR "Unable to register netfilter socket option\n");
++		return ret;
++	}
++
++	ret = init_nf_ct_l3proto_ipv4();
++	if (ret < 0) {
++		printk(KERN_ERR "Unable to initialize netfilter protocols\n");
++		goto cleanup_sockopt;
++	}
++	KSYMRESOLVE(init_nf_ct_l3proto_ipv4);
++	KSYMRESOLVE(fini_nf_ct_l3proto_ipv4);
++	KSYMMODRESOLVE(nf_conntrack_ipv4);
++	return ret;
++
+  cleanup_sockopt:
+ 	nf_unregister_sockopt(&so_getorigdst);
+ 	return ret;
+@@ -485,14 +646,12 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
+ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
+ {
+ 	synchronize_net();
+-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+-	nf_conntrack_ipv4_compat_fini();
+-#endif
+-	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
++
++	KSYMMODUNRESOLVE(nf_conntrack_ipv4);
++	KSYMUNRESOLVE(init_nf_ct_l3proto_ipv4);
++	KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv4);
++
++	fini_nf_ct_l3proto_ipv4();
+ 	nf_unregister_sockopt(&so_getorigdst);
+ }
+ 
+diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+index 40a46d4..f73ad01 100644
+--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
++++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+@@ -9,7 +9,9 @@
+  */
+ #include <linux/types.h>
+ #include <linux/proc_fs.h>
++#include <linux/nsproxy.h>
+ #include <linux/seq_file.h>
++#include <linux/sysctl.h>
+ #include <linux/percpu.h>
+ #include <net/net_namespace.h>
+ 
+@@ -44,7 +46,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
+ 	for (st->bucket = 0;
+ 	     st->bucket < nf_conntrack_htable_size;
+ 	     st->bucket++) {
+-		n = rcu_dereference(nf_conntrack_hash[st->bucket].first);
++		n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first);
+ 		if (n)
+ 			return n;
+ 	}
+@@ -60,7 +62,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq,
+ 	while (head == NULL) {
+ 		if (++st->bucket >= nf_conntrack_htable_size)
+ 			return NULL;
+-		head = rcu_dereference(nf_conntrack_hash[st->bucket].first);
++		head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first);
+ 	}
+ 	return head;
+ }
+@@ -193,7 +195,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+ 	struct hlist_node *n;
+ 
+ 	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+-		n = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
++		n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first);
+ 		if (n)
+ 			return n;
+ 	}
+@@ -209,7 +211,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ 	while (head == NULL) {
+ 		if (++st->bucket >= nf_ct_expect_hsize)
+ 			return NULL;
+-		head = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
++		head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first);
+ 	}
+ 	return head;
+ }
+@@ -326,7 +328,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+ 
+ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+ {
+-	unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
++	unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count);
+ 	const struct ip_conntrack_stat *st = v;
+ 
+ 	if (v == SEQ_START_TOKEN) {
+@@ -377,36 +379,91 @@ static const struct file_operations ct_cpu_seq_fops = {
+ 	.release = seq_release,
+ };
+ 
+-int __init nf_conntrack_ipv4_compat_init(void)
++#ifdef CONFIG_VE_IPTABLES
++#define ve_ip_ct_netfilter_table	(get_exec_env()->_nf_conntrack->_ip_ct_netfilter_table)
++#define ve_ip_ct_sysctl_header		(get_exec_env()->_nf_conntrack->_ip_ct_sysctl_header)
++#else
++#define ve_ip_ct_netfilter_table	ip_ct_netfilter_table
++#define ve_ip_ct_sysctl_header		ip_ct_sysctl_header
++#endif
++
++static ctl_table ip_ct_netfilter_table[] = {
++	{
++		.procname	= "ip_conntrack_max",
++		.data		= &nf_conntrack_max,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{}
++};
++
++static struct ctl_path ip_ct_net_table_path[] = {
++	{ .procname = "net", .ctl_name = CTL_NET, },
++	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
++	{},
++};
++
++int nf_conntrack_ipv4_compat_init(void)
+ {
++	struct net *net = get_exec_env()->ve_netns;
+ 	struct proc_dir_entry *proc, *proc_exp, *proc_stat;
++	ctl_table *table;
+ 
+-	proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops);
++	proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
+ 	if (!proc)
+ 		goto err1;
+ 
+-	proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440,
++	proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
+ 					&ip_exp_file_ops);
+ 	if (!proc_exp)
+ 		goto err2;
+ 
+ 	proc_stat = proc_create("ip_conntrack", S_IRUGO,
+-				init_net.proc_net_stat, &ct_cpu_seq_fops);
++				net->proc_net_stat, &ct_cpu_seq_fops);
+ 	if (!proc_stat)
+ 		goto err3;
++
++	table = ip_ct_netfilter_table;
++	if (net != &init_net) {
++		 table = kmemdup(table,
++		table = kmemdup(table,
++				sizeof(ip_ct_netfilter_table),
++				GFP_KERNEL);
++			goto err4;
++	}
++
++	table[0].data = &ve_nf_conntrack_max;
++	ve_ip_ct_sysctl_header = register_net_sysctl_table(net,
++							   ip_ct_net_table_path,
++							   table);
++	if (!ve_ip_ct_sysctl_header)
++		goto err5;
++
+ 	return 0;
+ 
++err5:
++	if (net != &init_net)
++		kfree(table);
++err4:
++	remove_proc_entry("ip_conntrack", net->proc_net_stat);
+ err3:
+-	proc_net_remove(&init_net, "ip_conntrack_expect");
++	proc_net_remove(net, "ip_conntrack_expect");
+ err2:
+-	proc_net_remove(&init_net, "ip_conntrack");
++	proc_net_remove(net, "ip_conntrack");
+ err1:
+ 	return -ENOMEM;
+ }
+ 
+-void __exit nf_conntrack_ipv4_compat_fini(void)
++void nf_conntrack_ipv4_compat_fini(void)
+ {
+-	remove_proc_entry("ip_conntrack", init_net.proc_net_stat);
+-	proc_net_remove(&init_net, "ip_conntrack_expect");
+-	proc_net_remove(&init_net, "ip_conntrack");
++	struct net *net = get_exec_env()->ve_netns;
++	struct ctl_table *table = ve_ip_ct_sysctl_header->ctl_table_arg;
++
++	unregister_net_sysctl_table(ve_ip_ct_sysctl_header);
++	if (net != &init_net)
++		kfree(table);
++	remove_proc_entry("ip_conntrack", net->proc_net_stat);
++	proc_net_remove(net, "ip_conntrack_expect");
++	proc_net_remove(net, "ip_conntrack");
+ }
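
The compat init above introduces a shape that recurs through the rest of the patch: keep one static ctl_table as a template, kmemdup() it for any namespace other than init_net so every container owns a private copy, repoint .data at the per-VE storage, and register the copy under a fixed ctl_path, freeing the copy again on the error path and in the fini routine. A minimal sketch of that idiom, with hypothetical names (my_value, my_table, my_path) standing in for the real ones:

/* Sketch of the per-namespace sysctl idiom; names are illustrative. */
static int my_value;				/* VE0 default storage */

static ctl_table my_table[] = {
	{
		.procname	= "my_value",
		.data		= &my_value,	/* repointed per VE below */
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};

static struct ctl_path my_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{},
};

static int my_sysctl_init(struct net *net, int *ve_value,
			  struct ctl_table_header **hdr)
{
	ctl_table *table = my_table;

	if (net != &init_net) {
		table = kmemdup(my_table, sizeof(my_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;	/* template left untouched */
	}
	table[0].data = ve_value;		/* this container's storage */

	*hdr = register_net_sysctl_table(net, my_path, table);
	if (!*hdr) {
		if (net != &init_net)
			kfree(table);	/* undo the private copy */
		return -ENOMEM;
	}
	return 0;
}
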
+diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+index 78ab19a..f510c45 100644
+--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
++++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+@@ -7,6 +7,7 @@
+  */
+ 
+ #include <linux/types.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+ #include <linux/netfilter.h>
+ #include <linux/in.h>
+@@ -20,7 +21,7 @@
+ #include <net/netfilter/nf_conntrack_core.h>
+ #include <net/netfilter/nf_log.h>
+ 
+-static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ;
++unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ;
+ 
+ static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ 			      struct nf_conntrack_tuple *tuple)
+@@ -93,7 +94,7 @@ static int icmp_packet(struct nf_conn *ct,
+ 	} else {
+ 		atomic_inc(&ct->proto.icmp.count);
+ 		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+-		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
++		nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmp_timeout);
+ 	}
+ 
+ 	return NF_ACCEPT;
+@@ -149,7 +150,7 @@ icmp_error_message(struct sk_buff *skb,
+ 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+ 	   been preserved inside the ICMP. */
+ 	if (!nf_ct_invert_tuple(&innertuple, &origtuple,
+-				&nf_conntrack_l3proto_ipv4, innerproto)) {
++				ve_nf_conntrack_l3proto_ipv4, innerproto)) {
+ 		pr_debug("icmp_error_message: no match\n");
+ 		return -NF_ACCEPT;
+ 	}
+@@ -321,3 +322,64 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+ #endif
+ #endif
+ };
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++int nf_ct_proto_icmp_sysctl_init(void)
++{
++	struct nf_conntrack_l4proto *icmp;
++
++	if (ve_is_super(get_exec_env())) {
++		icmp = &nf_conntrack_l4proto_icmp;
++		goto out;
++	}
++
++	icmp = kmemdup(&nf_conntrack_l4proto_icmp,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (!icmp)
++		goto no_mem_ct;
++
++	icmp->ctl_table_header = &ve_icmp_sysctl_header;
++	icmp->ctl_table = kmemdup(icmp_sysctl_table,
++			sizeof(icmp_sysctl_table), GFP_KERNEL);
++	if (icmp->ctl_table == NULL)
++		goto no_mem_sys;
++	icmp->ctl_table[0].data = &ve_nf_ct_icmp_timeout;
++
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	icmp->ctl_compat_table_header = ve_icmp_compat_sysctl_header;
++	icmp->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
++					 sizeof(icmp_compat_sysctl_table),
++					 GFP_KERNEL);
++	if (icmp->ctl_compat_table == NULL)
++		goto no_mem_compat;
++	icmp->ctl_compat_table[0].data = &ve_nf_ct_icmp_timeout;
++#endif
++out:
++	ve_nf_ct_icmp_timeout = nf_ct_icmp_timeout;
++
++	ve_nf_conntrack_l4proto_icmp = icmp;
++	return 0;
++
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++no_mem_compat:
++	kfree(icmp->ctl_table);
++#endif
++no_mem_sys:
++	kfree(icmp);
++no_mem_ct:
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_init);
++
++void nf_ct_proto_icmp_sysctl_cleanup(void)
++{
++	if (!ve_is_super(get_exec_env())) {
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++		kfree(ve_nf_conntrack_l4proto_icmp->ctl_compat_table);
++#endif
++		kfree(ve_nf_conntrack_l4proto_icmp->ctl_table);
++		kfree(ve_nf_conntrack_l4proto_icmp);
++	}
++}
++EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_cleanup);
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
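
Dropping the static qualifier from nf_ct_icmp_timeout is what allows the ve_nf_ct_icmp_timeout accessor to fall back to the global symbol in !CONFIG_VE_IPTABLES builds, and lets each per-VE copy inherit the host default at init time (the ve_nf_ct_icmp_timeout = nf_ct_icmp_timeout assignment above). The accessor itself lives in VE headers outside this hunk; by analogy with the ve_ip_ct_* macros earlier in the patch, it presumably has this shape:

/* Presumed definition, by analogy with the ve_ip_ct_* macros above;
 * the authoritative version is in the (unshown) VE headers. */
#ifdef CONFIG_VE_IPTABLES
#define ve_nf_ct_icmp_timeout \
	(get_exec_env()->_nf_conntrack->_nf_ct_icmp_timeout)
#else
#define ve_nf_ct_icmp_timeout	nf_ct_icmp_timeout
#endif
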
+diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
+index d2a887f..f7f832b 100644
+--- a/net/ipv4/netfilter/nf_nat_core.c
++++ b/net/ipv4/netfilter/nf_nat_core.c
+@@ -19,6 +19,8 @@
+ #include <linux/icmp.h>
+ #include <linux/udp.h>
+ #include <linux/jhash.h>
++#include <linux/nfcalls.h>
++#include <bc/kmem.h>
+ 
+ #include <linux/netfilter_ipv4.h>
+ #include <net/netfilter/nf_conntrack.h>
+@@ -33,22 +35,34 @@
+ 
+ static DEFINE_SPINLOCK(nf_nat_lock);
+ 
+-static struct nf_conntrack_l3proto *l3proto __read_mostly;
+ 
+ /* Calculated at init based on memory size */
+ static unsigned int nf_nat_htable_size __read_mostly;
+-static int nf_nat_vmalloced;
+ 
++#define MAX_IP_NAT_PROTO 256
++
++#ifdef CONFIG_VE_IPTABLES
++#define ve_nf_nat_protos	(get_exec_env()->_nf_conntrack->_nf_nat_protos)
++#define ve_nf_nat_l3proto	(get_exec_env()->_nf_conntrack->_nf_nat_l3proto)
++#define ve_bysource		(get_exec_env()->_nf_conntrack->_bysource)
++#define ve_nf_nat_vmalloced	(get_exec_env()->_nf_conntrack->_nf_nat_vmalloced)
++#else
++static struct nf_conntrack_l3proto *l3proto __read_mostly;
++static int nf_nat_vmalloced;
+ static struct hlist_head *bysource __read_mostly;
+ 
+-#define MAX_IP_NAT_PROTO 256
+ static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+ 						__read_mostly;
++#define ve_nf_nat_protos	nf_nat_protos
++#define ve_nf_nat_l3proto	l3proto
++#define ve_bysource		bysource
++#define ve_nf_nat_vmalloced	nf_nat_vmalloced
++#endif
+ 
+ static inline const struct nf_nat_protocol *
+ __nf_nat_proto_find(u_int8_t protonum)
+ {
+-	return rcu_dereference(nf_nat_protos[protonum]);
++	return rcu_dereference(ve_nf_nat_protos[protonum]);
+ }
+ 
+ const struct nf_nat_protocol *
+@@ -155,7 +169,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple,
+ 	const struct hlist_node *n;
+ 
+ 	rcu_read_lock();
+-	hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) {
++	hlist_for_each_entry_rcu(nat, n, &ve_bysource[h], bysource) {
+ 		ct = nat->ct;
+ 		if (same_src(ct, tuple)) {
+ 			/* Copy source part from reply tuple. */
+@@ -278,6 +292,22 @@ out:
+ 	rcu_read_unlock();
+ }
+ 
++void nf_nat_hash_conntrack(struct nf_conn *ct)
++{
++	struct nf_conn_nat *nat;
++	unsigned int srchash;
++
++	srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
++	spin_lock_bh(&nf_nat_lock);
++	/* nf_conntrack_alter_reply might re-allocate the extension area */
++	nat = nfct_nat(ct);
++	nat->ct = ct;
++	hlist_add_head_rcu(&nat->bysource, &ve_bysource[srchash]);
++	spin_unlock_bh(&nf_nat_lock);
++
++}
++EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack);
++
+ unsigned int
+ nf_nat_setup_info(struct nf_conn *ct,
+ 		  const struct nf_nat_range *range,
+@@ -326,17 +356,8 @@ nf_nat_setup_info(struct nf_conn *ct,
+ 	}
+ 
+ 	/* Place in source hash if this is the first time. */
+-	if (have_to_hash) {
+-		unsigned int srchash;
+-
+-		srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+-		spin_lock_bh(&nf_nat_lock);
+-		/* nf_conntrack_alter_reply might re-allocate exntension aera */
+-		nat = nfct_nat(ct);
+-		nat->ct = ct;
+-		hlist_add_head_rcu(&nat->bysource, &bysource[srchash]);
+-		spin_unlock_bh(&nf_nat_lock);
+-	}
++	if (have_to_hash)
++		nf_nat_hash_conntrack(ct);
+ 
+ 	/* It's done. */
+ 	if (maniptype == IP_NAT_MANIP_DST)
+@@ -426,7 +447,6 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
+ 		struct icmphdr icmp;
+ 		struct iphdr ip;
+ 	} *inside;
+-	const struct nf_conntrack_l4proto *l4proto;
+ 	struct nf_conntrack_tuple inner, target;
+ 	int hdrlen = ip_hdrlen(skb);
+ 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+@@ -463,16 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
+ 		 "dir %s\n", skb, manip,
+ 		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+ 
+-	/* rcu_read_lock()ed by nf_hook_slow */
+-	l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
+-
+ 	if (!nf_ct_get_tuple(skb,
+ 			     ip_hdrlen(skb) + sizeof(struct icmphdr),
+ 			     (ip_hdrlen(skb) +
+ 			      sizeof(struct icmphdr) + inside->ip.ihl * 4),
+ 			     (u_int16_t)AF_INET,
+ 			     inside->ip.protocol,
+-			     &inner, l3proto, l4proto))
++			     &inner, ve_nf_nat_l3proto,
++			     __nf_ct_l4proto_find(PF_INET, inside->ip.protocol)))
+ 		return 0;
+ 
+ 	/* Change inner back to look like incoming packet.  We do the
+@@ -522,11 +540,11 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
+ 	int ret = 0;
+ 
+ 	spin_lock_bh(&nf_nat_lock);
+-	if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
++	if (ve_nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
+ 		ret = -EBUSY;
+ 		goto out;
+ 	}
+-	rcu_assign_pointer(nf_nat_protos[proto->protonum], proto);
++	rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], proto);
+  out:
+ 	spin_unlock_bh(&nf_nat_lock);
+ 	return ret;
+@@ -537,7 +555,7 @@ EXPORT_SYMBOL(nf_nat_protocol_register);
+ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
+ {
+ 	spin_lock_bh(&nf_nat_lock);
+-	rcu_assign_pointer(nf_nat_protos[proto->protonum],
++	rcu_assign_pointer(ve_nf_nat_protos[proto->protonum],
+ 			   &nf_nat_unknown_protocol);
+ 	spin_unlock_bh(&nf_nat_lock);
+ 	synchronize_rcu();
+@@ -583,47 +601,62 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
+ 	.flags		= NF_CT_EXT_F_PREALLOC,
+ };
+ 
+-static int __init nf_nat_init(void)
++int nf_nat_init(void)
+ {
+ 	size_t i;
+ 	int ret;
+ 
+ 	need_ipv4_conntrack();
+ 
+-	ret = nf_ct_extend_register(&nat_extend);
+-	if (ret < 0) {
+-		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+-		return ret;
++	if (ve_is_super(get_exec_env())) {
++		ret = nf_ct_extend_register(&nat_extend);
++		if (ret < 0) {
++			printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
++			return ret;
++		}
+ 	}
+ 
+ 	/* Leave them the same for the moment. */
+ 	nf_nat_htable_size = nf_conntrack_htable_size;
+ 
+-	bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
+-					 &nf_nat_vmalloced);
+-	if (!bysource) {
++	ve_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
++					 &ve_nf_nat_vmalloced);
++	if (!ve_bysource) {
+ 		ret = -ENOMEM;
+ 		goto cleanup_extend;
+ 	}
+ 
++#ifdef CONFIG_VE_IPTABLES
++	ve_nf_nat_protos = kcalloc(MAX_IP_NAT_PROTO, sizeof(void *), GFP_KERNEL);
++	if (!ve_nf_nat_protos) {
++		ret = -ENOMEM;
++		goto cleanup_hash;
++	}
++#endif
+ 	/* Sew in builtin protocols. */
+ 	spin_lock_bh(&nf_nat_lock);
+ 	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+-		rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol);
+-	rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
+-	rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
+-	rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
++		rcu_assign_pointer(ve_nf_nat_protos[i], &nf_nat_unknown_protocol);
++	rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
++	rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
++	rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
+ 	spin_unlock_bh(&nf_nat_lock);
+ 
+-	/* Initialize fake conntrack so that NAT will skip it */
+-	nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
++	if (ve_is_super(get_exec_env())) {
++		/* Initialize fake conntrack so that NAT will skip it */
++		nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
++	}
+ 
+-	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
++	ve_nf_nat_l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
+ 
+ 	BUG_ON(nf_nat_seq_adjust_hook != NULL);
+ 	rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
+ 	return 0;
+ 
++#ifdef CONFIG_VE_IPTABLES
++cleanup_hash:
++#endif
++	nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size);
+  cleanup_extend:
+ 	nf_ct_extend_unregister(&nat_extend);
+ 	return ret;
+@@ -641,18 +674,45 @@ static int clean_nat(struct nf_conn *i, void *data)
+ 	return 0;
+ }
+ 
+-static void __exit nf_nat_cleanup(void)
++void nf_nat_cleanup(void)
+ {
+ 	nf_ct_iterate_cleanup(&clean_nat, NULL);
+ 	synchronize_rcu();
+-	nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size);
+-	nf_ct_l3proto_put(l3proto);
+-	nf_ct_extend_unregister(&nat_extend);
++	nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size);
++	nf_ct_l3proto_put(ve_nf_nat_l3proto);
++#ifdef CONFIG_VE_IPTABLES
++	kfree(ve_nf_nat_protos);
++#endif
++	if (ve_is_super(get_exec_env()))
++		nf_ct_extend_unregister(&nat_extend);
+ 	rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
+ 	synchronize_net();
+ }
+ 
++static int __init init(void)
++{
++	int rv;
++
++	rv = nf_nat_init();
++	if (rv < 0)
++		return rv;
++
++	KSYMRESOLVE(nf_nat_init);
++	KSYMRESOLVE(nf_nat_cleanup);
++	KSYMMODRESOLVE(nf_nat);
++	return 0;
++}
++
++static void __exit fini(void)
++{
++	KSYMMODUNRESOLVE(nf_nat);
++	KSYMUNRESOLVE(nf_nat_cleanup);
++	KSYMUNRESOLVE(nf_nat_init);
++
++	nf_nat_cleanup();
++}
++
+ MODULE_LICENSE("GPL");
+ 
+-module_init(nf_nat_init);
+-module_exit(nf_nat_cleanup);
++module_init(init);
++module_exit(fini);
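
The ve_* macros in nf_nat_core.c and nf_nat_rule.c spell out exactly which NAT state becomes per-container. Collected in one place, the _nf_conntrack structure they dereference must carry at least the fields below; the field names are taken verbatim from the macros, while the struct name and layout are an illustrative reconstruction (the real definition is in the VE headers, not in this patch excerpt):

/* Per-VE NAT state implied by the macros above; only the fields
 * referenced in these two files are listed. */
struct ve_nf_conntrack_nat {			/* name is hypothetical */
	const struct nf_nat_protocol	**_nf_nat_protos; /* MAX_IP_NAT_PROTO slots */
	struct nf_conntrack_l3proto	*_nf_nat_l3proto;
	struct hlist_head		*_bysource;	  /* per-VE source hash */
	int				_nf_nat_vmalloced;
	struct xt_table			*_nf_nat_table;	  /* see nf_nat_rule.c below */
};
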
+diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
+index e8b4d0d..f301178 100644
+--- a/net/ipv4/netfilter/nf_nat_rule.c
++++ b/net/ipv4/netfilter/nf_nat_rule.c
+@@ -15,6 +15,7 @@
+ #include <linux/kmod.h>
+ #include <linux/skbuff.h>
+ #include <linux/proc_fs.h>
++#include <linux/nsproxy.h>
+ #include <net/checksum.h>
+ #include <net/route.h>
+ #include <linux/bitops.h>
+@@ -33,7 +34,7 @@ static struct
+ 	struct ipt_replace repl;
+ 	struct ipt_standard entries[3];
+ 	struct ipt_error term;
+-} nat_initial_table __initdata = {
++} nat_initial_table = {
+ 	.repl = {
+ 		.name = "nat",
+ 		.valid_hooks = NAT_VALID_HOOKS,
+@@ -65,7 +66,12 @@ static struct xt_table __nat_table = {
+ 	.me		= THIS_MODULE,
+ 	.af		= AF_INET,
+ };
++#ifdef CONFIG_VE_IPTABLES
++#define nat_table			\
++	(get_exec_env()->_nf_conntrack->_nf_nat_table)
++#else
+ static struct xt_table *nat_table;
++#endif
+ 
+ /* Source NAT */
+ static unsigned int ipt_snat_target(struct sk_buff *skb,
+@@ -226,14 +232,20 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
+ 	.family		= AF_INET,
+ };
+ 
+-int __init nf_nat_rule_init(void)
++int nf_nat_rule_init(void)
+ {
+ 	int ret;
++	struct net *net = get_exec_env()->ve_netns;
+ 
+-	nat_table = ipt_register_table(&init_net, &__nat_table,
++	nat_table = ipt_register_table(net, &__nat_table,
+ 				       &nat_initial_table.repl);
+ 	if (IS_ERR(nat_table))
+ 		return PTR_ERR(nat_table);
++
++	ret = 0;
++	if (!ve_is_super(get_exec_env()))
++		goto done;
++
+ 	ret = xt_register_target(&ipt_snat_reg);
+ 	if (ret != 0)
+ 		goto unregister_table;
+@@ -242,19 +254,26 @@ int __init nf_nat_rule_init(void)
+ 	if (ret != 0)
+ 		goto unregister_snat;
+ 
++done:
+ 	return ret;
+ 
+  unregister_snat:
+ 	xt_unregister_target(&ipt_snat_reg);
+  unregister_table:
+ 	ipt_unregister_table(nat_table);
++	nat_table = NULL;
+ 
+ 	return ret;
+ }
+ 
+ void nf_nat_rule_cleanup(void)
+ {
++	if (!ve_is_super(get_exec_env()))
++		goto skip;
++
+ 	xt_unregister_target(&ipt_dnat_reg);
+ 	xt_unregister_target(&ipt_snat_reg);
++skip:
+ 	ipt_unregister_table(nat_table);
++	nat_table = NULL;
+ }
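
nf_nat_rule_init() shows the division of labour the patch applies throughout netfilter: xt targets such as ipt_snat_reg and ipt_dnat_reg are process-global and registered exactly once by the host (VE0), while the nat table itself is instantiated per container. Reduced to its control flow, with hypothetical helper names, the split-init skeleton reads:

/* Control-flow skeleton of a split init; helpers are hypothetical. */
int my_feature_init(void)
{
	int err;

	err = register_per_ve_state();		/* every container */
	if (err)
		return err;

	if (!ve_is_super(get_exec_env()))
		return 0;			/* guests stop here */

	err = register_global_state();		/* host (VE0) only */
	if (err)
		unregister_per_ve_state();	/* roll back on failure */
	return err;
}
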
+diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
+index b7dd695..9aec464 100644
+--- a/net/ipv4/netfilter/nf_nat_standalone.c
++++ b/net/ipv4/netfilter/nf_nat_standalone.c
+@@ -16,6 +16,7 @@
+ #include <net/ip.h>
+ #include <net/checksum.h>
+ #include <linux/spinlock.h>
++#include <linux/nfcalls.h>
+ 
+ #include <net/netfilter/nf_conntrack.h>
+ #include <net/netfilter/nf_conntrack_core.h>
+@@ -282,30 +283,64 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
+ 	},
+ };
+ 
+-static int __init nf_nat_standalone_init(void)
++int init_nftable_nat(void)
+ {
+-	int ret = 0;
++	int ret;
+ 
+-	need_ipv4_conntrack();
++	if (!ve_is_super(get_exec_env()))
++		__module_get(THIS_MODULE);
+ 
+-#ifdef CONFIG_XFRM
+-	BUG_ON(ip_nat_decode_session != NULL);
+-	rcu_assign_pointer(ip_nat_decode_session, nat_decode_session);
+-#endif
+ 	ret = nf_nat_rule_init();
+ 	if (ret < 0) {
+ 		printk("nf_nat_init: can't setup rules.\n");
+-		goto cleanup_decode_session;
++		goto out_modput;
+ 	}
+ 	ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
+ 	if (ret < 0) {
+ 		printk("nf_nat_init: can't register hooks.\n");
+ 		goto cleanup_rule_init;
+ 	}
++	return 0;
++
++cleanup_rule_init:
++	nf_nat_rule_cleanup();
++out_modput:
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
+ 	return ret;
++}
+ 
+- cleanup_rule_init:
++void fini_nftable_nat(void)
++{
++	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
+ 	nf_nat_rule_cleanup();
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++}
++
++static int __init nf_nat_standalone_init(void)
++{
++	int ret = 0;
++
++	need_ipv4_conntrack();
++
++#ifdef CONFIG_XFRM
++	BUG_ON(ip_nat_decode_session != NULL);
++	rcu_assign_pointer(ip_nat_decode_session, nat_decode_session);
++#endif
++
++	if (!ip_conntrack_disable_ve0) {
++		ret = init_nftable_nat();
++		if (ret < 0)
++			goto cleanup_decode_session;
++	}
++
++	KSYMRESOLVE(init_nftable_nat);
++	KSYMRESOLVE(fini_nftable_nat);
++	KSYMMODRESOLVE(iptable_nat);
++
++	return ret;
++
+  cleanup_decode_session:
+ #ifdef CONFIG_XFRM
+ 	rcu_assign_pointer(ip_nat_decode_session, NULL);
+@@ -316,8 +351,12 @@ static int __init nf_nat_standalone_init(void)
+ 
+ static void __exit nf_nat_standalone_fini(void)
+ {
+-	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
+-	nf_nat_rule_cleanup();
++	KSYMMODUNRESOLVE(iptable_nat);
++	KSYMUNRESOLVE(init_nftable_nat);
++	KSYMUNRESOLVE(fini_nftable_nat);
++
++	if (!ip_conntrack_disable_ve0)
++		fini_nftable_nat();
+ #ifdef CONFIG_XFRM
+ 	rcu_assign_pointer(ip_nat_decode_session, NULL);
+ 	synchronize_net();
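
The __module_get()/module_put() pair in init_nftable_nat() ties module lifetime to container lifetime: a guest that instantiates NAT pins the module, so rmmod cannot remove the code while any container still uses it, and the pin is dropped both in fini_nftable_nat() and on the failure path. The idiom in isolation, with a hypothetical setup helper:

/* Guest-side module pinning, as wired into init_nftable_nat() above. */
int my_per_ve_init(void)
{
	int err;

	if (!ve_is_super(get_exec_env()))
		__module_get(THIS_MODULE);	/* container pins the module */

	err = do_per_ve_setup();		/* hypothetical helper */
	if (err && !ve_is_super(get_exec_env()))
		module_put(THIS_MODULE);	/* undo the pin on failure */
	return err;
}
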
+diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
+index 552169b..bf8e34e 100644
+--- a/net/ipv4/proc.c
++++ b/net/ipv4/proc.c
+@@ -53,6 +53,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
+ {
+ 	struct net *net = seq->private;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return 0;
++
+ 	socket_seq_show(seq);
+ 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+ 		   sock_prot_inuse_get(net, &tcp_prot),
+@@ -272,7 +275,7 @@ static void icmpmsg_put(struct seq_file *seq)
+ 	count = 0;
+ 	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+ 
+-		if (snmp_fold_field((void **) icmpmsg_statistics, i))
++		if (snmp_fold_field((void **) ve_icmpmsg_statistics, i))
+ 			out[count++] = i;
+ 		if (count < PERLINE)
+ 			continue;
+@@ -284,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq)
+ 		seq_printf(seq, "\nIcmpMsg: ");
+ 		for (j = 0; j < PERLINE; ++j)
+ 			seq_printf(seq, " %lu",
+-				snmp_fold_field((void **) icmpmsg_statistics,
++				snmp_fold_field((void **) ve_icmpmsg_statistics,
+ 				out[j]));
+ 		seq_putc(seq, '\n');
+ 	}
+@@ -296,7 +299,7 @@ static void icmpmsg_put(struct seq_file *seq)
+ 		seq_printf(seq, "\nIcmpMsg:");
+ 		for (j = 0; j < count; ++j)
+ 			seq_printf(seq, " %lu", snmp_fold_field((void **)
+-				icmpmsg_statistics, out[j]));
++				ve_icmpmsg_statistics, out[j]));
+ 	}
+ 
+ #undef PERLINE
+@@ -313,18 +316,18 @@ static void icmp_put(struct seq_file *seq)
+ 	for (i=0; icmpmibmap[i].name != NULL; i++)
+ 		seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ 	seq_printf(seq, "\nIcmp: %lu %lu",
+-		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS),
+-		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS));
++		snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INMSGS),
++		snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INERRORS));
+ 	for (i=0; icmpmibmap[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			snmp_fold_field((void **) icmpmsg_statistics,
++			snmp_fold_field((void **) ve_icmpmsg_statistics,
+ 				icmpmibmap[i].index));
+ 	seq_printf(seq, " %lu %lu",
+-		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS),
+-		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS));
++		snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTMSGS),
++		snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTERRORS));
+ 	for (i=0; icmpmibmap[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			snmp_fold_field((void **) icmpmsg_statistics,
++			snmp_fold_field((void **) ve_icmpmsg_statistics,
+ 				icmpmibmap[i].index | 0x100));
+ }
+ 
+@@ -346,7 +349,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
+ 
+ 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			   snmp_fold_field((void **)ip_statistics,
++			   snmp_fold_field((void **)ve_ip_statistics,
+ 					   snmp4_ipstats_list[i].entry));
+ 
+ 	icmp_put(seq);	/* RFC 2011 compatibility */
+@@ -361,11 +364,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
+ 		/* MaxConn field is signed, RFC 2012 */
+ 		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+ 			seq_printf(seq, " %ld",
+-				   snmp_fold_field((void **)tcp_statistics,
++				   snmp_fold_field((void **)ve_tcp_statistics,
+ 						   snmp4_tcp_list[i].entry));
+ 		else
+ 			seq_printf(seq, " %lu",
+-				   snmp_fold_field((void **)tcp_statistics,
++				   snmp_fold_field((void **)ve_tcp_statistics,
+ 						   snmp4_tcp_list[i].entry));
+ 	}
+ 
+@@ -376,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
+ 	seq_puts(seq, "\nUdp:");
+ 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			   snmp_fold_field((void **)udp_statistics,
++			   snmp_fold_field((void **)ve_udp_statistics,
+ 					   snmp4_udp_list[i].entry));
+ 
+ 	/* the UDP and UDP-Lite MIBs are the same */
+@@ -387,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
+ 	seq_puts(seq, "\nUdpLite:");
+ 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			   snmp_fold_field((void **)udplite_statistics,
++			   snmp_fold_field((void **)ve_udplite_statistics,
+ 					   snmp4_udp_list[i].entry));
+ 
+ 	seq_putc(seq, '\n');
+@@ -423,7 +426,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
+ 	seq_puts(seq, "\nTcpExt:");
+ 	for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			   snmp_fold_field((void **)net_statistics,
++			   snmp_fold_field((void **)ve_net_statistics,
+ 					   snmp4_net_list[i].entry));
+ 
+ 	seq_puts(seq, "\nIpExt:");
+@@ -433,7 +436,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
+ 	seq_puts(seq, "\nIpExt:");
+ 	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ 		seq_printf(seq, " %lu",
+-			   snmp_fold_field((void **)ip_statistics,
++			   snmp_fold_field((void **)ve_ip_statistics,
+ 					   snmp4_ipextstats_list[i].entry));
+ 
+ 	seq_putc(seq, '\n');
+@@ -456,13 +459,26 @@ static const struct file_operations netstat_seq_fops = {
+ static __net_init int ip_proc_init_net(struct net *net)
+ {
+ 	if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops))
+-		return -ENOMEM;
++		goto out;
++	if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops))
++		goto out_netstat;
++	if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops))
++		goto out_snmp;
+ 	return 0;
++
++out_snmp:
++	proc_net_remove(net, "netstat");
++out_netstat:
++	proc_net_remove(net, "sockstat");
++out:
++	return -ENOMEM;
+ }
+ 
+ static __net_exit void ip_proc_exit_net(struct net *net)
+ {
+ 	proc_net_remove(net, "sockstat");
++	proc_net_remove(net, "netstat");
++	proc_net_remove(net, "snmp");
+ }
+ 
+ static __net_initdata struct pernet_operations ip_proc_ops = {
+@@ -472,24 +488,6 @@ static __net_initdata struct pernet_operations ip_proc_ops = {
+ 
+ int __init ip_misc_proc_init(void)
+ {
+-	int rc = 0;
+-
+-	if (register_pernet_subsys(&ip_proc_ops))
+-		goto out_pernet;
+-
+-	if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops))
+-		goto out_netstat;
+-
+-	if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops))
+-		goto out_snmp;
+-out:
+-	return rc;
+-out_snmp:
+-	proc_net_remove(&init_net, "netstat");
+-out_netstat:
+-	unregister_pernet_subsys(&ip_proc_ops);
+-out_pernet:
+-	rc = -ENOMEM;
+-	goto out;
++	return register_pernet_subsys(&ip_proc_ops);
+ }
+ 
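
The reworked ip_proc_init_net() uses the kernel's stacked-goto unwind: each successful step gains a label that undoes it, and a failing step jumps to the label that rolls back everything acquired so far, in reverse order. Generalized with hypothetical resources:

/* Stacked-goto error unwinding, the shape used in ip_proc_init_net(). */
int my_init(void)
{
	if (!acquire_a())		/* hypothetical steps a, b, c */
		goto out;
	if (!acquire_b())
		goto out_a;
	if (!acquire_c())
		goto out_b;
	return 0;

out_b:
	release_b();			/* undo in reverse order */
out_a:
	release_a();
out:
	return -ENOMEM;
}
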
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 96be336..d032f59 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -71,6 +71,7 @@
+ #include <linux/types.h>
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
++#include <linux/nsproxy.h>
+ #include <linux/bootmem.h>
+ #include <linux/string.h>
+ #include <linux/socket.h>
+@@ -117,6 +118,7 @@
+ 
+ #define RT_GC_TIMEOUT (300*HZ)
+ 
++int ip_rt_src_check		= 1;
+ static int ip_rt_max_size;
+ static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
+ static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
+@@ -134,7 +136,6 @@ static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
+ 
+ static void rt_worker_func(struct work_struct *work);
+ static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
+-static struct timer_list rt_secret_timer;
+ 
+ /*
+  *	Interface to generic destination cache.
+@@ -253,20 +254,41 @@ static inline void rt_hash_lock_init(void)
+ static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
+ static unsigned			rt_hash_mask __read_mostly;
+ static unsigned int		rt_hash_log  __read_mostly;
+-static atomic_t			rt_genid __read_mostly;
+ 
+ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+ #define RT_CACHE_STAT_INC(field) \
+ 	(__raw_get_cpu_var(rt_cache_stat).field++)
+ 
+-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
++static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
++		int genid)
+ {
+ 	return jhash_3words((__force u32)(__be32)(daddr),
+ 			    (__force u32)(__be32)(saddr),
+-			    idx, atomic_read(&rt_genid))
++			    idx, genid)
+ 		& rt_hash_mask;
+ }
+ 
++void prepare_rt_cache(void)
++{
++#ifdef CONFIG_VE
++	struct rtable *r;
++	int i;
++
++	for (i = rt_hash_mask; i >= 0; i--) {
++		spin_lock_bh(rt_hash_lock_addr(i));
++		for (r = rt_hash_table[i].chain; r; r = r->u.dst.rt_next) {
++			r->fl.owner_env = get_ve0();
++		}
++		spin_unlock_bh(rt_hash_lock_addr(i));
++	}
++#endif
++}
++
++static inline int rt_genid(struct net *net)
++{
++	return atomic_read(&net->ipv4.rt_genid);
++}
++
+ #ifdef CONFIG_PROC_FS
+ struct rt_cache_iter_state {
+ 	struct seq_net_private p;
+@@ -336,7 +358,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+ 	struct rt_cache_iter_state *st = seq->private;
+ 	if (*pos)
+ 		return rt_cache_get_idx(seq, *pos - 1);
+-	st->genid = atomic_read(&rt_genid);
++	st->genid = rt_genid(seq_file_net(seq));
+ 	return SEQ_START_TOKEN;
+ }
+ 
+@@ -683,6 +705,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
+ 	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
+ }
+ 
++static inline int rt_is_expired(struct rtable *rth)
++{
++	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
++}
++
+ /*
+  * Perform a full scan of hash table and free all entries.
+  * Can be called by a softirq or a process.
+@@ -692,6 +719,7 @@ static void rt_do_flush(int process_context)
+ {
+ 	unsigned int i;
+ 	struct rtable *rth, *next;
++	struct rtable *tail;
+ 
+ 	for (i = 0; i <= rt_hash_mask; i++) {
+ 		if (process_context && need_resched())
+@@ -701,11 +729,40 @@ static void rt_do_flush(int process_context)
+ 			continue;
+ 
+ 		spin_lock_bh(rt_hash_lock_addr(i));
++#ifdef CONFIG_NET_NS
++		{
++		struct rtable ** prev, * p;
++
++		rth = rt_hash_table[i].chain;
++
++		/* defer releasing the head of the list until after spin_unlock */
++		for (tail = rth; tail; tail = tail->u.dst.rt_next)
++			if (!rt_is_expired(tail))
++				break;
++		if (rth != tail)
++			rt_hash_table[i].chain = tail;
++
++		/* call rt_free on entries after the tail requiring flush */
++		prev = &rt_hash_table[i].chain;
++		for (p = *prev; p; p = next) {
++			next = p->u.dst.rt_next;
++			if (!rt_is_expired(p)) {
++				prev = &p->u.dst.rt_next;
++			} else {
++				*prev = next;
++				rt_free(p);
++			}
++		}
++		}
++#else
+ 		rth = rt_hash_table[i].chain;
+ 		rt_hash_table[i].chain = NULL;
++		tail = NULL;
++
++#endif
+ 		spin_unlock_bh(rt_hash_lock_addr(i));
+ 
+-		for (; rth; rth = next) {
++		for (; rth != tail; rth = next) {
+ 			next = rth->u.dst.rt_next;
+ 			rt_free(rth);
+ 		}
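
The CONFIG_NET_NS branch above keeps the bucket spinlock's hold time short: expired entries are only unlinked from the chain while the lock is held, and the rt_free() calls run after the unlock (the non-NET_NS branch detaches the whole chain the same way). The locking shape, reduced to a sketch with a hypothetical unlink helper:

/* Detach under the lock, free outside it (shape of rt_do_flush). */
spin_lock_bh(rt_hash_lock_addr(i));
head = detach_expired(&rt_hash_table[i].chain);	/* hypothetical helper */
spin_unlock_bh(rt_hash_lock_addr(i));

for (rth = head; rth; rth = next) {		/* no lock held here */
	next = rth->u.dst.rt_next;
	rt_free(rth);
}
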
+@@ -738,7 +795,7 @@ static void rt_check_expire(void)
+ 			continue;
+ 		spin_lock_bh(rt_hash_lock_addr(i));
+ 		while ((rth = *rthp) != NULL) {
+-			if (rth->rt_genid != atomic_read(&rt_genid)) {
++			if (rt_is_expired(rth)) {
+ 				*rthp = rth->u.dst.rt_next;
+ 				rt_free(rth);
+ 				continue;
+@@ -781,21 +838,21 @@ static void rt_worker_func(struct work_struct *work)
+  * many times (2^24) without giving recent rt_genid.
+  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
+  */
+-static void rt_cache_invalidate(void)
++static void rt_cache_invalidate(struct net *net)
+ {
+ 	unsigned char shuffle;
+ 
+ 	get_random_bytes(&shuffle, sizeof(shuffle));
+-	atomic_add(shuffle + 1U, &rt_genid);
++	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
+ }
+ 
+ /*
+  * delay < 0  : invalidate cache (fast : entries will be deleted later)
+  * delay >= 0 : invalidate & flush cache (can be long)
+  */
+-void rt_cache_flush(int delay)
++void rt_cache_flush(struct net *net, int delay)
+ {
+-	rt_cache_invalidate();
++	rt_cache_invalidate(net);
+ 	if (delay >= 0)
+ 		rt_do_flush(!in_softirq());
+ }
+@@ -803,10 +860,12 @@ void rt_cache_flush(int delay)
+ /*
+  * We change rt_genid and let gc do the cleanup
+  */
+-static void rt_secret_rebuild(unsigned long dummy)
++static void rt_secret_rebuild(unsigned long __net)
+ {
+-	rt_cache_invalidate();
+-	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
++	struct net *net = (struct net *)__net;
++
++	rt_cache_invalidate(net);
++	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
+ }
+ 
+ /*
+@@ -882,7 +941,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
+ 			rthp = &rt_hash_table[k].chain;
+ 			spin_lock_bh(rt_hash_lock_addr(k));
+ 			while ((rth = *rthp) != NULL) {
+-				if (rth->rt_genid == atomic_read(&rt_genid) &&
++				if (!rt_is_expired(rth) &&
+ 					!rt_may_expire(rth, tmo, expire)) {
+ 					tmo >>= 1;
+ 					rthp = &rth->u.dst.rt_next;
+@@ -964,7 +1023,7 @@ restart:
+ 
+ 	spin_lock_bh(rt_hash_lock_addr(hash));
+ 	while ((rth = *rthp) != NULL) {
+-		if (rth->rt_genid != atomic_read(&rt_genid)) {
++		if (rt_is_expired(rth)) {
+ 			*rthp = rth->u.dst.rt_next;
+ 			rt_free(rth);
+ 			continue;
+@@ -1140,7 +1199,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
+ 	spin_lock_bh(rt_hash_lock_addr(hash));
+ 	ip_rt_put(rt);
+ 	while ((aux = *rthp) != NULL) {
+-		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
++		if (aux == rt || rt_is_expired(aux)) {
+ 			*rthp = aux->u.dst.rt_next;
+ 			rt_free(aux);
+ 			continue;
+@@ -1182,7 +1241,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
+ 
+ 	for (i = 0; i < 2; i++) {
+ 		for (k = 0; k < 2; k++) {
+-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
++			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
++						rt_genid(net));
+ 
+ 			rthp=&rt_hash_table[hash].chain;
+ 
+@@ -1194,7 +1254,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
+ 				    rth->fl.fl4_src != skeys[i] ||
+ 				    rth->fl.oif != ikeys[k] ||
+ 				    rth->fl.iif != 0 ||
+-				    rth->rt_genid != atomic_read(&rt_genid) ||
++				    rt_is_expired(rth) ||
+ 				    !net_eq(dev_net(rth->u.dst.dev), net)) {
+ 					rthp = &rth->u.dst.rt_next;
+ 					continue;
+@@ -1233,7 +1293,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
+ 				rt->u.dst.neighbour	= NULL;
+ 				rt->u.dst.hh		= NULL;
+ 				rt->u.dst.xfrm		= NULL;
+-				rt->rt_genid		= atomic_read(&rt_genid);
++				rt->rt_genid		= rt_genid(net);
++#ifdef CONFIG_VE
++				rt->fl.owner_env = get_exec_env();
++#endif
+ 				rt->rt_flags		|= RTCF_REDIRECTED;
+ 
+ 				/* Gateway is different ... */
+@@ -1297,7 +1360,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+ 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ 			   rt->u.dst.expires) {
+ 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+-						rt->fl.oif);
++						rt->fl.oif,
++						rt_genid(dev_net(dst->dev)));
+ #if RT_CACHE_DEBUG >= 1
+ 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
+ 					  NIPQUAD_FMT "/%02x dropped\n",
+@@ -1446,7 +1510,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+ 
+ 	for (k = 0; k < 2; k++) {
+ 		for (i = 0; i < 2; i++) {
+-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
++			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
++						rt_genid(net));
+ 
+ 			rcu_read_lock();
+ 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+@@ -1461,7 +1526,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+ 				    rth->fl.iif != 0 ||
+ 				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
+ 				    !net_eq(dev_net(rth->u.dst.dev), net) ||
+-				    rth->rt_genid != atomic_read(&rt_genid))
++				    !rt_is_expired(rth))
+ 					continue;
+ 
+ 				if (new_mtu < 68 || new_mtu >= old_mtu) {
+@@ -1688,15 +1753,18 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ #ifdef CONFIG_NET_CLS_ROUTE
+ 	rth->u.dst.tclassid = itag;
+ #endif
++#ifdef CONFIG_VE
++	rth->fl.owner_env = get_exec_env();
++#endif
+ 	rth->rt_iif	=
+ 	rth->fl.iif	= dev->ifindex;
+-	rth->u.dst.dev	= init_net.loopback_dev;
++	rth->u.dst.dev	= get_exec_env()->ve_netns->loopback_dev;
+ 	dev_hold(rth->u.dst.dev);
+ 	rth->idev	= in_dev_get(rth->u.dst.dev);
+ 	rth->fl.oif	= 0;
+ 	rth->rt_gateway	= daddr;
+ 	rth->rt_spec_dst= spec_dst;
+-	rth->rt_genid	= atomic_read(&rt_genid);
++	rth->rt_genid	= rt_genid(dev_net(dev));
+ 	rth->rt_flags	= RTCF_MULTICAST;
+ 	rth->rt_type	= RTN_MULTICAST;
+ 	if (our) {
+@@ -1711,7 +1779,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ 	RT_CACHE_STAT_INC(in_slow_mc);
+ 
+ 	in_dev_put(in_dev);
+-	hash = rt_hash(daddr, saddr, dev->ifindex);
++	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
+ 	return rt_intern_hash(hash, rth, &skb->rtable);
+ 
+ e_nobufs:
+@@ -1827,6 +1895,9 @@ static int __mkroute_input(struct sk_buff *skb,
+ 	rth->fl.fl4_src	= saddr;
+ 	rth->rt_src	= saddr;
+ 	rth->rt_gateway	= daddr;
++#ifdef CONFIG_VE
++	rth->fl.owner_env = get_exec_env();
++#endif
+ 	rth->rt_iif 	=
+ 		rth->fl.iif	= in_dev->dev->ifindex;
+ 	rth->u.dst.dev	= (out_dev)->dev;
+@@ -1837,7 +1908,7 @@ static int __mkroute_input(struct sk_buff *skb,
+ 
+ 	rth->u.dst.input = ip_forward;
+ 	rth->u.dst.output = ip_output;
+-	rth->rt_genid = atomic_read(&rt_genid);
++	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+ 
+ 	rt_set_nexthop(rth, res, itag);
+ 
+@@ -1872,7 +1943,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
+ 		return err;
+ 
+ 	/* put it into the cache */
+-	hash = rt_hash(daddr, saddr, fl->iif);
++	hash = rt_hash(daddr, saddr, fl->iif,
++		       rt_genid(dev_net(rth->u.dst.dev)));
+ 	return rt_intern_hash(hash, rth, &skb->rtable);
+ }
+ 
+@@ -1998,7 +2070,7 @@ local_input:
+ 		goto e_nobufs;
+ 
+ 	rth->u.dst.output= ip_rt_bug;
+-	rth->rt_genid = atomic_read(&rt_genid);
++	rth->rt_genid = rt_genid(net);
+ 
+ 	atomic_set(&rth->u.dst.__refcnt, 1);
+ 	rth->u.dst.flags= DST_HOST;
+@@ -2020,6 +2092,9 @@ local_input:
+ 	rth->idev	= in_dev_get(rth->u.dst.dev);
+ 	rth->rt_gateway	= daddr;
+ 	rth->rt_spec_dst= spec_dst;
++#ifdef CONFIG_VE
++	rth->fl.owner_env = get_exec_env();
++#endif
+ 	rth->u.dst.input= ip_local_deliver;
+ 	rth->rt_flags 	= flags|RTCF_LOCAL;
+ 	if (res.type == RTN_UNREACHABLE) {
+@@ -2028,7 +2103,7 @@ local_input:
+ 		rth->rt_flags 	&= ~RTCF_LOCAL;
+ 	}
+ 	rth->rt_type	= res.type;
+-	hash = rt_hash(daddr, saddr, fl.iif);
++	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
+ 	err = rt_intern_hash(hash, rth, &skb->rtable);
+ 	goto done;
+ 
+@@ -2079,7 +2154,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ 
+ 	net = dev_net(dev);
+ 	tos &= IPTOS_RT_MASK;
+-	hash = rt_hash(daddr, saddr, iif);
++	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
+ 
+ 	rcu_read_lock();
+ 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+@@ -2091,7 +2166,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
+ 		    rth->fl.mark == skb->mark &&
+ 		    net_eq(dev_net(rth->u.dst.dev), net) &&
+-		    rth->rt_genid == atomic_read(&rt_genid)) {
++		    !rt_is_expired(rth)) {
+ 			dst_use(&rth->u.dst, jiffies);
+ 			RT_CACHE_STAT_INC(in_hit);
+ 			rcu_read_unlock();
+@@ -2209,6 +2284,9 @@ static int __mkroute_output(struct rtable **result,
+ 	rth->fl.mark    = oldflp->mark;
+ 	rth->rt_dst	= fl->fl4_dst;
+ 	rth->rt_src	= fl->fl4_src;
++#ifdef CONFIG_VE
++	rth->fl.owner_env = get_exec_env();
++#endif
+ 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
+ 	/* get references to the devices that are to be hold by the routing
+ 	   cache entry */
+@@ -2219,7 +2297,7 @@ static int __mkroute_output(struct rtable **result,
+ 	rth->rt_spec_dst= fl->fl4_src;
+ 
+ 	rth->u.dst.output=ip_output;
+-	rth->rt_genid = atomic_read(&rt_genid);
++	rth->rt_genid = rt_genid(dev_net(dev_out));
+ 
+ 	RT_CACHE_STAT_INC(out_slow_tot);
+ 
+@@ -2268,7 +2346,8 @@ static int ip_mkroute_output(struct rtable **rp,
+ 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
+ 	unsigned hash;
+ 	if (err == 0) {
+-		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
++		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
++			       rt_genid(dev_net(dev_out)));
+ 		err = rt_intern_hash(hash, rth, rp);
+ 	}
+ 
+@@ -2313,10 +2392,13 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
+ 		    ipv4_is_zeronet(oldflp->fl4_src))
+ 			goto out;
+ 
+-		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+-		dev_out = ip_dev_find(net, oldflp->fl4_src);
+-		if (dev_out == NULL)
+-			goto out;
++		if (ip_rt_src_check) {
++			/* It is equivalent to
++			   inet_addr_type(saddr) == RTN_LOCAL */
++			dev_out = ip_dev_find(net, oldflp->fl4_src);
++			if (dev_out == NULL)
++				goto out;
++		}
+ 
+ 		/* I removed check for oif == dev_out->oif here.
+ 		   It was wrong for two reasons:
+@@ -2344,6 +2426,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
+ 			   Luckily, this hack is good workaround.
+ 			 */
+ 
++			if (dev_out == NULL) {
++				dev_out = ip_dev_find(net, oldflp->fl4_src);
++				if (dev_out == NULL)
++					goto out;
++			}
++
+ 			fl.oif = dev_out->ifindex;
+ 			goto make_route;
+ 		}
+@@ -2480,7 +2568,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
+ 	unsigned hash;
+ 	struct rtable *rth;
+ 
+-	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
++	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
+ 
+ 	rcu_read_lock_bh();
+ 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+@@ -2493,7 +2581,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
+ 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+ 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
+ 		    net_eq(dev_net(rth->u.dst.dev), net) &&
+-		    rth->rt_genid == atomic_read(&rt_genid)) {
++		    !rt_is_expired(rth)) {
+ 			dst_use(&rth->u.dst, jiffies);
+ 			RT_CACHE_STAT_INC(out_hit);
+ 			rcu_read_unlock_bh();
+@@ -2524,7 +2612,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
+ };
+ 
+ 
+-static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
++static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
+ {
+ 	struct rtable *ort = *rp;
+ 	struct rtable *rt = (struct rtable *)
+@@ -2548,7 +2636,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
+ 		rt->idev = ort->idev;
+ 		if (rt->idev)
+ 			in_dev_hold(rt->idev);
+-		rt->rt_genid = atomic_read(&rt_genid);
++		rt->rt_genid = rt_genid(net);
+ 		rt->rt_flags = ort->rt_flags;
+ 		rt->rt_type = ort->rt_type;
+ 		rt->rt_dst = ort->rt_dst;
+@@ -2584,7 +2672,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
+ 		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
+ 				    flags ? XFRM_LOOKUP_WAIT : 0);
+ 		if (err == -EREMOTE)
+-			err = ipv4_dst_blackhole(rp, flp);
++			err = ipv4_dst_blackhole(net, rp, flp);
+ 
+ 		return err;
+ 	}
+@@ -2803,7 +2891,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
+ 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
+ 			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+ 				continue;
+-			if (rt->rt_genid != atomic_read(&rt_genid))
++			if (rt_is_expired(rt))
+ 				continue;
+ 			skb->dst = dst_clone(&rt->u.dst);
+ 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+@@ -2827,19 +2915,29 @@ done:
+ 
+ void ip_rt_multicast_event(struct in_device *in_dev)
+ {
+-	rt_cache_flush(0);
++	rt_cache_flush(dev_net(in_dev->dev), 0);
+ }
+ 
+ #ifdef CONFIG_SYSCTL
+-static int flush_delay;
++#warning "TODO: rework this via read-only per-net sysctls"
+ 
+ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
+ 					struct file *filp, void __user *buffer,
+ 					size_t *lenp, loff_t *ppos)
+ {
+ 	if (write) {
++		int flush_delay;
++		static DEFINE_MUTEX(flush_mutex);
++		struct net *net;
++
++		mutex_lock(&flush_mutex);
++		ctl->data = &flush_delay;
+ 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+-		rt_cache_flush(flush_delay);
++		ctl->data = NULL;
++		mutex_unlock(&flush_mutex);
++
++		net = (struct net *)ctl->extra1;
++		rt_cache_flush(net, flush_delay);
+ 		return 0;
+ 	}
+ 
+@@ -2855,25 +2953,18 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
+ 						size_t newlen)
+ {
+ 	int delay;
++	struct net *net;
+ 	if (newlen != sizeof(int))
+ 		return -EINVAL;
+ 	if (get_user(delay, (int __user *)newval))
+ 		return -EFAULT;
+-	rt_cache_flush(delay);
++	net = (struct net *)table->extra1;
++	rt_cache_flush(net, delay);
+ 	return 0;
+ }
+ 
+ ctl_table ipv4_route_table[] = {
+ 	{
+-		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
+-		.procname	= "flush",
+-		.data		= &flush_delay,
+-		.maxlen		= sizeof(int),
+-		.mode		= 0200,
+-		.proc_handler	= &ipv4_sysctl_rtcache_flush,
+-		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
+-	},
+-	{
+ 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
+ 		.procname	= "gc_thresh",
+ 		.data		= &ipv4_dst_ops.gc_thresh,
+@@ -3011,8 +3102,97 @@ ctl_table ipv4_route_table[] = {
+ 	},
+ 	{ .ctl_name = 0 }
+ };
++
++static __net_initdata struct ctl_path ipv4_route_path[] = {
++	{ .procname = "net", .ctl_name = CTL_NET, },
++	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
++	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
++	{ },
++};
++
++
++static struct ctl_table ipv4_route_flush_table[] = {
++	{
++		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
++		.procname	= "flush",
++		.maxlen		= sizeof(int),
++		.mode		= 0200,
++		.proc_handler	= &ipv4_sysctl_rtcache_flush,
++		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
++	},
++	{ .ctl_name = 0 },
++};
++
++static __net_init int sysctl_route_net_init(struct net *net)
++{
++	struct ctl_table *tbl;
++
++	tbl = ipv4_route_flush_table;
++	if (net != &init_net) {
++		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
++		if (tbl == NULL)
++			goto err_dup;
++	}
++	tbl[0].extra1 = net;
++
++	net->ipv4.route_hdr =
++		register_net_sysctl_table(net, ipv4_route_path, tbl);
++	if (net->ipv4.route_hdr == NULL)
++		goto err_reg;
++	return 0;
++
++err_reg:
++	if (tbl != ipv4_route_flush_table)
++		kfree(tbl);
++err_dup:
++	return -ENOMEM;
++}
++
++static __net_exit void sysctl_route_net_exit(struct net *net)
++{
++	struct ctl_table *tbl;
++
++	tbl = net->ipv4.route_hdr->ctl_table_arg;
++	unregister_net_sysctl_table(net->ipv4.route_hdr);
++	BUG_ON(tbl == ipv4_route_flush_table);
++	kfree(tbl);
++}
++
++static __net_initdata struct pernet_operations sysctl_route_ops = {
++	.init = sysctl_route_net_init,
++	.exit = sysctl_route_net_exit,
++};
+ #endif
+ 
++
++static __net_init int rt_secret_timer_init(struct net *net)
++{
++	atomic_set(&net->ipv4.rt_genid,
++			(int) ((num_physpages ^ (num_physpages>>8)) ^
++			(jiffies ^ (jiffies >> 7))));
++
++	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
++	net->ipv4.rt_secret_timer.data = (unsigned long)net;
++	init_timer_deferrable(&net->ipv4.rt_secret_timer);
++
++	net->ipv4.rt_secret_timer.expires =
++		jiffies + net_random() % ip_rt_secret_interval +
++		ip_rt_secret_interval;
++	add_timer(&net->ipv4.rt_secret_timer);
++	return 0;
++}
++
++static __net_exit void rt_secret_timer_exit(struct net *net)
++{
++	del_timer_sync(&net->ipv4.rt_secret_timer);
++}
++
++static __net_initdata struct pernet_operations rt_secret_timer_ops = {
++	.init = rt_secret_timer_init,
++	.exit = rt_secret_timer_exit,
++};
++
++
+ #ifdef CONFIG_NET_CLS_ROUTE
+ struct ip_rt_acct *ip_rt_acct __read_mostly;
+ #endif /* CONFIG_NET_CLS_ROUTE */
+@@ -3031,9 +3211,6 @@ int __init ip_rt_init(void)
+ {
+ 	int rc = 0;
+ 
+-	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
+-			     (jiffies ^ (jiffies >> 7))));
+-
+ #ifdef CONFIG_NET_CLS_ROUTE
+ 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+ 	if (!ip_rt_acct)
+@@ -3065,19 +3242,14 @@ int __init ip_rt_init(void)
+ 	devinet_init();
+ 	ip_fib_init();
+ 
+-	rt_secret_timer.function = rt_secret_rebuild;
+-	rt_secret_timer.data = 0;
+-	init_timer_deferrable(&rt_secret_timer);
+-
+ 	/* All the timers, started at system startup tend
+ 	   to synchronize. Perturb it a bit.
+ 	 */
+ 	schedule_delayed_work(&expires_work,
+ 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+ 
+-	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
+-		ip_rt_secret_interval;
+-	add_timer(&rt_secret_timer);
++	if (register_pernet_subsys(&rt_secret_timer_ops))
++		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
+ 
+ 	if (ip_rt_proc_init())
+ 		printk(KERN_ERR "Unable to create route proc files\n");
+@@ -3087,6 +3259,9 @@ int __init ip_rt_init(void)
+ #endif
+ 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+ 
++#ifdef CONFIG_SYSCTL
++	register_pernet_subsys(&sysctl_route_ops);
++#endif
+ 	return rc;
+ }
+ 
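
Moving rt_genid into struct net makes a route-cache flush both O(1) and per-namespace: rt_cache_invalidate() bumps the counter by a random step of 1..256 so successive generations stay unpredictable, and every lookup or GC walk discards entries whose recorded genid no longer matches, via rt_is_expired(). The lifecycle of a single entry, as a linear sketch rather than a real function body:

/* One cache entry under the generation scheme (linear sketch). */
rt->rt_genid = rt_genid(net);	/* minted under, say, genid 7         */
rt_cache_invalidate(net);	/* flush: genid += random step 1..256 */
if (rt_is_expired(rt))		/* 7 != current genid, entry is stale */
	rt_free(rt);		/* reaped lazily on the next walk     */
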
+diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
+index c437f80..e69d5ee 100644
+--- a/net/ipv4/sysctl_net_ipv4.c
++++ b/net/ipv4/sysctl_net_ipv4.c
+@@ -28,6 +28,9 @@ static int tcp_retr1_max = 255;
+ static int ip_local_port_range_min[] = { 1, 1 };
+ static int ip_local_port_range_max[] = { 65535, 65535 };
+ 
++int sysctl_tcp_use_sg = 1;
++EXPORT_SYMBOL(sysctl_tcp_use_sg);
++
+ extern seqlock_t sysctl_port_range_lock;
+ extern int sysctl_local_port_range[2];
+ 
+@@ -419,6 +422,13 @@ static struct ctl_table ipv4_table[] = {
+ 		.mode		= 0644,
+ 		.proc_handler	= &proc_dointvec
+ 	},
++	{
++		.procname	= "tcp_use_sg",
++		.data		= &sysctl_tcp_use_sg,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
+ 
+ #endif
+ 	{
+@@ -586,6 +596,20 @@ static struct ctl_table ipv4_table[] = {
+ 		.proc_handler	= &proc_dointvec
+ 	},
+ 	{
++		.procname       = "tcp_max_tw_kmem_fraction",
++		.data           = &sysctl_tcp_max_tw_kmem_fraction,
++		.maxlen         = sizeof(int),
++		.mode           = 0644,
++		.proc_handler   = proc_dointvec,
++	},
++	{
++		.procname       = "tcp_max_tw_buckets_ub",
++		.data           = &sysctl_tcp_max_tw_buckets_ub,
++		.maxlen         = sizeof(int),
++		.mode           = 0644,
++		.proc_handler   = proc_dointvec,
++	},
++	{
+ 		.ctl_name	= NET_TCP_NO_METRICS_SAVE,
+ 		.procname	= "tcp_no_metrics_save",
+ 		.data		= &sysctl_tcp_nometrics_save,
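
sysctl_tcp_use_sg is exported here but its consumer sits outside this excerpt; the intent is evidently a runtime kill switch for scatter/gather transmission. A hypothetical consumer would gate the SG path on both the device capability and the knob:

/* Hypothetical consumer of the new knob; not part of this hunk. */
static inline int tcp_can_sg(const struct sock *sk)
{
	return (sk->sk_route_caps & NETIF_F_SG) && sysctl_tcp_use_sg;
}
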
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index 1d723de..56f3de7 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -274,6 +274,10 @@
+ #include <net/netdma.h>
+ #include <net/sock.h>
+ 
++#include <bc/sock_orphan.h>
++#include <bc/net.h>
++#include <bc/tcp.h>
++
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+ 
+@@ -340,6 +344,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+ 	unsigned int mask;
+ 	struct sock *sk = sock->sk;
+ 	struct tcp_sock *tp = tcp_sk(sk);
++	int check_send_space;
+ 
+ 	poll_wait(file, sk->sk_sleep, wait);
+ 	if (sk->sk_state == TCP_LISTEN)
+@@ -354,6 +359,21 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+ 	if (sk->sk_err)
+ 		mask = POLLERR;
+ 
++	check_send_space = 1;
++#ifdef CONFIG_BEANCOUNTERS
++	if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) {
++		unsigned long size;
++		size = MAX_TCP_HEADER + tp->mss_cache;
++		if (size > SOCK_MIN_UBCSPACE)
++			size = SOCK_MIN_UBCSPACE;
++		size = skb_charge_size(size);
++		if (ub_sock_makewres_tcp(sk, size)) {
++			check_send_space = 0;
++			ub_sock_sndqueueadd_tcp(sk, size);
++		}
++	}
++#endif
++
+ 	/*
+ 	 * POLLHUP is certainly not done right. But poll() doesn't
+ 	 * have a notion of HUP in just one direction, and for a
+@@ -397,7 +417,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+ 		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+ 			mask |= POLLIN | POLLRDNORM;
+ 
+-		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
++		if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ 				mask |= POLLOUT | POLLWRNORM;
+ 			} else {  /* send SIGIO later */
+@@ -641,7 +661,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+ 
+ 	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ 	if (skb) {
+-		if (sk_wmem_schedule(sk, skb->truesize)) {
++		if (sk_wmem_schedule(sk, skb->truesize, skb)) {
+ 			/*
+ 			 * Make sure that we have exactly size bytes
+ 			 * available to the caller, no more, no less.
+@@ -687,15 +707,22 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
+ 		int copy, i, can_coalesce;
+ 		int offset = poffset % PAGE_SIZE;
+ 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
++		unsigned long chargesize = 0;
+ 
+ 		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+ new_segment:
++			chargesize = 0;
+ 			if (!sk_stream_memory_free(sk))
+ 				goto wait_for_sndbuf;
+ 
++			chargesize = skb_charge_size(MAX_TCP_HEADER +
++					tp->mss_cache);
++			if (ub_sock_getwres_tcp(sk, chargesize) < 0)
++				goto wait_for_ubspace;
+ 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ 			if (!skb)
+ 				goto wait_for_memory;
++			ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+ 
+ 			skb_entail(sk, skb);
+ 			copy = size_goal;
+@@ -710,7 +737,7 @@ new_segment:
+ 			tcp_mark_push(tp, skb);
+ 			goto new_segment;
+ 		}
+-		if (!sk_wmem_schedule(sk, copy))
++		if (!sk_wmem_schedule(sk, copy, skb))
+ 			goto wait_for_memory;
+ 
+ 		if (can_coalesce) {
+@@ -751,10 +778,15 @@ new_segment:
+ wait_for_sndbuf:
+ 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ wait_for_memory:
++		ub_sock_retwres_tcp(sk, chargesize,
++			skb_charge_size(MAX_TCP_HEADER + tp->mss_cache));
++		chargesize = 0;
++wait_for_ubspace:
+ 		if (copied)
+ 			tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ 
+-		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
++		err = __sk_stream_wait_memory(sk, &timeo, chargesize);
++		if (err != 0)
+ 			goto do_error;
+ 
+ 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+@@ -791,12 +823,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
+ 	return res;
+ }
+ 
+-#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+-#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+-
+-static inline int select_size(struct sock *sk)
++static inline int select_size(struct sock *sk, struct tcp_sock *tp)
+ {
+-	struct tcp_sock *tp = tcp_sk(sk);
+ 	int tmp = tp->mss_cache;
+ 
+ 	if (sk->sk_route_caps & NETIF_F_SG) {
+@@ -855,6 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ 	while (--iovlen >= 0) {
+ 		int seglen = iov->iov_len;
+ 		unsigned char __user *from = iov->iov_base;
++		unsigned long chargesize = 0;
+ 
+ 		iov++;
+ 
+@@ -865,18 +894,27 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ 
+ 			if (!tcp_send_head(sk) ||
+ 			    (copy = size_goal - skb->len) <= 0) {
++				unsigned long size;
+ 
+ new_segment:
+ 				/* Allocate new segment. If the interface is SG,
+ 				 * allocate skb fitting to single page.
+ 				 */
++				chargesize = 0;
+ 				if (!sk_stream_memory_free(sk))
+ 					goto wait_for_sndbuf;
+ 
+-				skb = sk_stream_alloc_skb(sk, select_size(sk),
++				size = select_size(sk, tp);
++				chargesize = skb_charge_size(MAX_TCP_HEADER +
++						size);
++				if (ub_sock_getwres_tcp(sk, chargesize) < 0)
++					goto wait_for_ubspace;
++				skb = sk_stream_alloc_skb(sk, size,
+ 						sk->sk_allocation);
+ 				if (!skb)
+ 					goto wait_for_memory;
++				ub_skb_set_charge(skb, sk, chargesize,
++						UB_TCPSNDBUF);
+ 
+ 				/*
+ 				 * Check whether we can use HW checksum.
+@@ -922,6 +960,7 @@ new_segment:
+ 				} else if (page) {
+ 					if (off == PAGE_SIZE) {
+ 						put_page(page);
++						ub_sock_tcp_detachpage(sk);
+ 						TCP_PAGE(sk) = page = NULL;
+ 						off = 0;
+ 					}
+@@ -931,10 +970,13 @@ new_segment:
+ 				if (copy > PAGE_SIZE - off)
+ 					copy = PAGE_SIZE - off;
+ 
+-				if (!sk_wmem_schedule(sk, copy))
++				if (!sk_wmem_schedule(sk, copy, skb))
+ 					goto wait_for_memory;
+ 
+ 				if (!page) {
++					chargesize = PAGE_SIZE;
++					if (ub_sock_tcp_chargepage(sk) < 0)
++						goto wait_for_ubspace;
+ 					/* Allocate new cache page. */
+ 					if (!(page = sk_stream_alloc_page(sk)))
+ 						goto wait_for_memory;
+@@ -966,7 +1008,8 @@ new_segment:
+ 					} else if (off + copy < PAGE_SIZE) {
+ 						get_page(page);
+ 						TCP_PAGE(sk) = page;
+-					}
++					} else
++						ub_sock_tcp_detachpage(sk);
+ 				}
+ 
+ 				TCP_OFF(sk) = off + copy;
+@@ -997,10 +1040,15 @@ new_segment:
+ wait_for_sndbuf:
+ 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ wait_for_memory:
++			ub_sock_retwres_tcp(sk, chargesize,
++				skb_charge_size(MAX_TCP_HEADER+tp->mss_cache));
++			chargesize = 0;
++wait_for_ubspace:
+ 			if (copied)
+ 				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+ 
+-			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
++			err = __sk_stream_wait_memory(sk, &timeo, chargesize);
++			if (err != 0)
+ 				goto do_error;
+ 
+ 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+@@ -1100,7 +1148,18 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
+ #if TCP_DEBUG
+ 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ 
+-	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
++	if (!(skb == NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) {
++		printk("KERNEL: assertion: skb==NULL || "
++				"before(tp->copied_seq, skb->end_seq)\n");
++		printk("VE%u pid %d comm %.16s\n",
++				(get_exec_env() ? VEID(get_exec_env()) : 0),
++				current->pid, current->comm);
++		printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied,
++				tp->copied_seq, tp->rcv_nxt);
++		printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n",
++				skb->len, TCP_SKB_CB(skb)->seq,
++				TCP_SKB_CB(skb)->end_seq);
++	}
+ #endif
+ 
+ 	if (inet_csk_ack_scheduled(sk)) {
+@@ -1362,7 +1421,23 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ 				goto found_ok_skb;
+ 			if (tcp_hdr(skb)->fin)
+ 				goto found_fin_ok;
+-			BUG_TRAP(flags & MSG_PEEK);
++			if (!(flags & MSG_PEEK)) {
++				printk("KERNEL: assertion: flags&MSG_PEEK\n");
++				printk("VE%u pid %d comm %.16s\n",
++						(get_exec_env() ?
++						 VEID(get_exec_env()) : 0),
++						current->pid, current->comm);
++				printk("flags=0x%x, len=%d, copied_seq=%d, "
++						"rcv_nxt=%d\n", flags,
++						(int)len, tp->copied_seq,
++						tp->rcv_nxt);
++				printk("skb->len=%d, *seq=%d, skb->seq=%d, "
++						"skb->end_seq=%d, offset=%d\n",
++						skb->len, *seq,
++						TCP_SKB_CB(skb)->seq,
++						TCP_SKB_CB(skb)->end_seq,
++						offset);
++			}
+ 			skb = skb->next;
+ 		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+ 
+@@ -1425,8 +1500,19 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ 
+ 			tp->ucopy.len = len;
+ 
+-			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
+-				 (flags & (MSG_PEEK | MSG_TRUNC)));
++			if (!(tp->copied_seq == tp->rcv_nxt ||
++						(flags & (MSG_PEEK | MSG_TRUNC)))) {
++				printk("KERNEL: assertion: tp->copied_seq == "
++						"tp->rcv_nxt || ...\n");
++				printk("VE%u pid %d comm %.16s\n",
++						(get_exec_env() ?
++						 VEID(get_exec_env()) : 0),
++						current->pid, current->comm);
++				printk("flags=0x%x, len=%d, copied_seq=%d, "
++						"rcv_nxt=%d\n", flags,
++						(int)len, tp->copied_seq,
++						tp->rcv_nxt);
++			}
+ 
+ 			/* Ugly... If prequeue is not empty, we have to
+ 			 * process it before releasing socket, otherwise
+@@ -1837,7 +1923,7 @@ adjudge_to_death:
+ 	state = sk->sk_state;
+ 	sock_hold(sk);
+ 	sock_orphan(sk);
+-	atomic_inc(sk->sk_prot->orphan_count);
++	ub_inc_orphan_count(sk);
+ 
+ 	/* It is the last release_sock in its life. It will remove backlog. */
+ 	release_sock(sk);
+@@ -1887,12 +1973,19 @@ adjudge_to_death:
+ 		}
+ 	}
+ 	if (sk->sk_state != TCP_CLOSE) {
++		int orphans = ub_get_orphan_count(sk);
++
+ 		sk_mem_reclaim(sk);
+-		if (tcp_too_many_orphans(sk,
+-				atomic_read(sk->sk_prot->orphan_count))) {
+-			if (net_ratelimit())
++		if (ub_too_many_orphans(sk, orphans)) {
++			if (net_ratelimit()) {
++				int ubid = 0;
++#ifdef CONFIG_USER_RESOURCE
++				ubid = sock_has_ubc(sk) ?
++				   top_beancounter(sock_bc(sk)->ub)->ub_uid : 0;
++#endif
+ 				printk(KERN_INFO "TCP: too many orphaned "
+-				       "sockets\n");
++				       "sockets (%d in CT%d)\n", orphans, ubid);
++			}
+ 			tcp_set_state(sk, TCP_CLOSE);
+ 			tcp_send_active_reset(sk, GFP_ATOMIC);
+ 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+@@ -1968,6 +2061,7 @@ int tcp_disconnect(struct sock *sk, int flags)
+ 	tp->snd_ssthresh = 0x7fffffff;
+ 	tp->snd_cwnd_cnt = 0;
+ 	tp->bytes_acked = 0;
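++	/* Reset advmss to its maximum; it is clamped down again from the
++	 * route metrics on the next connect (see tcp_connect_init()). */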
++	tp->advmss = 65535;
+ 	tcp_set_ca_state(sk, TCP_CA_Open);
+ 	tcp_clear_retrans(tp);
+ 	inet_csk_delack_init(sk);
+@@ -2632,7 +2726,7 @@ void __init tcp_init(void)
+ 	tcp_hashinfo.bind_bucket_cachep =
+ 		kmem_cache_create("tcp_bind_bucket",
+ 				  sizeof(struct inet_bind_bucket), 0,
+-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
++				  SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
+ 
+ 	/* Size and allocate the main established and bind bucket
+ 	 * hash tables.
+@@ -2701,6 +2795,11 @@ void __init tcp_init(void)
+ 	sysctl_tcp_mem[1] = limit;
+ 	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ 
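++	/* Keep the pressure and low thresholds within 4096 pages of the
++	 * hard limit, narrowing the gaps computed by the auto-tuning
++	 * above. */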
++	if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096)
++		sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096;
++	if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096)
++		sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096;
++
+ 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+ 	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
+ 	max_share = min(4UL*1024*1024, limit);
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index cad73b7..bdb0162 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -73,6 +73,8 @@
+ #include <asm/unaligned.h>
+ #include <net/netdma.h>
+ 
++#include <bc/tcp.h>
++
+ int sysctl_tcp_timestamps __read_mostly = 1;
+ int sysctl_tcp_window_scaling __read_mostly = 1;
+ int sysctl_tcp_sack __read_mostly = 1;
+@@ -308,7 +310,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
+ 	/* Check #1 */
+ 	if (tp->rcv_ssthresh < tp->window_clamp &&
+ 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
+-	    !tcp_memory_pressure) {
++	    ub_tcp_rmem_allows_expand(sk)) {
+ 		int incr;
+ 
+ 		/* Check #2. Increase window, if skb with such overhead
+@@ -378,6 +380,8 @@ static void tcp_init_buffer_space(struct sock *sk)
+ 
+ 	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+ 	tp->snd_cwnd_stamp = tcp_time_stamp;
++
++	ub_tcp_update_maxadvmss(sk);
+ }
+ 
+ /* 5. Recalculate window clamp after socket hit its memory bounds. */
+@@ -390,7 +394,7 @@ static void tcp_clamp_window(struct sock *sk)
+ 
+ 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+-	    !tcp_memory_pressure &&
++	    !ub_tcp_memory_pressure(sk) &&
+ 	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ 				    sysctl_tcp_rmem[2]);
+@@ -3877,19 +3881,19 @@ static void tcp_ofo_queue(struct sock *sk)
+ static int tcp_prune_ofo_queue(struct sock *sk);
+ static int tcp_prune_queue(struct sock *sk);
+ 
+-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
++static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+ {
+ 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+-	    !sk_rmem_schedule(sk, size)) {
++	    !sk_rmem_schedule(sk, skb)) {
+ 
+ 		if (tcp_prune_queue(sk) < 0)
+ 			return -1;
+ 
+-		if (!sk_rmem_schedule(sk, size)) {
++		if (!sk_rmem_schedule(sk, skb)) {
+ 			if (!tcp_prune_ofo_queue(sk))
+ 				return -1;
+ 
+-			if (!sk_rmem_schedule(sk, size))
++			if (!sk_rmem_schedule(sk, skb))
+ 				return -1;
+ 		}
+ 	}
+@@ -3945,8 +3949,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+ 		if (eaten <= 0) {
+ queue_and_out:
+ 			if (eaten < 0 &&
+-			    tcp_try_rmem_schedule(sk, skb->truesize))
+-				goto drop;
++			    tcp_try_rmem_schedule(sk, skb))
++				goto drop_part;
+ 
+ 			skb_set_owner_r(skb, sk);
+ 			__skb_queue_tail(&sk->sk_receive_queue, skb);
+@@ -3990,6 +3994,12 @@ out_of_window:
+ drop:
+ 		__kfree_skb(skb);
+ 		return;
++
++drop_part:
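++		/* Part of the skb may already have been accounted as
++		 * received; keep rcv_nxt in sync with copied_seq before
++		 * freeing it. */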
++		if (after(tp->copied_seq, tp->rcv_nxt))
++			tp->rcv_nxt = tp->copied_seq;
++		__kfree_skb(skb);
++		return;
+ 	}
+ 
+ 	/* Out of window. F.e. zero window probe. */
+@@ -4016,7 +4026,7 @@ drop:
+ 
+ 	TCP_ECN_check_ce(tp, skb);
+ 
+-	if (tcp_try_rmem_schedule(sk, skb->truesize))
++	if (tcp_try_rmem_schedule(sk, skb))
+ 		goto drop;
+ 
+ 	/* Disable header prediction. */
+@@ -4160,6 +4170,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ 		nskb = alloc_skb(copy + header, GFP_ATOMIC);
+ 		if (!nskb)
+ 			return;
++		if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) {
++			kfree_skb(nskb);
++			return;
++		}
+ 
+ 		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
+ 		skb_set_network_header(nskb, (skb_network_header(skb) -
+@@ -4287,7 +4301,7 @@ static int tcp_prune_queue(struct sock *sk)
+ 
+ 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ 		tcp_clamp_window(sk);
+-	else if (tcp_memory_pressure)
++	else if (ub_tcp_memory_pressure(sk))
+ 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ 
+ 	tcp_collapse_ofo_queue(sk);
+@@ -4352,7 +4366,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
+ 		return 0;
+ 
+ 	/* If we are under global TCP memory pressure, do not expand.  */
+-	if (tcp_memory_pressure)
++	if (ub_tcp_memory_pressure(sk))
+ 		return 0;
+ 
+ 	/* If we are under soft global TCP memory pressure, do not expand.  */
+@@ -4801,6 +4815,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ 
+ 				if ((int)skb->truesize > sk->sk_forward_alloc)
+ 					goto step5;
++				/* It is OK not to try to free memory here;
++				 * that is done below, on the slow path. Den */
++				if (ub_tcprcvbuf_charge(sk, skb) < 0)
++					goto step5;
+ 
+ 				NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
+ 
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index ffe869a..ca6b5d3 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -73,6 +73,8 @@
+ #include <net/xfrm.h>
+ #include <net/netdma.h>
+ 
++#include <bc/tcp.h>
++
+ #include <linux/inet.h>
+ #include <linux/ipv6.h>
+ #include <linux/stddef.h>
+@@ -699,7 +701,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+ 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+ 
+ 	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+-			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
++			tcptw->tw_rcv_wnd >>
++				(tw->tw_rcv_wscale & TW_WSCALE_MASK),
+ 			tcptw->tw_ts_recent);
+ 
+ 	inet_twsk_put(tw);
+@@ -1228,6 +1231,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+ 	.destructor	=	tcp_v4_reqsk_destructor,
+ 	.send_reset	=	tcp_v4_send_reset,
+ };
++EXPORT_SYMBOL_GPL(tcp_request_sock_ops);
+ 
+ #ifdef CONFIG_TCP_MD5SIG
+ static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+@@ -1532,6 +1536,10 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
+ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ {
+ 	struct sock *rsk;
++	struct user_beancounter *ub;
++
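++	/* Account the processing of this packet to the beancounter that
++	 * owns the socket; the previous context is restored on every
++	 * return path via the restore_context label. */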
++	ub = set_exec_ub(sock_bc(sk)->ub);
++
+ #ifdef CONFIG_TCP_MD5SIG
+ 	/*
+ 	 * We really want to reject the packet as early as possible
+@@ -1550,7 +1558,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 			goto reset;
+ 		}
+ 		TCP_CHECK_TIMER(sk);
+-		return 0;
++		goto restore_context;
+ 	}
+ 
+ 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+@@ -1566,7 +1574,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 				rsk = nsk;
+ 				goto reset;
+ 			}
+-			return 0;
++			goto restore_context;
+ 		}
+ 	}
+ 
+@@ -1576,6 +1584,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 		goto reset;
+ 	}
+ 	TCP_CHECK_TIMER(sk);
++
++restore_context:
++	(void)set_exec_ub(ub);
+ 	return 0;
+ 
+ reset:
+@@ -1587,7 +1598,7 @@ discard:
+ 	 * might be destroyed here. This current version compiles correctly,
+ 	 * but you have been warned.
+ 	 */
+-	return 0;
++	goto restore_context;
+ 
+ csum_err:
+ 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
+@@ -1849,6 +1860,8 @@ static int tcp_v4_init_sock(struct sock *sk)
+ 	tp->snd_cwnd_clamp = ~0;
+ 	tp->mss_cache = 536;
+ 
++	tp->advmss = 65535; /* max value */
++
+ 	tp->reordering = sysctl_tcp_reordering;
+ 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+ 
+@@ -1910,6 +1923,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
+ 	 * If sendmsg cached page exists, toss it.
+ 	 */
+ 	if (sk->sk_sndmsg_page) {
++		/* queue is empty, uncharge */
++		ub_sock_tcp_detachpage(sk);
+ 		__free_page(sk->sk_sndmsg_page);
+ 		sk->sk_sndmsg_page = NULL;
+ 	}
+@@ -2463,6 +2478,87 @@ void __init tcp_v4_init(void)
+ 		panic("Failed to create the TCP control socket.\n");
+ }
+ 
++#ifdef CONFIG_VE
++static void tcp_kill_ve_onesk(struct sock *sk)
++{
++	struct tcp_sock *tp = tcp_sk(sk);
++
++	/* Check the assumed state of the socket. */
++	if (!sock_flag(sk, SOCK_DEAD)) {
++		static int printed;
++invalid:
++		if (!printed)
++			printk(KERN_DEBUG "Killing sk: dead %d, state %d, "
++				"wrseq %u unseq %u, wrqu %d.\n",
++				sock_flag(sk, SOCK_DEAD), sk->sk_state,
++				tp->write_seq, tp->snd_una,
++				!skb_queue_empty(&sk->sk_write_queue));
++		printed = 1;
++		return;
++	}
++
++	tcp_send_active_reset(sk, GFP_ATOMIC);
++	switch (sk->sk_state) {
++		case TCP_FIN_WAIT1:
++		case TCP_CLOSING:
++			/* In these two states the peer may want us to
++			 * retransmit some data and/or a FIN.  Enter
++			 * "resetting mode" instead.
++			 */
++			tcp_time_wait(sk, TCP_CLOSE, 0);
++			break;
++		case TCP_FIN_WAIT2:
++			/* For some reason the socket may stay in this state
++			 * without turning into a TW bucket.  Fix it.
++			 */
++			tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
++			break;
++		case TCP_LAST_ACK:
++			/* Just jump into CLOSED state. */
++			tcp_done(sk);
++			break;
++		default:
++			/* The socket must be already close()d. */
++			goto invalid;
++	}
++}
++
++void tcp_v4_kill_ve_sockets(struct ve_struct *envid)
++{
++	struct inet_ehash_bucket *head;
++	int i;
++
++	/* alive */
++	local_bh_disable();
++	head = tcp_hashinfo.ehash;
++	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
++		struct sock *sk;
++		struct hlist_node *node;
++		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
++more_work:
++		write_lock(lock);
++		sk_for_each(sk, node, &head[i].chain) {
++			if (ve_accessible_strict(sk->owner_env, envid)) {
++				sock_hold(sk);
++				write_unlock(lock);
++
++				bh_lock_sock(sk);
++				/* sk might have disappeared from the hash before
++				 * we got the lock */
++				if (sk->sk_state != TCP_CLOSE)
++					tcp_kill_ve_onesk(sk);
++				bh_unlock_sock(sk);
++				sock_put(sk);
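++				/* The chain may have changed while the lock
++				 * was dropped; rescan this bucket from the
++				 * beginning. */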
++				goto more_work;
++			}
++		}
++		write_unlock(lock);
++	}
++	local_bh_enable();
++}
++EXPORT_SYMBOL(tcp_v4_kill_ve_sockets);
++#endif
++
+ EXPORT_SYMBOL(ipv4_specific);
+ EXPORT_SYMBOL(tcp_hashinfo);
+ EXPORT_SYMBOL(tcp_prot);
+diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+index 8245247..8bbda56 100644
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -28,6 +28,9 @@
+ #include <net/inet_common.h>
+ #include <net/xfrm.h>
+ 
++#include <bc/net.h>
++#include <bc/sock_orphan.h>
++
+ #ifdef CONFIG_SYSCTL
+ #define SYNC_INIT 0 /* let the user enable it */
+ #else
+@@ -38,6 +41,11 @@ int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
+ EXPORT_SYMBOL(sysctl_tcp_syncookies);
+ 
+ int sysctl_tcp_abort_on_overflow __read_mostly;
++int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384;
++int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536;
++
++EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction);
++EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub);
+ 
+ struct inet_timewait_death_row tcp_death_row = {
+ 	.sysctl_max_tw_buckets = NR_FILE * 2,
+@@ -53,6 +61,7 @@ struct inet_timewait_death_row tcp_death_row = {
+ 	.twcal_hand	= -1,
+ 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ 					    (unsigned long)&tcp_death_row),
++	.ub_managed	= 1,
+ };
+ 
+ EXPORT_SYMBOL_GPL(tcp_death_row);
+@@ -281,7 +290,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ 		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
+ 
+-	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
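++	/* Besides the global limit, allocation of a timewait bucket is
++	 * subject to the per-beancounter limits checked by
++	 * ub_timewait_check(). */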
++	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets &&
++			ub_timewait_check(sk, &tcp_death_row))
+ 		tw = inet_twsk_alloc(sk, state);
+ 
+ 	if (tw != NULL) {
+@@ -294,6 +304,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ 		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
+ 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
+ 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
++		if (sk->sk_user_data != NULL)
++			tw->tw_rcv_wscale |= TW_WSCALE_SPEC;
+ 
+ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ 		if (tw->tw_family == PF_INET6) {
+@@ -328,6 +340,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ 			}
+ 		} while (0);
+ #endif
++		tw->tw_owner_env = VEID(sk->owner_env);
+ 
+ 		/* Linkage updates. */
+ 		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+@@ -348,11 +361,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
+ 				   TCP_TIMEWAIT_LEN);
+ 		inet_twsk_put(tw);
+ 	} else {
++		int ubid = 0;
+ 		/* Sorry, if we're out of memory, just CLOSE this
+ 		 * socket up.  We've got bigger problems than
+ 		 * non-graceful socket closings.
+ 		 */
+-		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
++#ifdef CONFIG_BEANCOUNTERS
++		if (sock_has_ubc(sk))
++			ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid;
++#endif
++		LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid);
+ 	}
+ 
+ 	tcp_update_metrics(sk);
+@@ -393,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
+ 		struct tcp_sock *newtp;
+ 
+ 		/* Now setup tcp_sock */
++		newsk->owner_env = sk->owner_env;
++
+ 		newtp = tcp_sk(newsk);
+ 		newtp->pred_flags = 0;
+ 		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+index ad993ec..4459fd3 100644
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -41,6 +41,9 @@
+ #include <linux/compiler.h>
+ #include <linux/module.h>
+ 
++#include <bc/net.h>
++#include <bc/tcp.h>
++
+ /* People can turn this off for buggy TCP's found in printers etc. */
+ int sysctl_tcp_retrans_collapse __read_mostly = 1;
+ 
+@@ -455,6 +458,13 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
+ #endif
+ }
+ 
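++/* Worst-case number of bytes needed in front of the TCP payload: the
++ * TCP header plus the IPv4 header, IP options (if any) and the
++ * Ethernet hard header. */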
++static int skb_header_size(struct sock *sk, int tcp_hlen)
++{
++	struct ip_options *opt = inet_sk(sk)->opt;
++	return tcp_hlen + sizeof(struct iphdr) +
++		(opt ? opt->optlen : 0)	+ ETH_HLEN /* For hard header */;
++}
++
+ /* This routine actually transmits TCP packets queued in by
+  * tcp_do_sendmsg().  This is used by both the initial
+  * transmission and possible later retransmissions.
+@@ -474,6 +484,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ 	struct tcp_sock *tp;
+ 	struct tcp_skb_cb *tcb;
+ 	int tcp_header_size;
++	int header_size;
+ #ifdef CONFIG_TCP_MD5SIG
+ 	struct tcp_md5sig_key *md5;
+ 	__u8 *md5_hash_location;
+@@ -533,6 +544,20 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ 				     TCPOLEN_SACK_PERBLOCK));
+ 	}
+ 
++	/* Unfortunately, we can get an skb from the outside world here
++	 * with insufficient room for the headers. There is no way to
++	 * guess the required size when the skb is queued, so the
++	 * decision has to be made here. Den
++	 */
++	header_size = skb_header_size(sk, tcp_header_size);
++	if (skb->data - header_size < skb->head) {
++		int delta = header_size - skb_headroom(skb);
++		err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta),
++				0, GFP_ATOMIC);
++		if (err)
++			return err;
++	}
++
+ 	if (tcp_packets_in_flight(tp) == 0)
+ 		tcp_ca_event(sk, CA_EVENT_TX_START);
+ 
+@@ -706,15 +731,23 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ 	if (nsize < 0)
+ 		nsize = 0;
+ 
+-	if (skb_cloned(skb) &&
+-	    skb_is_nonlinear(skb) &&
+-	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+-		return -ENOMEM;
++	if (skb_cloned(skb) && skb_is_nonlinear(skb)) {
++		unsigned long chargesize;
++		chargesize = skb_bc(skb)->charged;
++		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++			return -ENOMEM;
++		ub_sock_tcp_unchargesend(sk, chargesize);
++		ub_tcpsndbuf_charge_forced(sk, skb);
++	}
+ 
+ 	/* Get a new skb... force flag on. */
+ 	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+ 	if (buff == NULL)
+ 		return -ENOMEM; /* We'll just try again later. */
++	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++		kfree_skb(buff);
++		return -ENOMEM;
++	}
+ 
+ 	sk->sk_wmem_queued += buff->truesize;
+ 	sk_mem_charge(sk, buff->truesize);
+@@ -1216,6 +1249,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+ 	if (unlikely(buff == NULL))
+ 		return -ENOMEM;
+ 
++	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++		kfree_skb(buff);
++		return -ENOMEM;
++	}
++
+ 	sk->sk_wmem_queued += buff->truesize;
+ 	sk_mem_charge(sk, buff->truesize);
+ 	buff->truesize += nlen;
+@@ -1651,7 +1689,7 @@ u32 __tcp_select_window(struct sock *sk)
+ 	if (free_space < (full_space >> 1)) {
+ 		icsk->icsk_ack.quick = 0;
+ 
+-		if (tcp_memory_pressure)
++		if (ub_tcp_shrink_rcvbuf(sk))
+ 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+ 					       4U * tp->advmss);
+ 
+@@ -2096,6 +2134,7 @@ void tcp_send_fin(struct sock *sk)
+ 				break;
+ 			yield();
+ 		}
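++		/* The FIN has to be queued even if the beancounter is over
++		 * its send-buffer limit, hence the forced charge. */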
++		ub_tcpsndbuf_charge_forced(sk, skb);
+ 
+ 		/* Reserve space for headers and prepare control bits. */
+ 		skb_reserve(skb, MAX_TCP_HEADER);
+@@ -2154,6 +2193,10 @@ int tcp_send_synack(struct sock *sk)
+ 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ 			if (nskb == NULL)
+ 				return -ENOMEM;
++			if (ub_tcpsndbuf_charge(sk, skb) < 0) {
++				kfree_skb(nskb);
++				return -ENOMEM;
++			}
+ 			tcp_unlink_write_queue(skb, sk);
+ 			skb_header_release(nskb);
+ 			__tcp_add_write_queue_head(sk, nskb);
+@@ -2282,6 +2325,7 @@ static void tcp_connect_init(struct sock *sk)
+ 	struct dst_entry *dst = __sk_dst_get(sk);
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	__u8 rcv_wscale;
++	static int once = 0;
+ 
+ 	/* We'll fix this up when we get a response from the other end.
+ 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+@@ -2301,9 +2345,23 @@ static void tcp_connect_init(struct sock *sk)
+ 	tcp_mtup_init(sk);
+ 	tcp_sync_mss(sk, dst_mtu(dst));
+ 
++	if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) {
++		once = 1;
++
++		printk("Oops in connect_init! dst->advmss=%d\n",
++						dst_metric(dst, RTAX_ADVMSS));
++		printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU));
++		printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, "
++				"advmss=%d, user_mss=%d\n",
++				sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss,
++				tp->mss_cache, tp->advmss, tp->rx_opt.user_mss);
++	}
++
+ 	if (!tp->window_clamp)
+ 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
++	if (tp->advmss == 0)
++		tp->advmss = 1460;
+ 	tcp_initialize_rcv_mss(sk);
+ 
+ 	tcp_select_initial_window(tcp_full_space(sk),
+@@ -2344,6 +2402,10 @@ int tcp_connect(struct sock *sk)
+ 	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ 	if (unlikely(buff == NULL))
+ 		return -ENOBUFS;
++	if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++		kfree_skb(buff);
++		return -ENOBUFS;
++	}
+ 
+ 	/* Reserve space for headers. */
+ 	skb_reserve(buff, MAX_TCP_HEADER);
+diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
+index 63ed9d6..2432a49 100644
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -22,6 +22,8 @@
+ 
+ #include <linux/module.h>
+ #include <net/tcp.h>
++#include <bc/sock_orphan.h>
++#include <bc/tcp.h>
+ 
+ int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
+ int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
+@@ -67,7 +69,8 @@ static void tcp_write_err(struct sock *sk)
+ static int tcp_out_of_resources(struct sock *sk, int do_reset)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+-	int orphans = atomic_read(&tcp_orphan_count);
++	int orphans = ub_get_orphan_count(sk);
++	int orph = orphans;
+ 
+ 	/* If peer does not open window for long time, or did not transmit
+ 	 * anything for long time, penalize it. */
+@@ -78,10 +81,16 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
+ 	if (sk->sk_err_soft)
+ 		orphans <<= 1;
+ 
+-	if (tcp_too_many_orphans(sk, orphans)) {
+-		if (net_ratelimit())
+-			printk(KERN_INFO "Out of socket memory\n");
+-
++	if (ub_too_many_orphans(sk, orphans)) {
++		if (net_ratelimit()) {
++			int ubid = 0;
++#ifdef CONFIG_USER_RESOURCE
++			ubid = sock_has_ubc(sk) ?
++				top_beancounter(sock_bc(sk)->ub)->ub_uid : 0;
++#endif
++			printk(KERN_INFO "Orphaned socket dropped "
++			       "(%d,%d in CT%d)\n", orph, orphans, ubid);
++		}
+ 		/* Catch exceptional cases, when connection requires reset.
+ 		 *      1. Last segment was sent recently. */
+ 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+@@ -174,9 +183,12 @@ static int tcp_write_timeout(struct sock *sk)
+ static void tcp_delack_timer(unsigned long data)
+ {
+ 	struct sock *sk = (struct sock*)data;
++	struct ve_struct *env;
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	struct inet_connection_sock *icsk = inet_csk(sk);
+ 
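++	/* Timers run in the host context: switch to the VE that owns the
++	 * socket for the duration of the handler and restore the old
++	 * context before returning. */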
++	env = set_exec_env(sk->owner_env);
++
+ 	bh_lock_sock(sk);
+ 	if (sock_owned_by_user(sk)) {
+ 		/* Try again later. */
+@@ -225,11 +237,12 @@ static void tcp_delack_timer(unsigned long data)
+ 	TCP_CHECK_TIMER(sk);
+ 
+ out:
+-	if (tcp_memory_pressure)
++	if (ub_tcp_memory_pressure(sk))
+ 		sk_mem_reclaim(sk);
+ out_unlock:
+ 	bh_unlock_sock(sk);
+ 	sock_put(sk);
++	(void)set_exec_env(env);
+ }
+ 
+ static void tcp_probe_timer(struct sock *sk)
+@@ -284,8 +297,11 @@ static void tcp_probe_timer(struct sock *sk)
+ static void tcp_retransmit_timer(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
++	struct ve_struct *env;
+ 	struct inet_connection_sock *icsk = inet_csk(sk);
+ 
++	env = set_exec_env(sk->owner_env);
++
+ 	if (!tp->packets_out)
+ 		goto out;
+ 
+@@ -390,15 +406,19 @@ out_reset_timer:
+ 	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
+ 		__sk_dst_reset(sk);
+ 
+-out:;
++out:
++	(void)set_exec_env(env);
+ }
+ 
+ static void tcp_write_timer(unsigned long data)
+ {
+ 	struct sock *sk = (struct sock*)data;
++	struct ve_struct *env;
+ 	struct inet_connection_sock *icsk = inet_csk(sk);
+ 	int event;
+ 
++	env = set_exec_env(sk->owner_env);
++
+ 	bh_lock_sock(sk);
+ 	if (sock_owned_by_user(sk)) {
+ 		/* Try again later */
+@@ -432,6 +452,7 @@ out:
+ out_unlock:
+ 	bh_unlock_sock(sk);
+ 	sock_put(sk);
++	(void)set_exec_env(env);
+ }
+ 
+ /*
+@@ -459,10 +480,13 @@ void tcp_set_keepalive(struct sock *sk, int val)
+ static void tcp_keepalive_timer (unsigned long data)
+ {
+ 	struct sock *sk = (struct sock *) data;
++	struct ve_struct *env;
+ 	struct inet_connection_sock *icsk = inet_csk(sk);
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	__u32 elapsed;
+ 
++	env = set_exec_env(sk->owner_env);
++
+ 	/* Only process if socket is not in use. */
+ 	bh_lock_sock(sk);
+ 	if (sock_owned_by_user(sk)) {
+@@ -534,4 +558,5 @@ death:
+ out:
+ 	bh_unlock_sock(sk);
+ 	sock_put(sk);
++	(void)set_exec_env(env);
+ }
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 56fcda3..2a1087b 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -159,7 +159,9 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ 	struct sock *sk2;
+ 	int    error = 1;
+ 	struct net *net = sock_net(sk);
++	struct ve_struct *ve;
+ 
++	ve = get_exec_env();
+ 	write_lock_bh(&udp_hash_lock);
+ 
+ 	if (!snum) {
+@@ -176,7 +178,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ 		for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+ 			int size = 0;
+ 
+-			head = &udptable[rover & (UDP_HTABLE_SIZE - 1)];
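++			/* The VE id is mixed into the hash so that sockets
++			 * bound to the same port in different containers
++			 * land in different chains. */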
++			head = &udptable[udp_hashfn(rover, VEID(ve))];
+ 			if (hlist_empty(head))
+ 				goto gotit;
+ 
+@@ -213,7 +215,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ gotit:
+ 		snum = rover;
+ 	} else {
+-		head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
++		head = &udptable[udp_hashfn(snum, VEID(ve))];
+ 
+ 		sk_for_each(sk2, node, head)
+ 			if (sk2->sk_hash == snum                             &&
+@@ -229,7 +231,7 @@ gotit:
+ 	inet_sk(sk)->num = snum;
+ 	sk->sk_hash = snum;
+ 	if (sk_unhashed(sk)) {
+-		head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
++		head = &udptable[udp_hashfn(snum, VEID(ve))];
+ 		sk_add_node(sk, head);
+ 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ 	}
+@@ -264,9 +266,11 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+ 	struct hlist_node *node;
+ 	unsigned short hnum = ntohs(dport);
+ 	int badness = -1;
++	struct ve_struct *ve;
+ 
++	ve = get_exec_env();
+ 	read_lock(&udp_hash_lock);
+-	sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
++	sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) {
+ 		struct inet_sock *inet = inet_sk(sk);
+ 
+ 		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+@@ -1070,7 +1074,8 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
+ 	int dif;
+ 
+ 	read_lock(&udp_hash_lock);
+-	sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
++	sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest),
++				VEID(skb->owner_env))]);
+ 	dif = skb->dev->ifindex;
+ 	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+ 	if (sk) {
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+index ff61a5c..5935c08 100644
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -388,9 +388,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
+ 	    dev->type == ARPHRD_TUNNEL6 ||
+ 	    dev->type == ARPHRD_SIT ||
+ 	    dev->type == ARPHRD_NONE) {
+-		printk(KERN_INFO
+-		       "%s: Disabled Privacy Extensions\n",
+-		       dev->name);
++		ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n",
++			dev->name));
+ 		ndev->cnf.use_tempaddr = -1;
+ 	} else {
+ 		in6_dev_hold(ndev);
+@@ -584,7 +583,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
+ 		goto out;
+ 	}
+ 
+-	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
++	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC);
+ 
+ 	if (ifa == NULL) {
+ 		ADBG(("ipv6_add_addr: malloc failed\n"));
+@@ -2025,7 +2024,7 @@ err_exit:
+ /*
+  *	Manual configuration of address on an interface
+  */
+-static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
++int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+ 			  unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
+ 			  __u32 valid_lft)
+ {
+@@ -2097,6 +2096,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+ 
+ 	return PTR_ERR(ifp);
+ }
++EXPORT_SYMBOL_GPL(inet6_addr_add);
+ 
+ static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
+ 			  unsigned int plen)
+@@ -2142,7 +2142,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
+ 	struct in6_ifreq ireq;
+ 	int err;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+@@ -2161,7 +2161,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
+ 	struct in6_ifreq ireq;
+ 	int err;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+@@ -2664,6 +2664,9 @@ static int addrconf_ifdown(struct net_device *dev, int how)
+ static void addrconf_rs_timer(unsigned long data)
+ {
+ 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
++	struct ve_struct *old_env;
++
++	old_env = set_exec_env(ifp->idev->dev->owner_env);
+ 
+ 	if (ifp->idev->cnf.forwarding)
+ 		goto out;
+@@ -2698,6 +2701,7 @@ static void addrconf_rs_timer(unsigned long data)
+ 
+ out:
+ 	in6_ifa_put(ifp);
++	(void)set_exec_env(old_env);
+ }
+ 
+ /*
+@@ -2773,7 +2777,9 @@ static void addrconf_dad_timer(unsigned long data)
+ 	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+ 	struct inet6_dev *idev = ifp->idev;
+ 	struct in6_addr mcaddr;
++	struct ve_struct *old_env;
+ 
++	old_env = set_exec_env(ifp->idev->dev->owner_env);
+ 	read_lock_bh(&idev->lock);
+ 	if (idev->dead) {
+ 		read_unlock_bh(&idev->lock);
+@@ -2804,6 +2810,7 @@ static void addrconf_dad_timer(unsigned long data)
+ 	ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+ out:
+ 	in6_ifa_put(ifp);
++	(void)set_exec_env(old_env);
+ }
+ 
+ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+@@ -3026,6 +3033,7 @@ static void addrconf_verify(unsigned long foo)
+ 	struct inet6_ifaddr *ifp;
+ 	unsigned long now, next;
+ 	int i;
++	struct ve_struct *old_env;
+ 
+ 	spin_lock_bh(&addrconf_verify_lock);
+ 	now = jiffies;
+@@ -3046,6 +3054,8 @@ restart:
+ 			if (ifp->flags & IFA_F_PERMANENT)
+ 				continue;
+ 
++			old_env = set_exec_env(ifp->idev->dev->owner_env);
++
+ 			spin_lock(&ifp->lock);
+ 			age = (now - ifp->tstamp) / HZ;
+ 
+@@ -3061,9 +3071,11 @@ restart:
+ 				in6_ifa_hold(ifp);
+ 				read_unlock(&addrconf_hash_lock);
+ 				ipv6_del_addr(ifp);
++				(void)set_exec_env(old_env);
+ 				goto restart;
+ 			} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
+ 				spin_unlock(&ifp->lock);
++				set_exec_env(old_env);
+ 				continue;
+ 			} else if (age >= ifp->prefered_lft) {
+ 				/* jiffies - ifp->tsamp > age >= ifp->prefered_lft */
+@@ -3085,6 +3097,7 @@ restart:
+ 
+ 					ipv6_ifa_notify(0, ifp);
+ 					in6_ifa_put(ifp);
++					(void)set_exec_env(old_env);
+ 					goto restart;
+ 				}
+ #ifdef CONFIG_IPV6_PRIVACY
+@@ -3106,6 +3119,7 @@ restart:
+ 						ipv6_create_tempaddr(ifpub, ifp);
+ 						in6_ifa_put(ifpub);
+ 						in6_ifa_put(ifp);
++						(void)set_exec_env(old_env);
+ 						goto restart;
+ 					}
+ 				} else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+@@ -3118,6 +3132,7 @@ restart:
+ 					next = ifp->tstamp + ifp->prefered_lft * HZ;
+ 				spin_unlock(&ifp->lock);
+ 			}
++			(void)set_exec_env(old_env);
+ 		}
+ 		read_unlock(&addrconf_hash_lock);
+ 	}
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index e84b3fd..10c74ae 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -58,6 +58,10 @@
+ #ifdef CONFIG_IPV6_TUNNEL
+ #include <net/ip6_tunnel.h>
+ #endif
++#ifdef CONFIG_IPV6_MIP6
++#include <net/mip6.h>
++#endif
++#include <bc/net.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -147,6 +151,10 @@ lookup_protocol:
+ 			goto out_rcu_unlock;
+ 	}
+ 
++	err = vz_security_protocol_check(answer->protocol);
++	if (err < 0)
++		goto out_rcu_unlock;
++
+ 	err = -EPERM;
+ 	if (answer->capability > 0 && !capable(answer->capability))
+ 		goto out_rcu_unlock;
+@@ -164,6 +172,13 @@ lookup_protocol:
+ 	if (sk == NULL)
+ 		goto out;
+ 
++	err = -ENOBUFS;
++	if (ub_sock_charge(sk, PF_INET6, sock->type))
++		goto out_sk_free;
++	/* If the charge was successful, sock_init_data() MUST be called to
++	 * set sk->sk_type; otherwise sk will be uncharged against the
++	 * wrong resource.
++	 */
++
+ 	sock_init_data(sock, sk);
+ 
+ 	err = 0;
+@@ -238,6 +253,9 @@ out:
+ out_rcu_unlock:
+ 	rcu_read_unlock();
+ 	goto out;
++out_sk_free:
++	sk_free(sk);
++	return err;
+ }
+ 
+ 
+@@ -803,45 +821,48 @@ static void ipv6_packet_cleanup(void)
+ 	dev_remove_pack(&ipv6_packet_type);
+ }
+ 
+-static int __init init_ipv6_mibs(void)
++int init_ipv6_mibs(void)
+ {
+-	if (snmp_mib_init((void **)ipv6_statistics,
++	if (snmp_mib_init((void **)ve_ipv6_statistics,
+ 			  sizeof(struct ipstats_mib)) < 0)
+ 		goto err_ip_mib;
+-	if (snmp_mib_init((void **)icmpv6_statistics,
++	if (snmp_mib_init((void **)ve_icmpv6_statistics,
+ 			  sizeof(struct icmpv6_mib)) < 0)
+ 		goto err_icmp_mib;
+-	if (snmp_mib_init((void **)icmpv6msg_statistics,
++	if (snmp_mib_init((void **)ve_icmpv6msg_statistics,
+ 			  sizeof(struct icmpv6msg_mib)) < 0)
+ 		goto err_icmpmsg_mib;
+-	if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
++	if (snmp_mib_init((void **)ve_udp_stats_in6,
++			  sizeof (struct udp_mib)) < 0)
+ 		goto err_udp_mib;
+-	if (snmp_mib_init((void **)udplite_stats_in6,
++	if (snmp_mib_init((void **)ve_udplite_stats_in6,
+ 			  sizeof (struct udp_mib)) < 0)
+ 		goto err_udplite_mib;
+ 	return 0;
+ 
+ err_udplite_mib:
+-	snmp_mib_free((void **)udp_stats_in6);
++	snmp_mib_free((void **)ve_udp_stats_in6);
+ err_udp_mib:
+-	snmp_mib_free((void **)icmpv6msg_statistics);
++	snmp_mib_free((void **)ve_icmpv6msg_statistics);
+ err_icmpmsg_mib:
+-	snmp_mib_free((void **)icmpv6_statistics);
++	snmp_mib_free((void **)ve_icmpv6_statistics);
+ err_icmp_mib:
+-	snmp_mib_free((void **)ipv6_statistics);
++	snmp_mib_free((void **)ve_ipv6_statistics);
+ err_ip_mib:
+ 	return -ENOMEM;
+ 
+ }
++EXPORT_SYMBOL(init_ipv6_mibs);
+ 
+-static void cleanup_ipv6_mibs(void)
++void cleanup_ipv6_mibs(void)
+ {
+-	snmp_mib_free((void **)ipv6_statistics);
+-	snmp_mib_free((void **)icmpv6_statistics);
+-	snmp_mib_free((void **)icmpv6msg_statistics);
+-	snmp_mib_free((void **)udp_stats_in6);
+-	snmp_mib_free((void **)udplite_stats_in6);
++	snmp_mib_free((void **)ve_ipv6_statistics);
++	snmp_mib_free((void **)ve_icmpv6_statistics);
++	snmp_mib_free((void **)ve_icmpv6msg_statistics);
++	snmp_mib_free((void **)ve_udp_stats_in6);
++	snmp_mib_free((void **)ve_udplite_stats_in6);
+ }
++EXPORT_SYMBOL(cleanup_ipv6_mibs);
+ 
+ static int inet6_net_init(struct net *net)
+ {
+diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
+index 580014a..f099a61 100644
+--- a/net/ipv6/inet6_hashtables.c
++++ b/net/ipv6/inet6_hashtables.c
+@@ -68,7 +68,8 @@ struct sock *__inet6_lookup_established(struct net *net,
+ 	/* Optimize here for direct hit, only listening connections can
+ 	 * have wildcards anyways.
+ 	 */
+-	unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
++	struct ve_struct *env = get_exec_env();
++	unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env));
+ 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ 	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+ 
+@@ -102,9 +103,10 @@ struct sock *inet6_lookup_listener(struct net *net,
+ 	const struct hlist_node *node;
+ 	struct sock *result = NULL;
+ 	int score, hiscore = 0;
++	struct ve_struct *ve = get_exec_env();
+ 
+ 	read_lock(&hashinfo->lhash_lock);
+-	sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
++	sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(ve))]) {
+ 		if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
+ 				sk->sk_family == PF_INET6) {
+ 			const struct ipv6_pinfo *np = inet6_sk(sk);
+@@ -156,7 +158,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
+ 
+ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ 				     struct sock *sk, const __u16 lport,
+-				     struct inet_timewait_sock **twp)
++				     struct inet_timewait_sock **twp,
++				     struct ve_struct *ve)
+ {
+ 	struct inet_hashinfo *hinfo = death_row->hashinfo;
+ 	struct inet_sock *inet = inet_sk(sk);
+@@ -166,7 +169,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ 	const int dif = sk->sk_bound_dev_if;
+ 	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+ 	const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
+-						inet->dport);
++						inet->dport, VEID(ve));
+ 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ 	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
+ 	struct sock *sk2;
+diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
+index 1ee4fa1..45acc3e 100644
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -184,11 +184,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
+ 
+ 	h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
+ 
+-	/*
+-	 * No protection necessary, this is the only list mutatation
+-	 * operation, tables never disappear once they exist.
+-	 */
++	write_lock_bh(&tb->tb6_lock);
+ 	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
++	write_unlock_bh(&tb->tb6_lock);
+ }
+ 
+ #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+@@ -1370,10 +1368,14 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
+ 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ 		head = &net->ipv6.fib_table_hash[h];
+ 		hlist_for_each_entry_rcu(table, node, head, tb6_hlist) {
++			struct ve_struct *old_env;
++
++			old_env = set_exec_env(table->owner_env);
+ 			write_lock_bh(&table->tb6_lock);
+ 			fib6_clean_tree(net, &table->tb6_root,
+ 					func, prune, arg);
+ 			write_unlock_bh(&table->tb6_lock);
++			(void)set_exec_env(old_env);
+ 		}
+ 	}
+ 	rcu_read_unlock();
+@@ -1506,6 +1508,9 @@ static int fib6_net_init(struct net *net)
+ 	if (!net->ipv6.fib6_main_tbl)
+ 		goto out_fib_table_hash;
+ 
++#ifdef CONFIG_VE
++	net->ipv6.fib6_main_tbl->owner_env = get_exec_env();
++#endif
+ 	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+ 	net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ 	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
+@@ -1516,6 +1521,10 @@ static int fib6_net_init(struct net *net)
+ 					   GFP_KERNEL);
+ 	if (!net->ipv6.fib6_local_tbl)
+ 		goto out_fib6_main_tbl;
++
++#ifdef CONFIG_VE
++	net->ipv6.fib6_local_tbl->owner_env = get_exec_env();
++#endif
+ 	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+ 	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
+@@ -1564,7 +1573,7 @@ int __init fib6_init(void)
+ 
+ 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
+ 					   sizeof(struct fib6_node),
+-					   0, SLAB_HWCACHE_ALIGN,
++					   0, SLAB_HWCACHE_ALIGN|SLAB_UBC,
+ 					   NULL);
+ 	if (!fib6_node_kmem)
+ 		goto out;
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index 48cdce9..0976ff5 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -516,6 +516,20 @@ int ip6_forward(struct sk_buff *skb)
+ 		return -EMSGSIZE;
+ 	}
+ 
++	/*
++	 * We try to optimize forwarding of VE packets: do not decrement
++	 * the TTL (and so save the skb_cow()) when forwarding outgoing
++	 * packets that originate from a VE.  For incoming packets we
++	 * still decrement the TTL, since such an skb is not cloned and
++	 * does not require an actual cow.  Thus there is at least one
++	 * place on every packet path with a mandatory TTL decrement,
++	 * which is sufficient to prevent routing loops.
++	 */
++	hdr = ipv6_hdr(skb);
++	if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */
++		goto no_ttl_decr;
++
+ 	if (skb_cow(skb, dst->dev->hard_header_len)) {
+ 		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+ 		goto drop;
+@@ -527,6 +541,7 @@ int ip6_forward(struct sk_buff *skb)
+ 
+ 	hdr->hop_limit--;
+ 
++no_ttl_decr:
+ 	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ 	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
+ 		       ip6_forward_finish);
+diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
+index fd632dd..a78e9cb 100644
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -246,6 +246,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+ 
+ 	return 0;
+ }
++EXPORT_SYMBOL_GPL(ipv6_sock_mc_join);
+ 
+ /*
+  *	socket leave on multicast group
+@@ -2202,15 +2203,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma)
+ static void mld_gq_timer_expire(unsigned long data)
+ {
+ 	struct inet6_dev *idev = (struct inet6_dev *)data;
++	struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
+ 
+ 	idev->mc_gq_running = 0;
+ 	mld_send_report(idev, NULL);
+ 	__in6_dev_put(idev);
++	set_exec_env(old_env);
+ }
+ 
+ static void mld_ifc_timer_expire(unsigned long data)
+ {
+ 	struct inet6_dev *idev = (struct inet6_dev *)data;
++	struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
+ 
+ 	mld_send_cr(idev);
+ 	if (idev->mc_ifc_count) {
+@@ -2219,6 +2223,7 @@ static void mld_ifc_timer_expire(unsigned long data)
+ 			mld_ifc_start_timer(idev, idev->mc_maxdelay);
+ 	}
+ 	__in6_dev_put(idev);
++	set_exec_env(old_env);
+ }
+ 
+ static void mld_ifc_event(struct inet6_dev *idev)
+@@ -2233,6 +2238,7 @@ static void mld_ifc_event(struct inet6_dev *idev)
+ static void igmp6_timer_handler(unsigned long data)
+ {
+ 	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
++	struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env);
+ 
+ 	if (MLD_V1_SEEN(ma->idev))
+ 		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+@@ -2244,6 +2250,7 @@ static void igmp6_timer_handler(unsigned long data)
+ 	ma->mca_flags &= ~MAF_TIMER_RUNNING;
+ 	spin_unlock(&ma->mca_lock);
+ 	ma_put(ma);
++	set_exec_env(old_env);
+ }
+ 
+ /* Device going down */
+diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
+index 2eff3ae..8753e85 100644
+--- a/net/ipv6/netfilter/ip6_queue.c
++++ b/net/ipv6/netfilter/ip6_queue.c
+@@ -442,7 +442,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
+ 	if (type <= IPQM_BASE)
+ 		return;
+ 
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		RCV_SKB_FAIL(-EPERM);
+ 
+ 	write_lock_bh(&queue_lock);
+@@ -472,8 +472,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
+ static void
+ ipq_rcv_skb(struct sk_buff *skb)
+ {
++	struct ve_struct *old_ve;
++
+ 	mutex_lock(&ipqnl_mutex);
++	old_ve = set_exec_env(skb->owner_env);
+ 	__ipq_rcv_skb(skb);
++	(void)set_exec_env(old_ve);
+ 	mutex_unlock(&ipqnl_mutex);
+ }
+ 
+@@ -483,9 +487,6 @@ ipq_rcv_dev_event(struct notifier_block *this,
+ {
+ 	struct net_device *dev = ptr;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	/* Drop any packets associated with the downed device */
+ 	if (event == NETDEV_DOWN)
+ 		ipq_dev_drop(dev->ifindex);
+@@ -505,7 +506,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
+ 	if (event == NETLINK_URELEASE &&
+ 	    n->protocol == NETLINK_IP6_FW && n->pid) {
+ 		write_lock_bh(&queue_lock);
+-		if ((n->net == &init_net) && (n->pid == peer_pid))
++		if (n->pid == peer_pid)
+ 			__ipq_reset();
+ 		write_unlock_bh(&queue_lock);
+ 	}
+diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
+index 0b4557e..0b6f441 100644
+--- a/net/ipv6/netfilter/ip6_tables.c
++++ b/net/ipv6/netfilter/ip6_tables.c
+@@ -1874,7 +1874,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -1985,7 +1985,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+ {
+ 	int ret;
+ 
+-	if (!capable(CAP_NET_ADMIN))
++	if (!capable(CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	switch (cmd) {
+@@ -2084,7 +2084,7 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *table,
+ 	int ret;
+ 	struct xt_table_info *newinfo;
+ 	struct xt_table_info bootstrap
+-		= { 0, 0, 0, { 0 }, { 0 }, { } };
++		= { 0, 0, 0, 0, { 0 }, { 0 }, { } };
+ 	void *loc_cpu_entry;
+ 	struct xt_table *new_table;
+ 
+@@ -2241,11 +2241,22 @@ static struct xt_match icmp6_matchstruct __read_mostly = {
+ 
+ static int __net_init ip6_tables_net_init(struct net *net)
+ {
+-	return xt_proto_init(net, AF_INET6);
++	int res;
++
++	if (!net_ipt_module_permitted(net, VE_IP_IPTABLES6))
++		return 0;
++
++	res = xt_proto_init(net, AF_INET6);
++	if (!res)
++		net_ipt_module_set(net, VE_IP_IPTABLES6);
++	return res;
+ }
+ 
+ static void __net_exit ip6_tables_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6))
++		return;
++
+ 	xt_proto_fini(net, AF_INET6);
+ }
+ 
+diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
+index f979e48..d03046a 100644
+--- a/net/ipv6/netfilter/ip6table_filter.c
++++ b/net/ipv6/netfilter/ip6table_filter.c
+@@ -120,16 +120,24 @@ module_param(forward, bool, 0000);
+ 
+ static int __net_init ip6table_filter_net_init(struct net *net)
+ {
++	if (!net_ipt_module_permitted(net, VE_IP_FILTER6))
++		return 0;
++
+ 	/* Register table */
+ 	net->ipv6.ip6table_filter =
+ 		ip6t_register_table(net, &packet_filter, &initial_table.repl);
+ 	if (IS_ERR(net->ipv6.ip6table_filter))
+ 		return PTR_ERR(net->ipv6.ip6table_filter);
++
++	net_ipt_module_set(net, VE_IP_FILTER6);
+ 	return 0;
+ }
+ 
+ static void __net_exit ip6table_filter_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_FILTER6))
++		return;
++
+ 	ip6t_unregister_table(net->ipv6.ip6table_filter);
+ }
+ 
+diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
+index f405cea..a4727b3 100644
+--- a/net/ipv6/netfilter/ip6table_mangle.c
++++ b/net/ipv6/netfilter/ip6table_mangle.c
+@@ -160,16 +160,24 @@ static struct nf_hook_ops ip6t_ops[] __read_mostly = {
+ 
+ static int __net_init ip6table_mangle_net_init(struct net *net)
+ {
++	if (!net_ipt_module_permitted(net, VE_IP_MANGLE6))
++		return 0;
++
+ 	/* Register table */
+ 	net->ipv6.ip6table_mangle =
+ 		ip6t_register_table(net, &packet_mangler, &initial_table.repl);
+ 	if (IS_ERR(net->ipv6.ip6table_mangle))
+ 		return PTR_ERR(net->ipv6.ip6table_mangle);
++
++	net_ipt_module_set(net, VE_IP_MANGLE6);
+ 	return 0;
+ }
+ 
+ static void __net_exit ip6table_mangle_net_exit(struct net *net)
+ {
++	if (!net_is_ipt_module_set(net, VE_IP_MANGLE6))
++		return;
++
+ 	ip6t_unregister_table(net->ipv6.ip6table_mangle);
+ }
+ 
+diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+index 85050c0..e6f8f7d 100644
+--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
++++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+@@ -14,6 +14,7 @@
+ #include <linux/in6.h>
+ #include <linux/netfilter.h>
+ #include <linux/module.h>
++#include <linux/nfcalls.h>
+ #include <linux/skbuff.h>
+ #include <linux/icmp.h>
+ #include <linux/sysctl.h>
+@@ -359,39 +360,52 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai at toshiba.co.jp>");
+ 
+-static int __init nf_conntrack_l3proto_ipv6_init(void)
++int init_nf_ct_l3proto_ipv6(void)
+ {
+-	int ret = 0;
+-
+-	need_conntrack();
+-
++	int ret = -ENOMEM;
++
++#ifdef CONFIG_VE_IPTABLES
++	if (!ve_is_super(get_exec_env())) 
++		__module_get(THIS_MODULE);
++
++	ret = nf_ct_proto_tcp_sysctl_init();
++	if (ret < 0)
++		goto no_mem_tcp;
++	ret = nf_ct_proto_udp_sysctl_init();
++	if (ret < 0)
++		goto no_mem_udp;
++	ret = nf_ct_proto_icmpv6_sysctl_init();
++	if (ret < 0)
++		goto no_mem_icmp;
++#endif /* CONFIG_VE_IPTABLES */
+ 	ret = nf_ct_frag6_init();
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't initialize frag6.\n");
+-		return ret;
++		goto cleanup_sys;
+ 	}
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
++
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp6);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't register tcp.\n");
+ 		goto cleanup_frag6;
+ 	}
+ 
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp6);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't register udp.\n");
+-		goto cleanup_tcp;
++		goto unreg_tcp;
+ 	}
+ 
+-	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6);
++	ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmpv6);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't register icmpv6.\n");
+-		goto cleanup_udp;
++		goto unreg_udp;
+ 	}
+ 
+-	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
++	ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv6);
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't register ipv6\n");
+-		goto cleanup_icmpv6;
++		goto unreg_icmpv6;
+ 	}
+ 
+ 	ret = nf_register_hooks(ipv6_conntrack_ops,
+@@ -399,32 +413,77 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
+ 	if (ret < 0) {
+ 		printk("nf_conntrack_ipv6: can't register pre-routing defrag "
+ 		       "hook.\n");
+-		goto cleanup_ipv6;
++		goto unreg_ipv6;
+ 	}
+-	return ret;
++	return 0;
+ 
+- cleanup_ipv6:
+-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+- cleanup_icmpv6:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+- cleanup_udp:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+- cleanup_tcp:
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+- cleanup_frag6:
++unreg_ipv6:
++	nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6);
++unreg_icmpv6:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6);
++unreg_udp:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6);
++unreg_tcp:
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6);
++cleanup_frag6:
+ 	nf_ct_frag6_cleanup();
++cleanup_sys:
++#ifdef CONFIG_VE_IPTABLES
++no_mem_icmp:
++	nf_ct_proto_udp_sysctl_cleanup();
++no_mem_udp:
++	nf_ct_proto_tcp_sysctl_cleanup();
++no_mem_tcp:
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++#endif /* CONFIG_VE_IPTABLES */
+ 	return ret;
+ }
++EXPORT_SYMBOL(init_nf_ct_l3proto_ipv6);
+ 
+-static void __exit nf_conntrack_l3proto_ipv6_fini(void)
++void fini_nf_ct_l3proto_ipv6(void)
+ {
+-	synchronize_net();
+ 	nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+-	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+-	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
++	nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6);
++	nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6);
+ 	nf_ct_frag6_cleanup();
++
++#ifdef CONFIG_VE_IPTABLES
++	nf_ct_proto_icmpv6_sysctl_cleanup();
++	nf_ct_proto_udp_sysctl_cleanup();
++	nf_ct_proto_tcp_sysctl_cleanup();
++	if (!ve_is_super(get_exec_env()))
++		module_put(THIS_MODULE);
++#endif /* CONFIG_VE_IPTABLES */
++}
++EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv6);
++
++static int __init nf_conntrack_l3proto_ipv6_init(void)
++{
++	int ret = 0;
++
++	need_conntrack();
++
++	ret = init_nf_ct_l3proto_ipv6();
++	if (ret < 0) {
++		printk(KERN_ERR "Unable to initialize netfilter protocols\n");
++		return ret;
++	}
++	KSYMRESOLVE(init_nf_ct_l3proto_ipv6);
++	KSYMRESOLVE(fini_nf_ct_l3proto_ipv6);
++	KSYMMODRESOLVE(nf_conntrack_ipv6);
++	return 0;
++}
++
++static void __exit nf_conntrack_l3proto_ipv6_fini(void)
++{
++	synchronize_net();
++	KSYMMODUNRESOLVE(nf_conntrack_ipv6);
++	KSYMUNRESOLVE(init_nf_ct_l3proto_ipv6);
++	KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv6);
++	fini_nf_ct_l3proto_ipv6();
+ }
+ 
+ module_init(nf_conntrack_l3proto_ipv6_init);
+diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+index ee713b0..cae064f 100644
+--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
++++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+@@ -10,6 +10,7 @@
+  */
+ 
+ #include <linux/types.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+ #include <linux/module.h>
+ #include <linux/netfilter.h>
+@@ -95,7 +96,7 @@ static int icmpv6_packet(struct nf_conn *ct,
+ 	} else {
+ 		atomic_inc(&ct->proto.icmp.count);
+ 		nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+-		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
++		nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmpv6_timeout);
+ 	}
+ 
+ 	return NF_ACCEPT;
+@@ -150,7 +151,7 @@ icmpv6_error_message(struct sk_buff *skb,
+ 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+ 	   been preserved inside the ICMP. */
+ 	if (!nf_ct_invert_tuple(&intuple, &origtuple,
+-				&nf_conntrack_l3proto_ipv6, inproto)) {
++				ve_nf_conntrack_l3proto_ipv6, inproto)) {
+ 		pr_debug("icmpv6_error: Can't invert tuple\n");
+ 		return -NF_ACCEPT;
+ 	}
+@@ -282,3 +283,48 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
+ 	.ctl_table		= icmpv6_sysctl_table,
+ #endif
+ };
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++int nf_ct_proto_icmpv6_sysctl_init(void)
++{
++	struct nf_conntrack_l4proto *icmp6;
++
++	if (ve_is_super(get_exec_env())) {
++		icmp6 = &nf_conntrack_l4proto_icmpv6;
++		goto out;
++	}
++
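++	/* Inside a container, duplicate the protocol descriptor and its
++	 * sysctl table so that each VE can tune its own ICMPv6 timeout
++	 * independently. */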
++	icmp6 = kmemdup(&nf_conntrack_l4proto_icmpv6,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (!icmp6)
++		goto no_mem_ct;
++
++	icmp6->ctl_table_header = &ve_icmpv6_sysctl_header;
++	icmp6->ctl_table = kmemdup(icmpv6_sysctl_table,
++			sizeof(icmpv6_sysctl_table), GFP_KERNEL);
++	if (!icmp6->ctl_table)
++		goto no_mem_sys;
++
++	icmp6->ctl_table[0].data = &ve_nf_ct_icmpv6_timeout;
++out:
++	ve_nf_ct_icmpv6_timeout = nf_ct_icmpv6_timeout;
++
++	ve_nf_conntrack_l4proto_icmpv6 = icmp6;
++	return 0;
++
++no_mem_sys:
++	kfree(icmp6);
++no_mem_ct:
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_init);
++
++void nf_ct_proto_icmpv6_sysctl_cleanup(void)
++{
++	if (!ve_is_super(get_exec_env())) {
++		kfree(ve_nf_conntrack_l4proto_icmpv6->ctl_table);
++		kfree(ve_nf_conntrack_l4proto_icmpv6);
++	}
++}
++EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_cleanup);
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
+diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
+index cf20bc4..9faaa59 100644
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -145,11 +145,12 @@ static void nf_skb_free(struct sk_buff *skb)
+ }
+ 
+ /* Memory Tracking Functions. */
+-static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
++static inline void frag_kfree_skb(struct netns_frags *nf,
++		struct sk_buff *skb, unsigned int *work)
+ {
+ 	if (work)
+ 		*work -= skb->truesize;
+-	atomic_sub(skb->truesize, &nf_init_frags.mem);
++	atomic_sub(skb->truesize, &nf->mem);
+ 	nf_skb_free(skb);
+ 	kfree_skb(skb);
+ }
+@@ -169,10 +170,10 @@ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
+ 	inet_frag_kill(&fq->q, &nf_frags);
+ }
+ 
+-static void nf_ct_frag6_evictor(void)
++static void nf_ct_frag6_evictor(struct netns_frags *nf)
+ {
+ 	local_bh_disable();
+-	inet_frag_evictor(&nf_init_frags, &nf_frags);
++	inet_frag_evictor(nf, &nf_frags);
+ 	local_bh_enable();
+ }
+ 
+@@ -198,7 +199,7 @@ out:
+ /* Creation primitives. */
+ 
+ static __inline__ struct nf_ct_frag6_queue *
+-fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst)
++fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst)
+ {
+ 	struct inet_frag_queue *q;
+ 	struct ip6_create_arg arg;
+@@ -211,7 +212,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst)
+ 	read_lock_bh(&nf_frags.lock);
+ 	hash = ip6qhashfn(id, src, dst);
+ 
+-	q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
++	q = inet_frag_find(&net->ipv6.ct_frags, &nf_frags, &arg, hash);
+ 	local_bh_enable();
+ 	if (q == NULL)
+ 		goto oom;
+@@ -224,7 +225,8 @@ oom:
+ }
+ 
+ 
+-static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
++static int nf_ct_frag6_queue(struct net *net, struct nf_ct_frag6_queue *fq,
++		struct sk_buff *skb,
+ 			     const struct frag_hdr *fhdr, int nhoff)
+ {
+ 	struct sk_buff *prev, *next;
+@@ -365,7 +367,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+ 				fq->q.fragments = next;
+ 
+ 			fq->q.meat -= free_it->len;
+-			frag_kfree_skb(free_it, NULL);
++			frag_kfree_skb(fq->q.net, free_it, NULL);
+ 		}
+ 	}
+ 
+@@ -381,7 +383,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+ 	skb->dev = NULL;
+ 	fq->q.stamp = skb->tstamp;
+ 	fq->q.meat += skb->len;
+-	atomic_add(skb->truesize, &nf_init_frags.mem);
++	atomic_add(skb->truesize, &net->ipv6.ct_frags.mem);
+ 
+ 	/* The first fragment.
+ 	 * nhoffset is obtained from the first fragment, of course.
+@@ -391,7 +393,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+ 		fq->q.last_in |= INET_FRAG_FIRST_IN;
+ 	}
+ 	write_lock(&nf_frags.lock);
+-	list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list);
++	list_move_tail(&fq->q.lru_list, &net->ipv6.ct_frags.lru_list);
+ 	write_unlock(&nf_frags.lock);
+ 	return 0;
+ 
+@@ -409,7 +411,8 @@ err:
+  *	the last and the first frames arrived and all the bits are here.
+  */
+ static struct sk_buff *
+-nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
++nf_ct_frag6_reasm(struct net *net, struct nf_ct_frag6_queue *fq,
++		struct net_device *dev)
+ {
+ 	struct sk_buff *fp, *op, *head = fq->q.fragments;
+ 	int    payload_len;
+@@ -458,7 +461,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+ 		clone->ip_summed = head->ip_summed;
+ 
+ 		NFCT_FRAG6_CB(clone)->orig = NULL;
+-		atomic_add(clone->truesize, &nf_init_frags.mem);
++		atomic_add(clone->truesize, &net->ipv6.ct_frags.mem);
+ 	}
+ 
+ 	/* We have to remove fragment header from datagram and to relocate
+@@ -472,7 +475,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+ 	skb_shinfo(head)->frag_list = head->next;
+ 	skb_reset_transport_header(head);
+ 	skb_push(head, head->data - skb_network_header(head));
+-	atomic_sub(head->truesize, &nf_init_frags.mem);
++	atomic_sub(head->truesize, &net->ipv6.ct_frags.mem);
+ 
+ 	for (fp=head->next; fp; fp = fp->next) {
+ 		head->data_len += fp->len;
+@@ -482,7 +485,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+ 		else if (head->ip_summed == CHECKSUM_COMPLETE)
+ 			head->csum = csum_add(head->csum, fp->csum);
+ 		head->truesize += fp->truesize;
+-		atomic_sub(fp->truesize, &nf_init_frags.mem);
++		atomic_sub(fp->truesize, &net->ipv6.ct_frags.mem);
+ 	}
+ 
+ 	head->next = NULL;
+@@ -599,6 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+ 	int fhoff, nhoff;
+ 	u8 prevhdr;
+ 	struct sk_buff *ret_skb = NULL;
++	struct net *net = dev_net(dev);
+ 
+ 	/* Jumbo payload inhibits frag. header */
+ 	if (ipv6_hdr(skb)->payload_len == 0) {
+@@ -632,10 +636,11 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+ 		goto ret_orig;
+ 	}
+ 
+-	if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh)
+-		nf_ct_frag6_evictor();
++	if (atomic_read(&net->ipv6.ct_frags.mem) >
++			net->ipv6.ct_frags.high_thresh)
++		nf_ct_frag6_evictor(&net->ipv6.ct_frags);
+ 
+-	fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr);
++	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr);
+ 	if (fq == NULL) {
+ 		pr_debug("Can't find and can't create new queue\n");
+ 		goto ret_orig;
+@@ -643,7 +648,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+ 
+ 	spin_lock_bh(&fq->q.lock);
+ 
+-	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
++	if (nf_ct_frag6_queue(net, fq, clone, fhdr, nhoff) < 0) {
+ 		spin_unlock_bh(&fq->q.lock);
+ 		pr_debug("Can't insert skb to queue\n");
+ 		fq_put(fq);
+@@ -652,7 +657,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+ 
+ 	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ 	    fq->q.meat == fq->q.len) {
+-		ret_skb = nf_ct_frag6_reasm(fq, dev);
++		ret_skb = nf_ct_frag6_reasm(net, fq, dev);
+ 		if (ret_skb == NULL)
+ 			pr_debug("Can't reassemble fragmented packets\n");
+ 	}
+@@ -687,8 +692,32 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
+ 	nf_conntrack_put_reasm(skb);
+ }
+ 
++static int nf_ct_frag6_init_net(struct net *net)
++{
++	struct netns_frags *frags = &net->ipv6.ct_frags;
++
++	frags->timeout = IPV6_FRAG_TIMEOUT;
++	frags->high_thresh = 256 * 1024;
++	frags->low_thresh = 192 * 1024;
++	inet_frags_init_net(frags);
++
++	return 0; /* FIXME : sysctls */
++}
++
++static void nf_ct_frag6_exit_net(struct net *net)
++{
++	inet_frags_exit_net(&net->ipv6.ct_frags, &nf_frags);
++}
++
++static struct pernet_operations nf_ct_frag6_ops = {
++	.init = nf_ct_frag6_init_net,
++	.exit = nf_ct_frag6_exit_net,
++};
++
+ int nf_ct_frag6_init(void)
+ {
++	register_pernet_subsys(&nf_ct_frag6_ops);
++
+ 	nf_frags.hashfn = nf_hashfn;
+ 	nf_frags.constructor = ip6_frag_init;
+ 	nf_frags.destructor = NULL;
+@@ -697,10 +726,6 @@ int nf_ct_frag6_init(void)
+ 	nf_frags.match = ip6_frag_match;
+ 	nf_frags.frag_expire = nf_ct_frag6_expire;
+ 	nf_frags.secret_interval = 10 * 60 * HZ;
+-	nf_init_frags.timeout = IPV6_FRAG_TIMEOUT;
+-	nf_init_frags.high_thresh = 256 * 1024;
+-	nf_init_frags.low_thresh = 192 * 1024;
+-	inet_frags_init_net(&nf_init_frags);
+ 	inet_frags_init(&nf_frags);
+ 
+ 	return 0;
+@@ -709,7 +734,5 @@ int nf_ct_frag6_init(void)
+ void nf_ct_frag6_cleanup(void)
+ {
+ 	inet_frags_fini(&nf_frags);
+-
+-	nf_init_frags.low_thresh = 0;
+-	nf_ct_frag6_evictor();
++	unregister_pernet_subsys(&nf_ct_frag6_ops);
+ }
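
The reasm conversion above is the stock pernet pattern: the single global
nf_init_frags moves into struct net (as net->ipv6.ct_frags) and is set up
and torn down once per namespace. A minimal sketch of the pattern, with a
hypothetical "foo" subsystem; unlike the hunk above, the sketch also
propagates the return value of register_pernet_subsys(), which can fail:

	#include <linux/init.h>
	#include <linux/module.h>
	#include <net/net_namespace.h>

	static int foo_init_net(struct net *net)
	{
		/* set up this namespace's private copy of the state */
		return 0;
	}

	static void foo_exit_net(struct net *net)
	{
		/* tear down this namespace's private state */
	}

	static struct pernet_operations foo_net_ops = {
		.init = foo_init_net,
		.exit = foo_exit_net,
	};

	static int __init foo_init(void)
	{
		/* ->init runs for init_net and for every later namespace */
		return register_pernet_subsys(&foo_net_ops);
	}

	static void __exit foo_exit(void)
	{
		unregister_pernet_subsys(&foo_net_ops);
	}
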
+diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
+index df0736a..ea78916 100644
+--- a/net/ipv6/proc.c
++++ b/net/ipv6/proc.c
+@@ -31,8 +31,6 @@
+ #include <net/transp_v6.h>
+ #include <net/ipv6.h>
+ 
+-static struct proc_dir_entry *proc_net_devsnmp6;
+-
+ static int sockstat6_seq_show(struct seq_file *seq, void *v)
+ {
+ 	struct net *net = seq->private;
+@@ -174,11 +172,11 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
+ 		snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
+ 		snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg);
+ 	} else {
+-		snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
+-		snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
+-		snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics);
+-		snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
+-		snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
++		snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list);
++		snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list);
++		snmp6_seq_show_icmpv6msg(seq, (void **)ve_icmpv6msg_statistics);
++		snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list);
++		snmp6_seq_show_item(seq, (void **)ve_udplite_stats_in6, snmp6_udplite6_list);
+ 	}
+ 	return 0;
+ }
+@@ -237,18 +235,17 @@ static const struct file_operations snmp6_seq_fops = {
+ int snmp6_register_dev(struct inet6_dev *idev)
+ {
+ 	struct proc_dir_entry *p;
++	struct net *net;
+ 
+ 	if (!idev || !idev->dev)
+ 		return -EINVAL;
+ 
+-	if (dev_net(idev->dev) != &init_net)
+-		return 0;
+-
+-	if (!proc_net_devsnmp6)
++	net = dev_net(idev->dev);
++	if (!net->ipv6.proc_dev_snmp)
+ 		return -ENOENT;
+ 
+ 	p = proc_create_data(idev->dev->name, S_IRUGO,
+-			     proc_net_devsnmp6, &snmp6_seq_fops, idev);
++			     net->ipv6.proc_dev_snmp, &snmp6_seq_fops, idev);
+ 	if (!p)
+ 		return -ENOMEM;
+ 
+@@ -258,12 +255,14 @@ int snmp6_register_dev(struct inet6_dev *idev)
+ 
+ int snmp6_unregister_dev(struct inet6_dev *idev)
+ {
+-	if (!proc_net_devsnmp6)
++	struct net *net = dev_net(idev->dev);
++
++	if (!net->ipv6.proc_dev_snmp)
+ 		return -ENOENT;
+ 	if (!idev || !idev->stats.proc_dir_entry)
+ 		return -EINVAL;
+ 	remove_proc_entry(idev->stats.proc_dir_entry->name,
+-			  proc_net_devsnmp6);
++			  net->ipv6.proc_dev_snmp);
+ 	idev->stats.proc_dir_entry = NULL;
+ 	return 0;
+ }
+@@ -272,12 +271,24 @@ static int ipv6_proc_init_net(struct net *net)
+ {
+ 	if (!proc_net_fops_create(net, "sockstat6", S_IRUGO,
+ 			&sockstat6_seq_fops))
+-		return -ENOMEM;
++		goto err_sockstat;
++
++	net->ipv6.proc_dev_snmp = proc_net_mkdir(net,
++			"dev_snmp6", net->proc_net);
++	if (!net->ipv6.proc_dev_snmp)
++		goto err_dev_snmp;
++
+ 	return 0;
++
++err_dev_snmp:
++	proc_net_remove(net, "sockstat6");
++err_sockstat:
++	return -ENOMEM;
+ }
+ 
+ static void ipv6_proc_exit_net(struct net *net)
+ {
++	proc_net_remove(net, "dev_snmp6");
+ 	proc_net_remove(net, "sockstat6");
+ }
+ 
+@@ -296,14 +307,9 @@ int __init ipv6_misc_proc_init(void)
+ 	if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops))
+ 		goto proc_snmp6_fail;
+ 
+-	proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net);
+-	if (!proc_net_devsnmp6)
+-		goto proc_dev_snmp6_fail;
+ out:
+ 	return rc;
+ 
+-proc_dev_snmp6_fail:
+-	proc_net_remove(&init_net, "snmp6");
+ proc_snmp6_fail:
+ 	unregister_pernet_subsys(&ipv6_proc_ops);
+ proc_net_fail:
+@@ -314,7 +320,6 @@ proc_net_fail:
+ void ipv6_misc_proc_exit(void)
+ {
+ 	proc_net_remove(&init_net, "sockstat6");
+-	proc_net_remove(&init_net, "dev_snmp6");
+ 	proc_net_remove(&init_net, "snmp6");
+ 	unregister_pernet_subsys(&ipv6_proc_ops);
+ }
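
The proc.c hunks apply the same idea to procfs: the dev_snmp6 directory
becomes per-net state, created in the pernet ->init and removed in ->exit,
with the error path unwinding in reverse creation order. The pairing as a
short sketch (the entry name and my_stats_fops are illustrative):

	static int my_proc_init_net(struct net *net)
	{
		/* created under this namespace's /proc/net */
		if (!proc_net_fops_create(net, "my_stats", S_IRUGO,
					  &my_stats_fops))
			return -ENOMEM;
		return 0;
	}

	static void my_proc_exit_net(struct net *net)
	{
		proc_net_remove(net, "my_stats");
	}
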
+diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
+index a60d7d1..408859e 100644
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -198,8 +198,10 @@ static void ip6_frag_expire(unsigned long data)
+ 	struct frag_queue *fq;
+ 	struct net_device *dev = NULL;
+ 	struct net *net;
++	struct ve_struct *old_ve;
+ 
+ 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
++	old_ve = set_exec_env(fq->q.owner_ve);
+ 
+ 	spin_lock(&fq->q.lock);
+ 
+@@ -234,6 +236,8 @@ out:
+ 		dev_put(dev);
+ 	spin_unlock(&fq->q.lock);
+ 	fq_put(fq);
++
++	(void)set_exec_env(old_ve);
+ }
+ 
+ static __inline__ struct frag_queue *
+@@ -510,6 +514,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+ 		clone->csum = 0;
+ 		clone->ip_summed = head->ip_summed;
+ 		atomic_add(clone->truesize, &fq->q.net->mem);
++		clone->owner_env = head->owner_env;
+ 	}
+ 
+ 	/* We have to remove fragment header from datagram and to relocate
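
The reassembly.c hunks above add the OpenVZ context switch to the fragment
timer: a timer fires in whatever VE context the CPU happens to be in, so
the handler must switch to the queue owner's VE and restore on exit. The
idiom in isolation (set_exec_env() is the OpenVZ helper used above; the
queue type and owner_ve field are illustrative):

	static void my_frag_expire(unsigned long data)
	{
		struct my_queue *q = (struct my_queue *)data;
		struct ve_struct *old_ve;

		/* run the expiry in the owning VE's context */
		old_ve = set_exec_env(q->owner_ve);

		/* ... timeout handling that reads per-VE state ... */

		/* restore the previous context before returning */
		(void)set_exec_env(old_ve);
	}
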
+diff --git a/net/ipv6/route.c b/net/ipv6/route.c
+index 7ff6870..4d83e48 100644
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -1881,10 +1881,12 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
+ 		rt->rt6i_flags |= RTF_ANYCAST;
+ 	else
+ 		rt->rt6i_flags |= RTF_LOCAL;
+-	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+-	if (rt->rt6i_nexthop == NULL) {
+-		dst_free(&rt->u.dst);
+-		return ERR_PTR(-ENOMEM);
++	rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
++	if (IS_ERR(rt->rt6i_nexthop)) {
++		void *err = rt->rt6i_nexthop;
++		rt->rt6i_nexthop = NULL;
++		dst_free(&rt->u.dst);
++		return err;
+ 	}
+ 
+ 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index 40ea9c3..cdc8697 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -62,6 +62,8 @@
+ #include <net/netdma.h>
+ #include <net/inet_common.h>
+ 
++#include <bc/tcp.h>
++
+ #include <asm/uaccess.h>
+ 
+ #include <linux/proc_fs.h>
+@@ -77,7 +79,7 @@ static void	tcp_v6_send_check(struct sock *sk, int len,
+ 
+ static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+ 
+-static struct inet_connection_sock_af_ops ipv6_mapped;
++struct inet_connection_sock_af_ops ipv6_mapped;
+ static struct inet_connection_sock_af_ops ipv6_specific;
+ #ifdef CONFIG_TCP_MD5SIG
+ static struct tcp_sock_af_ops tcp_sock_ipv6_specific;
+@@ -1580,6 +1582,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 	struct ipv6_pinfo *np = inet6_sk(sk);
+ 	struct tcp_sock *tp;
+ 	struct sk_buff *opt_skb = NULL;
++	struct user_beancounter *ub;
+ 
+ 	/* Imagine: socket is IPv6. IPv4 packet arrives,
+ 	   goes to IPv4 receive handler and backlogged.
+@@ -1592,6 +1595,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 	if (skb->protocol == htons(ETH_P_IP))
+ 		return tcp_v4_do_rcv(sk, skb);
+ 
++	ub = set_exec_ub(sock_bc(sk)->ub);
++
+ #ifdef CONFIG_TCP_MD5SIG
+ 	if (tcp_v6_inbound_md5_hash (sk, skb))
+ 		goto discard;
+@@ -1628,7 +1633,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 		TCP_CHECK_TIMER(sk);
+ 		if (opt_skb)
+ 			goto ipv6_pktoptions;
+-		return 0;
++		goto restore_context;
+ 	}
+ 
+ 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+@@ -1649,7 +1654,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 				goto reset;
+ 			if (opt_skb)
+ 				__kfree_skb(opt_skb);
+-			return 0;
++			goto restore_context;
+ 		}
+ 	}
+ 
+@@ -1659,6 +1664,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+ 	TCP_CHECK_TIMER(sk);
+ 	if (opt_skb)
+ 		goto ipv6_pktoptions;
++
++restore_context:
++	(void)set_exec_ub(ub);
+ 	return 0;
+ 
+ reset:
+@@ -1667,7 +1675,7 @@ discard:
+ 	if (opt_skb)
+ 		__kfree_skb(opt_skb);
+ 	kfree_skb(skb);
+-	return 0;
++	goto restore_context;
+ csum_err:
+ 	TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ 	goto discard;
+@@ -1699,7 +1707,7 @@ ipv6_pktoptions:
+ 
+ 	if (opt_skb)
+ 		kfree_skb(opt_skb);
+-	return 0;
++	goto restore_context;
+ }
+ 
+ static int tcp_v6_rcv(struct sk_buff *skb)
+@@ -1881,7 +1889,7 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+  *	TCP over IPv4 via INET6 API
+  */
+ 
+-static struct inet_connection_sock_af_ops ipv6_mapped = {
++struct inet_connection_sock_af_ops ipv6_mapped = {
+ 	.queue_xmit	   = ip_queue_xmit,
+ 	.send_check	   = tcp_v4_send_check,
+ 	.rebuild_header	   = inet_sk_rebuild_header,
+@@ -1900,6 +1908,8 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
+ #endif
+ };
+ 
++EXPORT_SYMBOL_GPL(ipv6_mapped);
++
+ #ifdef CONFIG_TCP_MD5SIG
+ static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+ 	.md5_lookup	=	tcp_v4_md5_lookup,
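
The tcp_v6_do_rcv() changes above follow a single-exit discipline: the
user_beancounter is switched once on entry, every "return 0" becomes a
goto to one restore label, and even the discard path funnels through it.
Reduced to its shape (a sketch only; set_exec_ub() and sock_bc() are the
OpenVZ helpers from the hunk, the length check is a stand-in):

	static int my_do_rcv(struct sock *sk, struct sk_buff *skb)
	{
		struct user_beancounter *ub;
		int ret = 0;

		/* charge the rest of the receive path to the socket owner */
		ub = set_exec_ub(sock_bc(sk)->ub);

		if (skb->len == 0)	/* stand-in for the real checks */
			goto restore_context;

		/* ... receive processing ... */

	restore_context:
		(void)set_exec_ub(ub);	/* single restore point */
		return ret;
	}
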
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index dd30962..d7b151d 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -65,9 +65,11 @@ static struct sock *__udp6_lib_lookup(struct net *net,
+ 	struct hlist_node *node;
+ 	unsigned short hnum = ntohs(dport);
+ 	int badness = -1;
++	struct ve_struct *ve;
+ 
++	ve = get_exec_env();
+ 	read_lock(&udp_hash_lock);
+-	sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
++	sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) {
+ 		struct inet_sock *inet = inet_sk(sk);
+ 
+ 		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+@@ -363,7 +365,7 @@ static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr,
+ 	int dif;
+ 
+ 	read_lock(&udp_hash_lock);
+-	sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
++	sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest), VEID(skb->owner_env))]);
+ 	dif = inet6_iif(skb);
+ 	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+ 	if (!sk) {
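
Both UDP lookup paths above switch from the plain port hash to
udp_hashfn(hnum, VEID(...)), folding the container id into the bucket
index so that two VEs binding the same port hash into different chains.
A hypothetical sketch of such a hash (the real helper may mix the values
differently):

	static inline unsigned int ve_udp_hash(unsigned int hnum,
					       unsigned int veid)
	{
		/* same-port sockets of different VEs land in
		 * different buckets */
		return (hnum + veid) & (UDP_HTABLE_SIZE - 1);
	}
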
+diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
+index 8f1e054..e32613a 100644
+--- a/net/ipv6/xfrm6_policy.c
++++ b/net/ipv6/xfrm6_policy.c
+@@ -14,6 +14,7 @@
+ #include <linux/err.h>
+ #include <linux/kernel.h>
+ #include <linux/netdevice.h>
++#include <linux/nsproxy.h>
+ #include <net/addrconf.h>
+ #include <net/dst.h>
+ #include <net/xfrm.h>
+@@ -38,7 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr,
+ 	if (saddr)
+ 		memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src));
+ 
+-	dst = ip6_route_output(&init_net, NULL, &fl);
++	dst = ip6_route_output(get_exec_env()->ve_netns, NULL, &fl);
+ 
+ 	err = dst->error;
+ 	if (dst->error) {
+diff --git a/net/netfilter/core.c b/net/netfilter/core.c
+index 292fa28..6bf46b5 100644
+--- a/net/netfilter/core.c
++++ b/net/netfilter/core.c
+@@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops *reg)
+ 	struct nf_hook_ops *elem;
+ 	int err;
+ 
++	BUG_ON(!ve_is_super(get_exec_env()));
++
+ 	err = mutex_lock_interruptible(&nf_hook_mutex);
+ 	if (err < 0)
+ 		return err;
+@@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook);
+ 
+ void nf_unregister_hook(struct nf_hook_ops *reg)
+ {
++	BUG_ON(!ve_is_super(get_exec_env()));
++
+ 	mutex_lock(&nf_hook_mutex);
+ 	list_del_rcu(&reg->list);
+ 	mutex_unlock(&nf_hook_mutex);
+@@ -169,8 +173,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
+ 	struct net *net;
+ 
+ 	net = indev == NULL ? dev_net(outdev) : dev_net(indev);
+-	if (net != &init_net)
+-		return 1;
+ #endif
+ 
+ 	/* We may already have this, but read-locks nest anyway */
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index 662c1cc..e811c0b 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -30,6 +30,8 @@
+ #include <linux/socket.h>
+ #include <linux/mm.h>
+ 
++#include <net/sock.h>
++
+ #include <net/netfilter/nf_conntrack.h>
+ #include <net/netfilter/nf_conntrack_l3proto.h>
+ #include <net/netfilter/nf_conntrack_l4proto.h>
+@@ -53,8 +55,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
+ int nf_conntrack_max __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_max);
+ 
++#ifndef CONFIG_VE_IPTABLES
+ struct hlist_head *nf_conntrack_hash __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
++#endif
+ 
+ struct nf_conn nf_conntrack_untracked __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
+@@ -179,7 +183,14 @@ static void
+ destroy_conntrack(struct nf_conntrack *nfct)
+ {
+ 	struct nf_conn *ct = (struct nf_conn *)nfct;
++	struct nf_conn_help *help = nfct_help(ct);
++	struct nf_conntrack_l3proto *l3proto;
+ 	struct nf_conntrack_l4proto *l4proto;
++#ifdef CONFIG_VE_IPTABLES
++	struct ve_struct *old_ve;
++
++	old_ve = set_exec_env(ct->ct_owner_env);
++#endif
+ 
+ 	pr_debug("destroy_conntrack(%p)\n", ct);
+ 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+@@ -188,10 +199,17 @@ destroy_conntrack(struct nf_conntrack *nfct)
+ 	nf_conntrack_event(IPCT_DESTROY, ct);
+ 	set_bit(IPS_DYING_BIT, &ct->status);
+ 
++	if (help && help->helper && help->helper->destroy)
++		help->helper->destroy(ct);
++
+ 	/* To make sure we don't get any weird locking issues here:
+ 	 * destroy_conntrack() MUST NOT be called with a write lock
+ 	 * to nf_conntrack_lock!!! -HW */
+ 	rcu_read_lock();
++	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
++	if (l3proto && l3proto->destroy)
++		l3proto->destroy(ct);
++
+ 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ 	if (l4proto && l4proto->destroy)
+ 		l4proto->destroy(ct);
+@@ -219,6 +237,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
+ 
+ 	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ 	nf_conntrack_free(ct);
++#ifdef CONFIG_VE_IPTABLES
++	(void)set_exec_env(old_ve);
++#endif
+ }
+ 
+ static void death_by_timeout(unsigned long ul_conntrack)
+@@ -255,7 +276,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple)
+ 	 * at least once for the stats anyway.
+ 	 */
+ 	local_bh_disable();
+-	hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) {
++	hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) {
+ 		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
+ 			NF_CT_STAT_INC(found);
+ 			local_bh_enable();
+@@ -294,9 +315,9 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
+ 				       unsigned int repl_hash)
+ {
+ 	hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
+-			   &nf_conntrack_hash[hash]);
++			   &ve_nf_conntrack_hash[hash]);
+ 	hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode,
+-			   &nf_conntrack_hash[repl_hash]);
++			   &ve_nf_conntrack_hash[repl_hash]);
+ }
+ 
+ void nf_conntrack_hash_insert(struct nf_conn *ct)
+@@ -350,11 +371,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
+ 	/* See if there's one in the list already, including reverse:
+ 	   NAT could have grabbed it without realizing, since we're
+ 	   not in the hash.  If there is, we lost race. */
+-	hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode)
++	hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode)
+ 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ 				      &h->tuple))
+ 			goto out;
+-	hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode)
++	hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[repl_hash], hnode)
+ 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ 				      &h->tuple))
+ 			goto out;
+@@ -405,7 +426,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+ 	 * least once for the stats anyway.
+ 	 */
+ 	rcu_read_lock_bh();
+-	hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) {
++	hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) {
+ 		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
+ 		    nf_ct_tuple_equal(tuple, &h->tuple)) {
+ 			NF_CT_STAT_INC(found);
+@@ -435,7 +456,7 @@ static noinline int early_drop(unsigned int hash)
+ 
+ 	rcu_read_lock();
+ 	for (i = 0; i < nf_conntrack_htable_size; i++) {
+-		hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash],
++		hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash],
+ 					 hnode) {
+ 			tmp = nf_ct_tuplehash_to_ctrack(h);
+ 			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+@@ -464,9 +485,11 @@ static noinline int early_drop(unsigned int hash)
+ }
+ 
+ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+-				   const struct nf_conntrack_tuple *repl)
++				   const struct nf_conntrack_tuple *repl,
++				   struct user_beancounter *ub)
+ {
+ 	struct nf_conn *ct = NULL;
++	struct user_beancounter *old_ub;
+ 
+ 	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
+ 		get_random_bytes(&nf_conntrack_hash_rnd, 4);
+@@ -474,25 +497,28 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+ 	}
+ 
+ 	/* We don't want any race condition at early drop stage */
+-	atomic_inc(&nf_conntrack_count);
++	atomic_inc(&ve_nf_conntrack_count);
+ 
+-	if (nf_conntrack_max &&
+-	    unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) {
++	if (ve_nf_conntrack_max &&
++	    unlikely(atomic_read(&ve_nf_conntrack_count) >
++			ve_nf_conntrack_max)) {
+ 		unsigned int hash = hash_conntrack(orig);
+ 		if (!early_drop(hash)) {
+-			atomic_dec(&nf_conntrack_count);
++			atomic_dec(&ve_nf_conntrack_count);
+ 			if (net_ratelimit())
+-				printk(KERN_WARNING
+-				       "nf_conntrack: table full, dropping"
+-				       " packet.\n");
++				ve_printk(VE_LOG_BOTH, KERN_WARNING
++				       "nf_conntrack: CT %d: table full, dropping"
++				       " packet.\n", VEID(get_exec_env()));
+ 			return ERR_PTR(-ENOMEM);
+ 		}
+ 	}
+ 
++	old_ub = set_exec_ub(ub);
+ 	ct = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC);
++	(void)set_exec_ub(old_ub);
+ 	if (ct == NULL) {
+ 		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
+-		atomic_dec(&nf_conntrack_count);
++		atomic_dec(&ve_nf_conntrack_count);
+ 		return ERR_PTR(-ENOMEM);
+ 	}
+ 
+@@ -502,6 +528,9 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+ 	/* Don't set timer yet: wait for confirmation */
+ 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
+ 	INIT_RCU_HEAD(&ct->rcu);
++#ifdef CONFIG_VE_IPTABLES
++	ct->ct_owner_env = get_exec_env();
++#endif
+ 
+ 	return ct;
+ }
+@@ -513,7 +542,7 @@ static void nf_conntrack_free_rcu(struct rcu_head *head)
+ 
+ 	nf_ct_ext_free(ct);
+ 	kmem_cache_free(nf_conntrack_cachep, ct);
+-	atomic_dec(&nf_conntrack_count);
++	atomic_dec(&ve_nf_conntrack_count);
+ }
+ 
+ void nf_conntrack_free(struct nf_conn *ct)
+@@ -536,13 +565,20 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
+ 	struct nf_conn_help *help;
+ 	struct nf_conntrack_tuple repl_tuple;
+ 	struct nf_conntrack_expect *exp;
++	struct user_beancounter *ub = NULL;
+ 
+ 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+ 		pr_debug("Can't invert tuple.\n");
+ 		return NULL;
+ 	}
+ 
+-	ct = nf_conntrack_alloc(tuple, &repl_tuple);
++#ifdef CONFIG_BEANCOUNTERS
++	if (skb->dev != NULL)  /* received skb */
++		ub = netdev_bc(skb->dev)->exec_ub;
++	else if (skb->sk != NULL) /* sent skb */
++		ub = sock_bc(skb->sk)->ub;
++#endif
++	ct = nf_conntrack_alloc(tuple, &repl_tuple, ub);
+ 	if (ct == NULL || IS_ERR(ct)) {
+ 		pr_debug("Can't allocate conntrack.\n");
+ 		return (struct nf_conntrack_tuple_hash *)ct;
+@@ -589,7 +625,8 @@ init_conntrack(const struct nf_conntrack_tuple *tuple,
+ 	}
+ 
+ 	/* Overload tuple linked list to put us in unconfirmed list. */
+-	hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed);
++	hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode,
++		       &ve_unconfirmed);
+ 
+ 	spin_unlock_bh(&nf_conntrack_lock);
+ 
+@@ -918,13 +955,13 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
+ 
+ 	spin_lock_bh(&nf_conntrack_lock);
+ 	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+-		hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) {
++		hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[*bucket], hnode) {
+ 			ct = nf_ct_tuplehash_to_ctrack(h);
+ 			if (iter(ct, data))
+ 				goto found;
+ 		}
+ 	}
+-	hlist_for_each_entry(h, n, &unconfirmed, hnode) {
++	hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) {
+ 		ct = nf_ct_tuplehash_to_ctrack(h);
+ 		if (iter(ct, data))
+ 			set_bit(IPS_DYING_BIT, &ct->status);
+@@ -979,7 +1016,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush);
+    supposed to kill the mall. */
+ void nf_conntrack_cleanup(void)
+ {
+-	rcu_assign_pointer(ip_ct_attach, NULL);
++	struct ve_struct *ve = get_exec_env();
++
++	if (ve_is_super(ve))
++		rcu_assign_pointer(ip_ct_attach, NULL);
+ 
+ 	/* This makes sure all current packets have passed through
+ 	   netfilter framework.  Roll on, two-stage module
+@@ -989,10 +1029,12 @@ void nf_conntrack_cleanup(void)
+ 	nf_ct_event_cache_flush();
+  i_see_dead_people:
+ 	nf_conntrack_flush();
+-	if (atomic_read(&nf_conntrack_count) != 0) {
++	if (atomic_read(&ve_nf_conntrack_count) != 0) {
+ 		schedule();
+ 		goto i_see_dead_people;
+ 	}
++	if (!ve_is_super(ve))
++		goto skip_ct_cache;
+ 	/* wait until all references to nf_conntrack_untracked are dropped */
+ 	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ 		schedule();
+@@ -1000,12 +1042,17 @@ void nf_conntrack_cleanup(void)
+ 	rcu_assign_pointer(nf_ct_destroy, NULL);
+ 
+ 	kmem_cache_destroy(nf_conntrack_cachep);
+-	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
+-			     nf_conntrack_htable_size);
+-
+-	nf_conntrack_proto_fini();
++skip_ct_cache:
+ 	nf_conntrack_helper_fini();
+ 	nf_conntrack_expect_fini();
++
++	nf_conntrack_proto_fini();
++	nf_ct_proto_generic_sysctl_cleanup();
++	nf_ct_free_hashtable(ve_nf_conntrack_hash, ve_nf_conntrack_vmalloc,
++			nf_conntrack_htable_size);
++#ifdef CONFIG_VE_IPTABLES
++	kfree(ve->_nf_conntrack);
++#endif
+ }
+ 
+ struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced)
+@@ -1016,13 +1063,13 @@ struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced)
+ 	*vmalloced = 0;
+ 
+ 	size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head));
+-	hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN,
++	hash = (void*)__get_free_pages(GFP_KERNEL_UBC|__GFP_NOWARN,
+ 				       get_order(sizeof(struct hlist_head)
+ 						 * size));
+ 	if (!hash) {
+ 		*vmalloced = 1;
+ 		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+-		hash = vmalloc(sizeof(struct hlist_head) * size);
++		hash = ub_vmalloc(sizeof(struct hlist_head) * size);
+ 	}
+ 
+ 	if (hash)
+@@ -1064,8 +1111,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+ 	 */
+ 	spin_lock_bh(&nf_conntrack_lock);
+ 	for (i = 0; i < nf_conntrack_htable_size; i++) {
+-		while (!hlist_empty(&nf_conntrack_hash[i])) {
+-			h = hlist_entry(nf_conntrack_hash[i].first,
++		while (!hlist_empty(&ve_nf_conntrack_hash[i])) {
++			h = hlist_entry(ve_nf_conntrack_hash[i].first,
+ 					struct nf_conntrack_tuple_hash, hnode);
+ 			hlist_del_rcu(&h->hnode);
+ 			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+@@ -1073,12 +1120,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+ 		}
+ 	}
+ 	old_size = nf_conntrack_htable_size;
+-	old_vmalloced = nf_conntrack_vmalloc;
+-	old_hash = nf_conntrack_hash;
++	old_vmalloced = ve_nf_conntrack_vmalloc;
++	old_hash = ve_nf_conntrack_hash;
+ 
+ 	nf_conntrack_htable_size = hashsize;
+-	nf_conntrack_vmalloc = vmalloced;
+-	nf_conntrack_hash = hash;
++	ve_nf_conntrack_vmalloc = vmalloced;
++	ve_nf_conntrack_hash = hash;
+ 	nf_conntrack_hash_rnd = rnd;
+ 	spin_unlock_bh(&nf_conntrack_lock);
+ 
+@@ -1090,53 +1137,82 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
+ module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ 		  &nf_conntrack_htable_size, 0600);
+ 
+-int __init nf_conntrack_init(void)
++int nf_conntrack_init(void)
+ {
++	struct ve_struct *ve = get_exec_env();
+ 	int max_factor = 8;
+-	int ret;
++	int ret = 0, i;
++
++	if (ve_is_super(ve)) {
++
++		/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
++		 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
++		if (!nf_conntrack_htable_size) {
++			nf_conntrack_htable_size
++				= (((num_physpages << PAGE_SHIFT) / 16384)
++						/ sizeof(struct hlist_head));
++			if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
++				nf_conntrack_htable_size = 16384;
++			if (nf_conntrack_htable_size < 32)
++				nf_conntrack_htable_size = 32;
++
++			/* Use a max. factor of four by default to get the same
++			 * max as with the old struct list_heads. When a table
++			 * size is given we use the old value of 8 to avoid
++			 * reducing the max. entries. */
++			max_factor = 4;
++		}
++		nf_conntrack_max = max_factor * nf_conntrack_htable_size;
++
++		printk("nf_conntrack version %s (%u buckets, %d max)\n",
++			NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
++			nf_conntrack_max);
++	}
+ 
+-	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
+-	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
+-	if (!nf_conntrack_htable_size) {
+-		nf_conntrack_htable_size
+-			= (((num_physpages << PAGE_SHIFT) / 16384)
+-			   / sizeof(struct hlist_head));
+-		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+-			nf_conntrack_htable_size = 16384;
+-		if (nf_conntrack_htable_size < 32)
+-			nf_conntrack_htable_size = 32;
+-
+-		/* Use a max. factor of four by default to get the same max as
+-		 * with the old struct list_heads. When a table size is given
+-		 * we use the old value of 8 to avoid reducing the max.
+-		 * entries. */
+-		max_factor = 4;
++#ifdef CONFIG_VE_IPTABLES
++	ve->_nf_conntrack = kzalloc(sizeof(struct ve_nf_conntrack), GFP_KERNEL);
++	if (!ve->_nf_conntrack) {
++		ret = -ENOMEM;
++		goto out;
+ 	}
+-	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
+-						  &nf_conntrack_vmalloc);
+-	if (!nf_conntrack_hash) {
++
++	ve_nf_conntrack_max = nf_conntrack_max;
++	ve_nf_conntrack_checksum = nf_conntrack_checksum;
++	ve_nf_ct_expect_max = nf_ct_expect_max;
++	atomic_set(&ve_nf_conntrack_count, 0);
++	INIT_HLIST_HEAD(&ve_unconfirmed);
++#endif
++	ve_nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
++						  &ve_nf_conntrack_vmalloc);
++	if (!ve_nf_conntrack_hash) {
+ 		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+ 		goto err_out;
+ 	}
+ 
+-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+-
+-	printk("nf_conntrack version %s (%u buckets, %d max)\n",
+-	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
+-	       nf_conntrack_max);
+-
+-	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
++	if (ve_is_super(ve)) {
++		nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
+ 						sizeof(struct nf_conn),
+-						0, 0, NULL);
+-	if (!nf_conntrack_cachep) {
+-		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+-		goto err_free_hash;
++						0, SLAB_UBC, NULL);
++		if (!nf_conntrack_cachep) {
++			printk(KERN_ERR "Unable to create nf_conn slab cache\n");
++			goto err_free_hash;
++		}
+ 	}
+ 
+-	ret = nf_conntrack_proto_init();
++	ret = nf_ct_proto_generic_sysctl_init();
+ 	if (ret < 0)
+ 		goto err_free_conntrack_slab;
+ 
++	ret = nf_conntrack_proto_init();
++	if (ret < 0)
++		goto err_generic_proto;
++
++	/* Don't NEED lock here, but good form anyway. */
++	spin_lock_bh(&nf_conntrack_lock);
++	for (i = 0; i < AF_MAX; i++)
++		ve_nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic;
++	spin_unlock_bh(&nf_conntrack_lock);
++
+ 	ret = nf_conntrack_expect_init();
+ 	if (ret < 0)
+ 		goto out_fini_proto;
+@@ -1145,27 +1221,36 @@ int __init nf_conntrack_init(void)
+ 	if (ret < 0)
+ 		goto out_fini_expect;
+ 
+-	/* For use by REJECT target */
+-	rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
+-	rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
++	if (ve_is_super(ve)) {
++		/* For use by REJECT target */
++		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
++		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+ 
+-	/* Set up fake conntrack:
+-	    - to never be deleted, not in any hashes */
+-	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+-	/*  - and look it like as a confirmed connection */
+-	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
++		/* Set up fake conntrack:
++		   - to never be deleted, not in any hashes */
++		atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
++		/*  - and look it like as a confirmed connection */
++		set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
++	}
+ 
+-	return ret;
++	return 0;
+ 
+ out_fini_expect:
+ 	nf_conntrack_expect_fini();
+ out_fini_proto:
+ 	nf_conntrack_proto_fini();
++err_generic_proto:
++	nf_ct_proto_generic_sysctl_cleanup();
+ err_free_conntrack_slab:
+-	kmem_cache_destroy(nf_conntrack_cachep);
++	if (ve_is_super(ve))
++		kmem_cache_destroy(nf_conntrack_cachep);
+ err_free_hash:
+-	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc,
++	nf_ct_free_hashtable(ve_nf_conntrack_hash, ve_nf_conntrack_vmalloc,
+ 			     nf_conntrack_htable_size);
+ err_out:
+-	return -ENOMEM;
++#ifdef CONFIG_VE_IPTABLES
++	kfree(ve->_nf_conntrack);
++out:
++#endif
++	return ret ? ret : -ENOMEM;
+ }
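
Nearly every hunk in nf_conntrack_core.c swaps a global for a ve_-prefixed
accessor. These compile down to either a per-VE field or the old global,
depending on CONFIG_VE_IPTABLES, so the !VE build is unchanged. The general
shape, with illustrative names (the nf_conntrack_expect.c hunk below spells
out a real instance):

	#ifdef CONFIG_VE_IPTABLES
	/* per-VE: dereference the calling context's private copy */
	#define ve_foo_hash	(get_exec_env()->_nf_conntrack->_foo_hash)
	#else
	/* !VE: fall back to the single host-wide object */
	static struct hlist_head *foo_hash;
	#define ve_foo_hash	foo_hash
	#endif
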
+diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
+index 83c41ac..d0ddfb6 100644
+--- a/net/netfilter/nf_conntrack_ecache.c
++++ b/net/netfilter/nf_conntrack_ecache.c
+@@ -53,6 +53,9 @@ void nf_ct_deliver_cached_events(const struct nf_conn *ct)
+ {
+ 	struct nf_conntrack_ecache *ecache;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return;
++
+ 	local_bh_disable();
+ 	ecache = &__get_cpu_var(nf_conntrack_ecache);
+ 	if (ecache->ct == ct)
+@@ -66,6 +69,9 @@ void __nf_ct_event_cache_init(struct nf_conn *ct)
+ {
+ 	struct nf_conntrack_ecache *ecache;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return;
++
+ 	/* take care of delivering potentially old events */
+ 	ecache = &__get_cpu_var(nf_conntrack_ecache);
+ 	BUG_ON(ecache->ct == ct);
+@@ -84,6 +90,9 @@ void nf_ct_event_cache_flush(void)
+ 	struct nf_conntrack_ecache *ecache;
+ 	int cpu;
+ 
++	if (!ve_is_super(get_exec_env()))
++		return;
++
+ 	for_each_possible_cpu(cpu) {
+ 		ecache = &per_cpu(nf_conntrack_ecache, cpu);
+ 		if (ecache->ct)
+diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
+index e8f0dea..88f3fa8 100644
+--- a/net/netfilter/nf_conntrack_expect.c
++++ b/net/netfilter/nf_conntrack_expect.c
+@@ -28,17 +28,26 @@
+ #include <net/netfilter/nf_conntrack_helper.h>
+ #include <net/netfilter/nf_conntrack_tuple.h>
+ 
++#ifndef CONFIG_VE_IPTABLES
+ struct hlist_head *nf_ct_expect_hash __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
++#endif
+ 
+ unsigned int nf_ct_expect_hsize __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+ 
+ static unsigned int nf_ct_expect_hash_rnd __read_mostly;
+-static unsigned int nf_ct_expect_count;
+ unsigned int nf_ct_expect_max __read_mostly;
+ static int nf_ct_expect_hash_rnd_initted __read_mostly;
++#ifdef CONFIG_VE_IPTABLES
++#define ve_nf_ct_expect_count	(get_exec_env()->_nf_conntrack->_nf_ct_expect_count)
++#define ve_nf_ct_expect_vmalloc	(get_exec_env()->_nf_conntrack->_nf_ct_expect_vmalloc)
++#else
++static unsigned int nf_ct_expect_count;
+ static int nf_ct_expect_vmalloc;
++#define ve_nf_ct_expect_count	nf_ct_expect_count
++#define ve_nf_ct_expect_vmalloc	nf_ct_expect_vmalloc
++#endif
+ 
+ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+ 
+@@ -51,7 +60,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+ 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
+ 
+ 	hlist_del_rcu(&exp->hnode);
+-	nf_ct_expect_count--;
++	ve_nf_ct_expect_count--;
+ 
+ 	hlist_del(&exp->lnode);
+ 	master_help->expecting[exp->class]--;
+@@ -93,11 +102,11 @@ __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple)
+ 	struct hlist_node *n;
+ 	unsigned int h;
+ 
+-	if (!nf_ct_expect_count)
++	if (!ve_nf_ct_expect_count)
+ 		return NULL;
+ 
+ 	h = nf_ct_expect_dst_hash(tuple);
+-	hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) {
++	hlist_for_each_entry_rcu(i, n, &ve_nf_ct_expect_hash[h], hnode) {
+ 		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
+ 			return i;
+ 	}
+@@ -130,11 +139,11 @@ nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple)
+ 	struct hlist_node *n;
+ 	unsigned int h;
+ 
+-	if (!nf_ct_expect_count)
++	if (!ve_nf_ct_expect_count)
+ 		return NULL;
+ 
+ 	h = nf_ct_expect_dst_hash(tuple);
+-	hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) {
++	hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) {
+ 		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
+ 		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ 			exp = i;
+@@ -308,7 +317,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
+ }
+ EXPORT_SYMBOL_GPL(nf_ct_expect_put);
+ 
+-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
++void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+ {
+ 	struct nf_conn_help *master_help = nfct_help(exp->master);
+ 	const struct nf_conntrack_expect_policy *p;
+@@ -319,8 +328,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+ 	hlist_add_head(&exp->lnode, &master_help->expectations);
+ 	master_help->expecting[exp->class]++;
+ 
+-	hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
+-	nf_ct_expect_count++;
++	hlist_add_head_rcu(&exp->hnode, &ve_nf_ct_expect_hash[h]);
++	ve_nf_ct_expect_count++;
+ 
+ 	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
+ 		    (unsigned long)exp);
+@@ -331,6 +340,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+ 	atomic_inc(&exp->use);
+ 	NF_CT_STAT_INC(expect_create);
+ }
++EXPORT_SYMBOL_GPL(nf_ct_expect_insert);
+ 
+ /* Race with expectations being used means we could have none to find; OK. */
+ static void evict_oldest_expect(struct nf_conn *master,
+@@ -383,7 +393,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect)
+ 		goto out;
+ 	}
+ 	h = nf_ct_expect_dst_hash(&expect->tuple);
+-	hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) {
++	hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) {
+ 		if (expect_matches(i, expect)) {
+ 			/* Refresh timer: if it's dying, ignore.. */
+ 			if (refresh_timer(i)) {
+@@ -406,7 +416,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect)
+ 		}
+ 	}
+ 
+-	if (nf_ct_expect_count >= nf_ct_expect_max) {
++	if (ve_nf_ct_expect_count >= ve_nf_ct_expect_max) {
+ 		if (net_ratelimit())
+ 			printk(KERN_WARNING
+ 			       "nf_conntrack: expectation table full\n");
+@@ -434,7 +444,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+ 	struct hlist_node *n;
+ 
+ 	for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+-		n = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
++		n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first);
+ 		if (n)
+ 			return n;
+ 	}
+@@ -450,7 +460,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ 	while (head == NULL) {
+ 		if (++st->bucket >= nf_ct_expect_hsize)
+ 			return NULL;
+-		head = rcu_dereference(nf_ct_expect_hash[st->bucket].first);
++		head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first);
+ 	}
+ 	return head;
+ }
+@@ -537,12 +547,13 @@ static const struct file_operations exp_file_ops = {
+ };
+ #endif /* CONFIG_PROC_FS */
+ 
+-static int __init exp_proc_init(void)
++static int exp_proc_init(void)
+ {
+ #ifdef CONFIG_PROC_FS
+ 	struct proc_dir_entry *proc;
+ 
+-	proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops);
++	proc = proc_net_fops_create(get_exec_env()->ve_netns,
++				    "nf_conntrack_expect", 0440, &exp_file_ops);
+ 	if (!proc)
+ 		return -ENOMEM;
+ #endif /* CONFIG_PROC_FS */
+@@ -552,13 +563,13 @@ static int __init exp_proc_init(void)
+ static void exp_proc_remove(void)
+ {
+ #ifdef CONFIG_PROC_FS
+-	proc_net_remove(&init_net, "nf_conntrack_expect");
++	proc_net_remove(get_exec_env()->ve_netns, "nf_conntrack_expect");
+ #endif /* CONFIG_PROC_FS */
+ }
+ 
+ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600);
+ 
+-int __init nf_conntrack_expect_init(void)
++int nf_conntrack_expect_init(void)
+ {
+ 	int err = -ENOMEM;
+ 
+@@ -569,16 +580,20 @@ int __init nf_conntrack_expect_init(void)
+ 	}
+ 	nf_ct_expect_max = nf_ct_expect_hsize * 4;
+ 
+-	nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
+-						  &nf_ct_expect_vmalloc);
+-	if (nf_ct_expect_hash == NULL)
++	ve_nf_ct_expect_count = 0;
++	ve_nf_ct_expect_max = nf_ct_expect_max;
++	ve_nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize,
++						  &ve_nf_ct_expect_vmalloc);
++	if (ve_nf_ct_expect_hash == NULL)
+ 		goto err1;
+ 
+-	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
++	if (ve_is_super(get_exec_env())) {
++		nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+ 					sizeof(struct nf_conntrack_expect),
+-					0, 0, NULL);
+-	if (!nf_ct_expect_cachep)
+-		goto err2;
++					0, SLAB_UBC, NULL);
++		if (!nf_ct_expect_cachep)
++			goto err2;
++	}
+ 
+ 	err = exp_proc_init();
+ 	if (err < 0)
+@@ -587,9 +602,10 @@ int __init nf_conntrack_expect_init(void)
+ 	return 0;
+ 
+ err3:
+-	kmem_cache_destroy(nf_ct_expect_cachep);
++	if (ve_is_super(get_exec_env()))
++		kmem_cache_destroy(nf_ct_expect_cachep);
+ err2:
+-	nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
++	nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc,
+ 			     nf_ct_expect_hsize);
+ err1:
+ 	return err;
+@@ -598,7 +614,8 @@ err1:
+ void nf_conntrack_expect_fini(void)
+ {
+ 	exp_proc_remove();
+-	kmem_cache_destroy(nf_ct_expect_cachep);
+-	nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc,
++	if (ve_is_super(get_exec_env()))
++		kmem_cache_destroy(nf_ct_expect_cachep);
++	nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc,
+ 			     nf_ct_expect_hsize);
+ }
+diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
+index 7d1b117..b06f23f 100644
+--- a/net/netfilter/nf_conntrack_helper.c
++++ b/net/netfilter/nf_conntrack_helper.c
+@@ -33,6 +33,13 @@ static struct hlist_head *nf_ct_helper_hash __read_mostly;
+ static unsigned int nf_ct_helper_hsize __read_mostly;
+ static unsigned int nf_ct_helper_count __read_mostly;
+ static int nf_ct_helper_vmalloc;
++#ifdef CONFIG_VE_IPTABLES
++#define ve_nf_ct_helper_hash	(get_exec_env()->_nf_conntrack->_nf_ct_helper_hash)
++#define ve_nf_ct_helper_vmalloc	(get_exec_env()->_nf_conntrack->_nf_ct_helper_vmalloc)
++#else
++#define ve_nf_ct_helper_hash	nf_ct_helper_hash
++#define ve_nf_ct_helper_vmalloc	nf_ct_helper_vmalloc
++#endif
+ 
+ 
+ /* Stupid hash, but collision free for the default registrations of the
+@@ -55,7 +62,7 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
+ 		return NULL;
+ 
+ 	h = helper_hash(tuple);
+-	hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) {
++	hlist_for_each_entry_rcu(helper, n, &ve_nf_ct_helper_hash[h], hnode) {
+ 		if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
+ 			return helper;
+ 	}
+@@ -71,7 +78,7 @@ __nf_conntrack_helper_find_byname(const char *name)
+ 	unsigned int i;
+ 
+ 	for (i = 0; i < nf_ct_helper_hsize; i++) {
+-		hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) {
++		hlist_for_each_entry_rcu(h, n, &ve_nf_ct_helper_hash[i], hnode) {
+ 			if (!strcmp(h->name, name))
+ 				return h;
+ 		}
+@@ -114,7 +121,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
+ 	BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
+ 
+ 	mutex_lock(&nf_ct_helper_mutex);
+-	hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
++	hlist_add_head_rcu(&me->hnode, &ve_nf_ct_helper_hash[h]);
+ 	nf_ct_helper_count++;
+ 	mutex_unlock(&nf_ct_helper_mutex);
+ 
+@@ -144,7 +151,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+ 	/* Get rid of expectations */
+ 	for (i = 0; i < nf_ct_expect_hsize; i++) {
+ 		hlist_for_each_entry_safe(exp, n, next,
+-					  &nf_ct_expect_hash[i], hnode) {
++					  &ve_nf_ct_expect_hash[i], hnode) {
+ 			struct nf_conn_help *help = nfct_help(exp->master);
+ 			if ((help->helper == me || exp->helper == me) &&
+ 			    del_timer(&exp->timeout)) {
+@@ -155,10 +162,10 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+ 	}
+ 
+ 	/* Get rid of expecteds, set helpers to NULL. */
+-	hlist_for_each_entry(h, n, &unconfirmed, hnode)
++	hlist_for_each_entry(h, n, &ve_unconfirmed, hnode)
+ 		unhelp(h, me);
+ 	for (i = 0; i < nf_conntrack_htable_size; i++) {
+-		hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode)
++		hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[i], hnode)
+ 			unhelp(h, me);
+ 	}
+ 	spin_unlock_bh(&nf_conntrack_lock);
+@@ -176,26 +183,29 @@ int nf_conntrack_helper_init(void)
+ 	int err;
+ 
+ 	nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
+-	nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize,
+-						  &nf_ct_helper_vmalloc);
+-	if (!nf_ct_helper_hash)
++	ve_nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize,
++						  &ve_nf_ct_helper_vmalloc);
++	if (!ve_nf_ct_helper_hash)
+ 		return -ENOMEM;
+ 
+-	err = nf_ct_extend_register(&helper_extend);
+-	if (err < 0)
+-		goto err1;
++	if (ve_is_super(get_exec_env())) {
++		err = nf_ct_extend_register(&helper_extend);
++		if (err < 0)
++			goto err1;
++	}
+ 
+ 	return 0;
+ 
+ err1:
+-	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
++	nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc,
+ 			     nf_ct_helper_hsize);
+ 	return err;
+ }
+ 
+ void nf_conntrack_helper_fini(void)
+ {
+-	nf_ct_extend_unregister(&helper_extend);
+-	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc,
++	if (ve_is_super(get_exec_env()))
++		nf_ct_extend_unregister(&helper_extend);
++	nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc,
+ 			     nf_ct_helper_hsize);
+ }
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index 0edefcf..e9bee13 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -26,6 +26,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
+ #include <linux/notifier.h>
++#include <net/sock.h>
+ 
+ #include <linux/netfilter.h>
+ #include <net/netlink.h>
+@@ -43,6 +44,8 @@
+ 
+ #include <linux/netfilter/nfnetlink.h>
+ #include <linux/netfilter/nfnetlink_conntrack.h>
++#include <bc/beancounter.h>
++#include <bc/sock.h>
+ 
+ MODULE_LICENSE("GPL");
+ 
+@@ -551,7 +554,8 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ 	last = (struct nf_conn *)cb->args[1];
+ 	for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
+ restart:
+-		hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]],
++		hlist_for_each_entry_rcu(h, n,
++					 &ve_nf_conntrack_hash[cb->args[0]],
+ 					 hnode) {
+ 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ 				continue;
+@@ -1123,14 +1127,15 @@ static int
+ ctnetlink_create_conntrack(struct nlattr *cda[],
+ 			   struct nf_conntrack_tuple *otuple,
+ 			   struct nf_conntrack_tuple *rtuple,
+-			   struct nf_conn *master_ct)
++			   struct nf_conn *master_ct,
++			   struct user_beancounter *ub)
+ {
+ 	struct nf_conn *ct;
+ 	int err = -EINVAL;
+ 	struct nf_conn_help *help;
+ 	struct nf_conntrack_helper *helper;
+ 
+-	ct = nf_conntrack_alloc(otuple, rtuple);
++	ct = nf_conntrack_alloc(otuple, rtuple, ub);
+ 	if (ct == NULL || IS_ERR(ct))
+ 		return -ENOMEM;
+ 
+@@ -1240,11 +1245,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ 
+ 		spin_unlock_bh(&nf_conntrack_lock);
+ 		err = -ENOENT;
+-		if (nlh->nlmsg_flags & NLM_F_CREATE)
++		if (nlh->nlmsg_flags & NLM_F_CREATE) {
++			struct user_beancounter *ub = NULL;
++
++#ifdef CONFIG_BEANCOUNTERS
++			if (skb->sk)
++				ub = sock_bc(skb->sk)->ub;
++#endif
+ 			err = ctnetlink_create_conntrack(cda,
+ 							 &otuple,
+ 							 &rtuple,
+-							 master_ct);
++							 master_ct,
++							 ub);
++		}
+ 		if (err < 0 && master_ct)
+ 			nf_ct_put(master_ct);
+ 
+@@ -1466,7 +1479,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+ 	last = (struct nf_conntrack_expect *)cb->args[1];
+ 	for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+ restart:
+-		hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]],
++		hlist_for_each_entry(exp, n, &ve_nf_ct_expect_hash[cb->args[0]],
+ 				     hnode) {
+ 			if (l3proto && exp->tuple.src.l3num != l3proto)
+ 				continue;
+@@ -1612,7 +1625,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ 		}
+ 		for (i = 0; i < nf_ct_expect_hsize; i++) {
+ 			hlist_for_each_entry_safe(exp, n, next,
+-						  &nf_ct_expect_hash[i],
++						  &ve_nf_ct_expect_hash[i],
+ 						  hnode) {
+ 				m_help = nfct_help(exp->master);
+ 				if (m_help->helper == h
+@@ -1628,7 +1641,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ 		spin_lock_bh(&nf_conntrack_lock);
+ 		for (i = 0; i < nf_ct_expect_hsize; i++) {
+ 			hlist_for_each_entry_safe(exp, n, next,
+-						  &nf_ct_expect_hash[i],
++						  &ve_nf_ct_expect_hash[i],
+ 						  hnode) {
+ 				if (del_timer(&exp->timeout)) {
+ 					nf_ct_unlink_expect(exp);
+diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
+index a49fc93..49fc01f 100644
+--- a/net/netfilter/nf_conntrack_proto.c
++++ b/net/netfilter/nf_conntrack_proto.c
+@@ -28,7 +28,7 @@
+ #include <net/netfilter/nf_conntrack_l4proto.h>
+ #include <net/netfilter/nf_conntrack_core.h>
+ 
+-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
++struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
+ struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_ct_l3protos);
+ 
+@@ -40,7 +40,8 @@ nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path,
+ 		      struct ctl_table *table, unsigned int *users)
+ {
+ 	if (*header == NULL) {
+-		*header = register_sysctl_paths(path, table);
++		*header = register_net_sysctl_table(get_exec_env()->ve_netns,
++						    path, table);
+ 		if (*header == NULL)
+ 			return -ENOMEM;
+ 	}
+@@ -56,7 +57,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header,
+ 	if (users != NULL && --*users > 0)
+ 		return;
+ 
+-	unregister_sysctl_table(*header);
++	unregister_net_sysctl_table(*header);
+ 	*header = NULL;
+ }
+ #endif
+@@ -64,10 +65,10 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header,
+ struct nf_conntrack_l4proto *
+ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+ {
+-	if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
+-		return &nf_conntrack_l4proto_generic;
++	if (unlikely(l3proto >= AF_MAX || ve_nf_ct_protos[l3proto] == NULL))
++		return ve_nf_conntrack_l4proto_generic;
+ 
+-	return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
++	return rcu_dereference(ve_nf_ct_protos[l3proto][l4proto]);
+ }
+ EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
+ 
+@@ -81,7 +82,7 @@ nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto)
+ 	rcu_read_lock();
+ 	p = __nf_ct_l4proto_find(l3proto, l4proto);
+ 	if (!try_module_get(p->me))
+-		p = &nf_conntrack_l4proto_generic;
++		p = ve_nf_conntrack_l4proto_generic;
+ 	rcu_read_unlock();
+ 
+ 	return p;
+@@ -188,7 +189,8 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+ 		return -EBUSY;
+ 
+ 	mutex_lock(&nf_ct_proto_mutex);
+-	if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) {
++	if (ve_nf_ct_l3protos[proto->l3proto] !=
++				&nf_conntrack_l3proto_generic) {
+ 		ret = -EBUSY;
+ 		goto out_unlock;
+ 	}
+@@ -197,7 +199,7 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+ 	if (ret < 0)
+ 		goto out_unlock;
+ 
+-	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
++	rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], proto);
+ 
+ out_unlock:
+ 	mutex_unlock(&nf_ct_proto_mutex);
+@@ -210,8 +212,8 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+ 	BUG_ON(proto->l3proto >= AF_MAX);
+ 
+ 	mutex_lock(&nf_ct_proto_mutex);
+-	BUG_ON(nf_ct_l3protos[proto->l3proto] != proto);
+-	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
++	BUG_ON(ve_nf_ct_l3protos[proto->l3proto] != proto);
++	rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto],
+ 			   &nf_conntrack_l3proto_generic);
+ 	nf_ct_l3proto_unregister_sysctl(proto);
+ 	mutex_unlock(&nf_ct_proto_mutex);
+@@ -279,7 +281,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+ 		return -EBUSY;
+ 
+ 	mutex_lock(&nf_ct_proto_mutex);
+-	if (!nf_ct_protos[l4proto->l3proto]) {
++	if (!ve_nf_ct_protos[l4proto->l3proto]) {
+ 		/* l3proto may be loaded latter. */
+ 		struct nf_conntrack_l4proto **proto_array;
+ 		int i;
+@@ -293,10 +295,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+ 		}
+ 
+ 		for (i = 0; i < MAX_NF_CT_PROTO; i++)
+-			proto_array[i] = &nf_conntrack_l4proto_generic;
+-		nf_ct_protos[l4proto->l3proto] = proto_array;
+-	} else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
+-					&nf_conntrack_l4proto_generic) {
++			proto_array[i] = ve_nf_conntrack_l4proto_generic;
++		ve_nf_ct_protos[l4proto->l3proto] = proto_array;
++	} else if (ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
++					   ve_nf_conntrack_l4proto_generic) {
+ 		ret = -EBUSY;
+ 		goto out_unlock;
+ 	}
+@@ -305,7 +307,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+ 	if (ret < 0)
+ 		goto out_unlock;
+ 
+-	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
++	rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ 			   l4proto);
+ 
+ out_unlock:
+@@ -319,9 +321,9 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+ 	BUG_ON(l4proto->l3proto >= PF_MAX);
+ 
+ 	mutex_lock(&nf_ct_proto_mutex);
+-	BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
+-	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+-			   &nf_conntrack_l4proto_generic);
++	BUG_ON(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
++	rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
++			   ve_nf_conntrack_l4proto_generic);
+ 	nf_ct_l4proto_unregister_sysctl(l4proto);
+ 	mutex_unlock(&nf_ct_proto_mutex);
+ 
+@@ -337,12 +339,12 @@ int nf_conntrack_proto_init(void)
+ 	unsigned int i;
+ 	int err;
+ 
+-	err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic);
++	err = nf_ct_l4proto_register_sysctl(ve_nf_conntrack_l4proto_generic);
+ 	if (err < 0)
+ 		return err;
+ 
+ 	for (i = 0; i < AF_MAX; i++)
+-		rcu_assign_pointer(nf_ct_l3protos[i],
++		rcu_assign_pointer(ve_nf_ct_l3protos[i],
+ 				   &nf_conntrack_l3proto_generic);
+ 	return 0;
+ }
+@@ -351,9 +353,13 @@ void nf_conntrack_proto_fini(void)
+ {
+ 	unsigned int i;
+ 
+-	nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic);
++	nf_ct_l4proto_unregister_sysctl(ve_nf_conntrack_l4proto_generic);
+ 
+ 	/* free l3proto protocol tables */
+ 	for (i = 0; i < PF_MAX; i++)
+-		kfree(nf_ct_protos[i]);
++		kfree(ve_nf_ct_protos[i]);
++#ifdef CONFIG_VE_IPTABLES
++	if (!ve_is_super(get_exec_env()))
++		kfree(ve_nf_conntrack_l4proto_generic);
++#endif
+ }
+diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
+index e31b0e7..e65f9a7 100644
+--- a/net/netfilter/nf_conntrack_proto_generic.c
++++ b/net/netfilter/nf_conntrack_proto_generic.c
+@@ -8,6 +8,7 @@
+ 
+ #include <linux/types.h>
+ #include <linux/jiffies.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+ #include <linux/netfilter.h>
+ #include <net/netfilter/nf_conntrack_l4proto.h>
+@@ -48,7 +49,7 @@ static int packet(struct nf_conn *ct,
+ 		  int pf,
+ 		  unsigned int hooknum)
+ {
+-	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout);
++	nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_generic_timeout);
+ 	return NF_ACCEPT;
+ }
+ 
+@@ -107,3 +108,62 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
+ #endif
+ #endif
+ };
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++int nf_ct_proto_generic_sysctl_init(void)
++{
++	struct nf_conntrack_l4proto *generic;
++
++	if (ve_is_super(get_exec_env())) {
++		generic = &nf_conntrack_l4proto_generic;
++		goto out;
++	}
++
++	generic = kmemdup(&nf_conntrack_l4proto_generic,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (generic == NULL)
++		goto no_mem_ct;
++
++	generic->ctl_table_header = &ve_generic_sysctl_header;
++	generic->ctl_table = kmemdup(generic_sysctl_table,
++			sizeof(generic_sysctl_table), GFP_KERNEL);
++	if (generic->ctl_table == NULL)
++		goto no_mem_sys;
++
++	generic->ctl_table[0].data = &ve_nf_ct_generic_timeout;
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	generic->ctl_compat_table_header = ve_generic_compat_sysctl_header;
++	generic->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
++			sizeof(generic_compat_sysctl_table), GFP_KERNEL);
++	if (generic->ctl_compat_table == NULL)
++		goto no_mem_compat;
++	generic->ctl_compat_table[0].data = &ve_nf_ct_generic_timeout;
++#endif
++out:
++	ve_nf_ct_generic_timeout = nf_ct_generic_timeout;
++
++	ve_nf_conntrack_l4proto_generic = generic;
++	return 0;
++
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++no_mem_compat:
++	kfree(generic->ctl_table);
++#endif
++no_mem_sys:
++	kfree(generic);
++no_mem_ct:
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_init);
++
++void nf_ct_proto_generic_sysctl_cleanup(void)
++{
++	if (!ve_is_super(get_exec_env())) {
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++		kfree(ve_nf_conntrack_l4proto_generic->ctl_compat_table);
++#endif
++		kfree(ve_nf_conntrack_l4proto_generic->ctl_table);
++	}
++}
++EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_cleanup);
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
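
This init/cleanup pair is the template for every protocol that
follows: VE0 keeps the static struct, other VEs get a kmemdup()'d
copy whose sysctl .data pointers are rewired to per-VE variables, so
a write from one container cannot leak into another. The rewiring
step in miniature (struct ctl_entry and clone_table_for_ve are
hypothetical stand-ins for the kernel types):

	#include <stdlib.h>
	#include <string.h>

	struct ctl_entry { const char *name; void *data; };

	static struct ctl_entry template[] = {
		{ "nf_conntrack_generic_timeout", NULL },
		{ NULL, NULL },			/* terminator */
	};

	static struct ctl_entry *clone_table_for_ve(unsigned long *ve_timeout)
	{
		struct ctl_entry *t = malloc(sizeof(template));	/* kmemdup() */

		if (t == NULL)
			return NULL;
		memcpy(t, template, sizeof(template));
		t[0].data = ve_timeout;		/* per-VE backing store */
		return t;
	}
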
+diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
+index dd28fb2..4c6989c 100644
+--- a/net/netfilter/nf_conntrack_proto_tcp.c
++++ b/net/netfilter/nf_conntrack_proto_tcp.c
+@@ -7,6 +7,7 @@
+  */
+ 
+ #include <linux/types.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+ #include <linux/module.h>
+ #include <linux/in.h>
+@@ -661,7 +662,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
+ 	} else {
+ 		res = false;
+ 		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
+-		    nf_ct_tcp_be_liberal)
++		    ve_nf_ct_tcp_be_liberal)
+ 			res = true;
+ 		if (!res && LOG_INVALID(IPPROTO_TCP))
+ 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+@@ -953,9 +954,9 @@ static int tcp_packet(struct nf_conn *ct,
+ 	if (old_state != new_state
+ 	    && new_state == TCP_CONNTRACK_FIN_WAIT)
+ 		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+-	timeout = ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans
+-		  && tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans
+-		  ? nf_ct_tcp_timeout_max_retrans : tcp_timeouts[new_state];
++	timeout = ct->proto.tcp.retrans >= ve_nf_ct_tcp_max_retrans
++		  && ve_nf_ct_tcp_timeouts[new_state] > ve_nf_ct_tcp_timeout_max_retrans
++		  ? ve_nf_ct_tcp_timeout_max_retrans : ve_nf_ct_tcp_timeouts[new_state];
+ 	write_unlock_bh(&tcp_lock);
+ 
+ 	nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+@@ -1024,7 +1025,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ 
+ 		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+ 		ct->proto.tcp.seen[1].flags = 0;
+-	} else if (nf_ct_tcp_loose == 0) {
++	} else if (ve_nf_ct_tcp_loose == 0) {
+ 		/* Don't try to pick up connections. */
+ 		return false;
+ 	} else {
+@@ -1419,3 +1420,115 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
+ #endif
+ };
+ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++int nf_ct_proto_tcp_sysctl_init(void)
++{
++	struct nf_conntrack_l4proto *tcp4, *tcp6;
++
++	if (ve_is_super(get_exec_env())) {
++		tcp4 = &nf_conntrack_l4proto_tcp4;
++		tcp6 = &nf_conntrack_l4proto_tcp6;
++		goto out;
++	}
++
++	tcp4 = kmemdup(&nf_conntrack_l4proto_tcp4,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (tcp4 == NULL)
++		goto no_mem_ct4;
++
++	tcp4->ctl_table_users = &ve_tcp_sysctl_table_users;
++	tcp4->ctl_table_header = &ve_tcp_sysctl_header;
++	tcp4->ctl_table = kmemdup(tcp_sysctl_table,
++			sizeof(tcp_sysctl_table), GFP_KERNEL);
++	if (tcp4->ctl_table == NULL)
++		goto no_mem_sys;
++
++	tcp4->ctl_table[0].data = &ve_nf_ct_tcp_timeouts[1];
++	tcp4->ctl_table[1].data = &ve_nf_ct_tcp_timeouts[2];
++	tcp4->ctl_table[2].data = &ve_nf_ct_tcp_timeouts[3];
++	tcp4->ctl_table[3].data = &ve_nf_ct_tcp_timeouts[4];
++	tcp4->ctl_table[4].data = &ve_nf_ct_tcp_timeouts[5];
++	tcp4->ctl_table[5].data = &ve_nf_ct_tcp_timeouts[6];
++	tcp4->ctl_table[6].data = &ve_nf_ct_tcp_timeouts[7];
++	tcp4->ctl_table[7].data = &ve_nf_ct_tcp_timeouts[8];
++	tcp4->ctl_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans;
++	tcp4->ctl_table[9].data = &ve_nf_ct_tcp_loose;
++	tcp4->ctl_table[10].data = &ve_nf_ct_tcp_be_liberal;
++	tcp4->ctl_table[11].data = &ve_nf_ct_tcp_max_retrans;
++
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	tcp4->ctl_compat_table_header = ve_tcp_compat_sysctl_header;
++	tcp4->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
++			sizeof(tcp_compat_sysctl_table), GFP_KERNEL);
++	if (tcp4->ctl_compat_table == NULL)
++		goto no_mem_compat;
++
++	tcp4->ctl_compat_table[0].data = &ve_nf_ct_tcp_timeouts[1];
++	tcp4->ctl_compat_table[1].data = &ve_nf_ct_tcp_timeouts[2];
++	tcp4->ctl_compat_table[2].data = &ve_nf_ct_tcp_timeouts[3];
++	tcp4->ctl_compat_table[3].data = &ve_nf_ct_tcp_timeouts[4];
++	tcp4->ctl_compat_table[4].data = &ve_nf_ct_tcp_timeouts[5];
++	tcp4->ctl_compat_table[5].data = &ve_nf_ct_tcp_timeouts[6];
++	tcp4->ctl_compat_table[6].data = &ve_nf_ct_tcp_timeouts[7];
++	tcp4->ctl_compat_table[7].data = &ve_nf_ct_tcp_timeouts[8];
++	tcp4->ctl_compat_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans;
++	tcp4->ctl_compat_table[9].data = &ve_nf_ct_tcp_loose;
++	tcp4->ctl_compat_table[10].data = &ve_nf_ct_tcp_be_liberal;
++	tcp4->ctl_compat_table[11].data = &ve_nf_ct_tcp_max_retrans;
++#endif
++
++	tcp6 = kmemdup(&nf_conntrack_l4proto_tcp6,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (!tcp6)
++		goto no_mem_ct6;
++
++	tcp6->ctl_table_users = &ve_tcp_sysctl_table_users;
++	tcp6->ctl_table_header = &ve_tcp_sysctl_header;
++	tcp6->ctl_table = tcp4->ctl_table;
++out:
++	ve_nf_ct_tcp_timeouts[1] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
++	ve_nf_ct_tcp_timeouts[2] = tcp_timeouts[TCP_CONNTRACK_SYN_RECV];
++	ve_nf_ct_tcp_timeouts[3] = tcp_timeouts[TCP_CONNTRACK_ESTABLISHED];
++	ve_nf_ct_tcp_timeouts[4] = tcp_timeouts[TCP_CONNTRACK_FIN_WAIT];
++	ve_nf_ct_tcp_timeouts[5] = tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT];
++	ve_nf_ct_tcp_timeouts[6] = tcp_timeouts[TCP_CONNTRACK_LAST_ACK];
++	ve_nf_ct_tcp_timeouts[7] = tcp_timeouts[TCP_CONNTRACK_TIME_WAIT];
++	ve_nf_ct_tcp_timeouts[8] = tcp_timeouts[TCP_CONNTRACK_CLOSE];
++	ve_nf_ct_tcp_timeout_max_retrans = nf_ct_tcp_timeout_max_retrans;
++	ve_nf_ct_tcp_loose = nf_ct_tcp_loose;
++	ve_nf_ct_tcp_be_liberal = nf_ct_tcp_be_liberal;
++	ve_nf_ct_tcp_max_retrans = nf_ct_tcp_max_retrans;
++
++	ve_nf_conntrack_l4proto_tcp4 = tcp4;
++	ve_nf_conntrack_l4proto_tcp6 = tcp6;
++	return 0;
++
++no_mem_ct6:
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	kfree(tcp4->ctl_compat_table);
++no_mem_compat:
++#endif
++	kfree(tcp4->ctl_table);
++no_mem_sys:
++	kfree(tcp4);
++no_mem_ct4:
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_init);
++
++void nf_ct_proto_tcp_sysctl_cleanup(void)
++{
++	if (!ve_is_super(get_exec_env())) {
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++		kfree(ve_nf_conntrack_l4proto_tcp4->ctl_compat_table);
++#endif
++		kfree(ve_nf_conntrack_l4proto_tcp4->ctl_table);
++		kfree(ve_nf_conntrack_l4proto_tcp4);
++
++		kfree(ve_nf_conntrack_l4proto_tcp6);
++	}
++}
++EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_cleanup);
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
++
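
Besides duplicating the sysctl tables (note that tcp6 shares tcp4's
table, and that the ctl_table[N].data assignments depend on the exact
layout of tcp_sysctl_table), the tcp_packet() hunk switches the
timeout math to the per-VE values. The clamp is easy to lose in the
ternary; extracted with neutral parameter names, it is just:

	/* Once a connection has retransmitted too often, its per-state
	 * timeout is capped at the (per-VE) max-retrans timeout. */
	static unsigned long
	tcp_state_timeout(unsigned int retrans, unsigned int max_retrans,
			  unsigned long state_timeout,
			  unsigned long max_retrans_timeout)
	{
		if (retrans >= max_retrans && state_timeout > max_retrans_timeout)
			return max_retrans_timeout;
		return state_timeout;
	}
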
+diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
+index 8b21762..b01823e 100644
+--- a/net/netfilter/nf_conntrack_proto_udp.c
++++ b/net/netfilter/nf_conntrack_proto_udp.c
+@@ -7,6 +7,7 @@
+  */
+ 
+ #include <linux/types.h>
++#include <linux/sched.h>
+ #include <linux/timer.h>
+ #include <linux/module.h>
+ #include <linux/udp.h>
+@@ -72,12 +73,13 @@ static int udp_packet(struct nf_conn *ct,
+ 	/* If we've seen traffic both ways, this is some kind of UDP
+ 	   stream.  Extend timeout. */
+ 	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+-		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream);
++		nf_ct_refresh_acct(ct, ctinfo, skb,
++				   ve_nf_ct_udp_timeout_stream);
+ 		/* Also, more likely to be important, and not a probe */
+ 		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ 			nf_conntrack_event_cache(IPCT_STATUS, skb);
+ 	} else
+-		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout);
++		nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_udp_timeout);
+ 
+ 	return NF_ACCEPT;
+ }
+@@ -229,3 +231,85 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
+ #endif
+ };
+ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
++
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++int nf_ct_proto_udp_sysctl_init(void)
++{
++	struct nf_conntrack_l4proto *udp4, *udp6;
++
++	if (ve_is_super(get_exec_env())) {
++		udp4 = &nf_conntrack_l4proto_udp4;
++		udp6 = &nf_conntrack_l4proto_udp6;
++		goto out;
++	}
++
++	udp4 = kmemdup(&nf_conntrack_l4proto_udp4,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (udp4 == NULL)
++		goto no_mem_ct4;
++
++	udp4->ctl_table_users = &ve_udp_sysctl_table_users;
++	udp4->ctl_table_header = &ve_udp_sysctl_header;
++	udp4->ctl_table = kmemdup(udp_sysctl_table,
++			sizeof(udp_sysctl_table), GFP_KERNEL);
++	if (udp4->ctl_table == NULL)
++		goto no_mem_sys;
++	udp4->ctl_table[0].data = &ve_nf_ct_udp_timeout;
++	udp4->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream;
++
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	udp4->ctl_compat_table_header = ve_udp_compat_sysctl_header;
++	udp4->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
++			sizeof(udp_compat_sysctl_table), GFP_KERNEL);
++	if (udp4->ctl_compat_table == NULL)
++		goto no_mem_compat;
++	udp4->ctl_compat_table[0].data = &ve_nf_ct_udp_timeout;
++	udp4->ctl_compat_table[1].data = &ve_nf_ct_udp_timeout_stream;
++#endif
++
++	udp6 = kmemdup(&nf_conntrack_l4proto_udp6,
++			sizeof(struct nf_conntrack_l4proto), GFP_KERNEL);
++	if (!udp6)
++		goto no_mem_ct6;
++
++	udp6->ctl_table_users = &ve_udp_sysctl_table_users;
++	udp6->ctl_table_header = &ve_udp_sysctl_header;
++	udp6->ctl_table = udp4->ctl_table;
++
++	udp6->ctl_table[0].data = &ve_nf_ct_udp_timeout;
++	udp6->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream;
++out:
++	ve_nf_ct_udp_timeout = nf_ct_udp_timeout;
++	ve_nf_ct_udp_timeout_stream = nf_ct_udp_timeout_stream;
++
++	ve_nf_conntrack_l4proto_udp4 = udp4;
++	ve_nf_conntrack_l4proto_udp6 = udp6;
++	return 0;
++
++no_mem_ct6:
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++	kfree(udp4->ctl_compat_table);
++no_mem_compat:
++#endif
++	kfree(udp4->ctl_table);
++no_mem_sys:
++	kfree(udp4);
++no_mem_ct4:
++	return -ENOMEM;
++}
++EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_init);
++
++void nf_ct_proto_udp_sysctl_cleanup(void)
++{
++	if (!ve_is_super(get_exec_env())) {
++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
++		kfree(ve_nf_conntrack_l4proto_udp4->ctl_compat_table);
++#endif
++		kfree(ve_nf_conntrack_l4proto_udp4->ctl_table);
++		kfree(ve_nf_conntrack_l4proto_udp4);
++
++		kfree(ve_nf_conntrack_l4proto_udp6);
++	}
++}
++EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_cleanup);
++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */
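
The UDP variant repeats the pattern; since udp6->ctl_table aliases
udp4's table, the two udp6 .data assignments rewrite entries already
set through udp4 (redundant but harmless). All three *_sysctl_init()
functions also share the same reverse-order unwind style, sketched
here in userspace (-1 in place of -ENOMEM):

	#include <stdlib.h>

	/* Each allocation gets a label; a failure frees exactly what
	 * preceded it, in reverse order. */
	static int init_three(void **a, void **b, void **c)
	{
		*a = malloc(16);
		if (*a == NULL)
			goto no_a;
		*b = malloc(16);
		if (*b == NULL)
			goto no_b;
		*c = malloc(16);
		if (*c == NULL)
			goto no_c;
		return 0;

	no_c:
		free(*b);
	no_b:
		free(*a);
	no_a:
		return -1;
	}
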
+diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
+index 46ea542..c4d8ef2 100644
+--- a/net/netfilter/nf_conntrack_standalone.c
++++ b/net/netfilter/nf_conntrack_standalone.c
+@@ -9,6 +9,7 @@
+ #include <linux/types.h>
+ #include <linux/netfilter.h>
+ #include <linux/module.h>
++#include <linux/nsproxy.h>
+ #include <linux/skbuff.h>
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+@@ -18,6 +19,7 @@
+ #ifdef CONFIG_SYSCTL
+ #include <linux/sysctl.h>
+ #endif
++#include <linux/nfcalls.h>
+ 
+ #include <net/netfilter/nf_conntrack.h>
+ #include <net/netfilter/nf_conntrack_core.h>
+@@ -28,6 +30,10 @@
+ 
+ MODULE_LICENSE("GPL");
+ 
++int ip_conntrack_disable_ve0 = 0;
++module_param(ip_conntrack_disable_ve0, int, 0440);
++EXPORT_SYMBOL(ip_conntrack_disable_ve0);
++
+ #ifdef CONFIG_PROC_FS
+ int
+ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+@@ -63,7 +69,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
+ 	for (st->bucket = 0;
+ 	     st->bucket < nf_conntrack_htable_size;
+ 	     st->bucket++) {
+-		n = rcu_dereference(nf_conntrack_hash[st->bucket].first);
++		n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first);
+ 		if (n)
+ 			return n;
+ 	}
+@@ -79,7 +85,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq,
+ 	while (head == NULL) {
+ 		if (++st->bucket >= nf_conntrack_htable_size)
+ 			return NULL;
+-		head = rcu_dereference(nf_conntrack_hash[st->bucket].first);
++		head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first);
+ 	}
+ 	return head;
+ }
+@@ -238,7 +244,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+ 
+ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+ {
+-	unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
++	unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count);
+ 	const struct ip_conntrack_stat *st = v;
+ 
+ 	if (v == SEQ_START_TOKEN) {
+@@ -292,27 +298,30 @@ static const struct file_operations ct_cpu_seq_fops = {
+ static int nf_conntrack_standalone_init_proc(void)
+ {
+ 	struct proc_dir_entry *pde;
++	struct net *net = get_exec_env()->ve_netns;
+ 
+-	pde = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops);
++	pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops);
+ 	if (!pde)
+ 		goto out_nf_conntrack;
+ 
+-	pde = proc_create("nf_conntrack", S_IRUGO, init_net.proc_net_stat,
++	pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+ 			  &ct_cpu_seq_fops);
+ 	if (!pde)
+ 		goto out_stat_nf_conntrack;
+ 	return 0;
+ 
+ out_stat_nf_conntrack:
+-	proc_net_remove(&init_net, "nf_conntrack");
++	proc_net_remove(net, "nf_conntrack");
+ out_nf_conntrack:
+ 	return -ENOMEM;
+ }
+ 
+ static void nf_conntrack_standalone_fini_proc(void)
+ {
+-	remove_proc_entry("nf_conntrack", init_net.proc_net_stat);
+-	proc_net_remove(&init_net, "nf_conntrack");
++	struct net *net = get_exec_env()->ve_netns;
++
++	remove_proc_entry("nf_conntrack", net->proc_net_stat);
++	proc_net_remove(net, "nf_conntrack");
+ }
+ #else
+ static int nf_conntrack_standalone_init_proc(void)
+@@ -421,19 +430,61 @@ EXPORT_SYMBOL_GPL(nf_ct_log_invalid);
+ 
+ static int nf_conntrack_standalone_init_sysctl(void)
+ {
+-	nf_ct_sysctl_header =
+-		register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table);
+-	if (nf_ct_sysctl_header == NULL) {
+-		printk("nf_conntrack: can't register to sysctl.\n");
+-		return -ENOMEM;
++	struct ctl_table *nf_table, *ct_table;
++
++	nf_table = nf_ct_netfilter_table;
++	ct_table = nf_ct_sysctl_table;
++
++	if (!ve_is_super(get_exec_env())) {
++		nf_table = kmemdup(nf_table, sizeof(nf_ct_netfilter_table),
++				GFP_KERNEL);
++		if (nf_table == NULL)
++			goto err_nft;
++
++		ct_table = kmemdup(ct_table, sizeof(nf_ct_sysctl_table),
++				GFP_KERNEL);
++		if (ct_table == NULL)
++			goto err_ctt;
++
++		nf_table[0].child = ct_table;
+ 	}
++
++	nf_table[1].data = &ve_nf_conntrack_max;
++	ct_table[0].data = &ve_nf_conntrack_max;
++	ct_table[1].data = &ve_nf_conntrack_count;
++	/* nf_conntrack_htable_size is shared and readonly */
++	ct_table[3].data = &ve_nf_conntrack_checksum;
++	ct_table[4].data = &ve_nf_ct_log_invalid;
++	ct_table[5].data = &ve_nf_ct_expect_max;
++
++	ve_nf_ct_sysctl_header = register_net_sysctl_table(get_exec_env()->ve_netns,
++							   nf_ct_path, nf_table);
++	if (ve_nf_ct_sysctl_header == NULL)
++		goto err_reg;
++
+ 	return 0;
+ 
++err_reg:
++	if (ct_table != nf_ct_sysctl_table)
++		kfree(ct_table);
++err_ctt:
++	if (nf_table != nf_ct_netfilter_table)
++		kfree(nf_table);
++err_nft:
++	return -ENOMEM;
+ }
+ 
+ static void nf_conntrack_standalone_fini_sysctl(void)
+ {
+-	unregister_sysctl_table(nf_ct_sysctl_header);
++	struct ctl_table *table = ve_nf_ct_sysctl_header->ctl_table_arg;
++
++	unregister_net_sysctl_table(ve_nf_ct_sysctl_header);
++
++	if (!ve_is_super(get_exec_env())) {
++		kfree(table[0].child);
++		kfree(table);
++	}
++
+ }
+ #else
+ static int nf_conntrack_standalone_init_sysctl(void)
+@@ -446,7 +497,7 @@ static void nf_conntrack_standalone_fini_sysctl(void)
+ }
+ #endif /* CONFIG_SYSCTL */
+ 
+-static int __init nf_conntrack_standalone_init(void)
++static int nf_conntrack_init_ve(void)
+ {
+ 	int ret;
+ 
+@@ -469,13 +520,34 @@ out:
+ 	return ret;
+ }
+ 
+-static void __exit nf_conntrack_standalone_fini(void)
++static void nf_conntrack_cleanup_ve(void)
+ {
+ 	nf_conntrack_standalone_fini_sysctl();
+ 	nf_conntrack_standalone_fini_proc();
+ 	nf_conntrack_cleanup();
+ }
+ 
++static int __init nf_conntrack_standalone_init(void)
++{
++#ifdef CONFIG_VE_IPTABLES
++	KSYMRESOLVE(nf_conntrack_init_ve);
++	KSYMRESOLVE(nf_conntrack_cleanup_ve);
++	KSYMMODRESOLVE(nf_conntrack);
++#endif
++
++	return nf_conntrack_init_ve();
++}
++
++static void __exit nf_conntrack_standalone_fini(void)
++{
++#ifdef CONFIG_VE_IPTABLES
++	KSYMMODUNRESOLVE(nf_conntrack);
++	KSYMUNRESOLVE(nf_conntrack_init_ve);
++	KSYMUNRESOLVE(nf_conntrack_cleanup_ve);
++#endif
++	nf_conntrack_cleanup_ve();
++}
++
+ module_init(nf_conntrack_standalone_init);
+ module_exit(nf_conntrack_standalone_fini);
+ 
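
Module init no longer just sets conntrack up for the host: it also
publishes nf_conntrack_init_ve()/nf_conntrack_cleanup_ve() through
KSYMRESOLVE so the VE start/stop path can call them per container. A
toy model of that indirection (this registry is a stand-in for the
OpenVZ nfcalls machinery, not its real API):

	#include <string.h>

	struct ve_hook { const char *name; int (*fn)(void); };
	static struct ve_hook registry[8];

	/* Module side: publish a per-VE entry point by name. */
	static void ksym_resolve(const char *name, int (*fn)(void))
	{
		for (int i = 0; i < 8; i++)
			if (registry[i].name == NULL) {
				registry[i] = (struct ve_hook){ name, fn };
				return;
			}
	}

	/* VE lifecycle side: look the hook up and run it in the new
	 * container's context. */
	static int ve_call(const char *name)
	{
		for (int i = 0; i < 8; i++)
			if (registry[i].name && !strcmp(registry[i].name, name))
				return registry[i].fn();
		return -1;	/* module not loaded, hook absent */
	}
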
+diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
+index 69d699f..aa01c54 100644
+--- a/net/netfilter/nf_sockopt.c
++++ b/net/netfilter/nf_sockopt.c
+@@ -65,9 +65,6 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf,
+ {
+ 	struct nf_sockopt_ops *ops;
+ 
+-	if (sock_net(sk) != &init_net)
+-		return ERR_PTR(-ENOPROTOOPT);
+-
+ 	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
+ 		return ERR_PTR(-EINTR);
+ 
+diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
+index b75c9c4..04491ab 100644
+--- a/net/netfilter/nfnetlink.c
++++ b/net/netfilter/nfnetlink.c
+@@ -124,7 +124,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+ 	const struct nfnetlink_subsystem *ss;
+ 	int type, err;
+ 
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	/* All the messages must at least contain nfgenmsg */
+diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
+index 3447025..9655cff 100644
+--- a/net/netfilter/nfnetlink_queue.c
++++ b/net/netfilter/nfnetlink_queue.c
+@@ -556,9 +556,6 @@ nfqnl_rcv_dev_event(struct notifier_block *this,
+ {
+ 	struct net_device *dev = ptr;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	/* Drop any packets associated with the downed device */
+ 	if (event == NETDEV_DOWN)
+ 		nfqnl_dev_drop(dev->ifindex);
+@@ -587,8 +584,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
+ 			struct hlist_head *head = &instance_table[i];
+ 
+ 			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+-				if ((n->net == &init_net) &&
+-				    (n->pid == inst->peer_pid))
++				if (n->pid == inst->peer_pid)
+ 					__instance_destroy(inst);
+ 			}
+ 		}
+diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
+index 5d75cd8..7ffe66a 100644
+--- a/net/netfilter/x_tables.c
++++ b/net/netfilter/x_tables.c
+@@ -24,6 +24,8 @@
+ #include <linux/mm.h>
+ #include <net/net_namespace.h>
+ 
++#include <bc/kmem.h>
++
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter_arp.h>
+ 
+@@ -64,6 +66,46 @@ static const char *const xt_prefix[NPROTO] = {
+ 	[NF_ARP]	= "arp",
+ };
+ 
++#ifdef CONFIG_BEANCOUNTERS
++static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info)
++{
++	struct user_beancounter *ub;
++
++	for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent);
++	return ub;
++}
++
++static void uncharge_xtables(struct xt_table_info *info, unsigned long size)
++{
++	struct user_beancounter *ub;
++
++	ub = xt_table_ub(info);
++	uncharge_beancounter(ub, UB_NUMXTENT, size);
++}
++
++static int recharge_xtables(int check_ub,
++		struct xt_table_info *new, struct xt_table_info *old)
++{
++	struct user_beancounter *ub;
++	long change;
++
++	ub = xt_table_ub(new);
++	BUG_ON(check_ub && ub != xt_table_ub(old));
++
++	change = (long)new->number - (long)old->number;
++	if (change > 0) {
++		if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT))
++			return -ENOMEM;
++	} else if (change < 0)
++		uncharge_beancounter(ub, UB_NUMXTENT, -change);
++
++	return 0;
++}
++#else
++#define recharge_xtables(c, new, old)	(0)
++#define uncharge_xtables(info, s)	do { } while (0)
++#endif	/* CONFIG_BEANCOUNTERS */
++
+ /* Registration hooks for targets. */
+ int
+ xt_register_target(struct xt_target *target)
+@@ -312,23 +354,23 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
+ 		   unsigned short proto, int inv_proto)
+ {
+ 	if (XT_ALIGN(match->matchsize) != size) {
+-		printk("%s_tables: %s match: invalid size %Zu != %u\n",
++		ve_printk(VE_LOG, "%s_tables: %s match: invalid size %Zu != %u\n",
+ 		       xt_prefix[family], match->name,
+ 		       XT_ALIGN(match->matchsize), size);
+ 		return -EINVAL;
+ 	}
+ 	if (match->table && strcmp(match->table, table)) {
+-		printk("%s_tables: %s match: only valid in %s table, not %s\n",
++		ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n",
+ 		       xt_prefix[family], match->name, match->table, table);
+ 		return -EINVAL;
+ 	}
+ 	if (match->hooks && (hook_mask & ~match->hooks) != 0) {
+-		printk("%s_tables: %s match: bad hook_mask %u/%u\n",
++		ve_printk(VE_LOG, "%s_tables: %s match: bad hook_mask %u/%u\n",
+ 		       xt_prefix[family], match->name, hook_mask, match->hooks);
+ 		return -EINVAL;
+ 	}
+ 	if (match->proto && (match->proto != proto || inv_proto)) {
+-		printk("%s_tables: %s match: only valid for protocol %u\n",
++		ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n",
+ 		       xt_prefix[family], match->name, match->proto);
+ 		return -EINVAL;
+ 	}
+@@ -453,24 +495,24 @@ int xt_check_target(const struct xt_target *target, unsigned short family,
+ 		    unsigned short proto, int inv_proto)
+ {
+ 	if (XT_ALIGN(target->targetsize) != size) {
+-		printk("%s_tables: %s target: invalid size %Zu != %u\n",
++		ve_printk(VE_LOG, "%s_tables: %s target: invalid size %Zu != %u\n",
+ 		       xt_prefix[family], target->name,
+ 		       XT_ALIGN(target->targetsize), size);
+ 		return -EINVAL;
+ 	}
+ 	if (target->table && strcmp(target->table, table)) {
+-		printk("%s_tables: %s target: only valid in %s table, not %s\n",
++		ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n",
+ 		       xt_prefix[family], target->name, target->table, table);
+ 		return -EINVAL;
+ 	}
+ 	if (target->hooks && (hook_mask & ~target->hooks) != 0) {
+-		printk("%s_tables: %s target: bad hook_mask %u/%u\n",
++		ve_printk(VE_LOG, "%s_tables: %s target: bad hook_mask %u/%u\n",
+ 		       xt_prefix[family], target->name, hook_mask,
+ 		       target->hooks);
+ 		return -EINVAL;
+ 	}
+ 	if (target->proto && (target->proto != proto || inv_proto)) {
+-		printk("%s_tables: %s target: only valid for protocol %u\n",
++		ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n",
+ 		       xt_prefix[family], target->name, target->proto);
+ 		return -EINVAL;
+ 	}
+@@ -550,19 +592,19 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
+ 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages)
+ 		return NULL;
+ 
+-	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
++	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC);
+ 	if (!newinfo)
+ 		return NULL;
+ 
+-	newinfo->size = size;
++	newinfo->alloc_size = newinfo->size = size;
+ 
+ 	for_each_possible_cpu(cpu) {
+ 		if (size <= PAGE_SIZE)
+ 			newinfo->entries[cpu] = kmalloc_node(size,
+-							GFP_KERNEL,
++							GFP_KERNEL_UBC,
+ 							cpu_to_node(cpu));
+ 		else
+-			newinfo->entries[cpu] = vmalloc_node(size,
++			newinfo->entries[cpu] = ub_vmalloc_node(size,
+ 							cpu_to_node(cpu));
+ 
+ 		if (newinfo->entries[cpu] == NULL) {
+@@ -580,7 +622,7 @@ void xt_free_table_info(struct xt_table_info *info)
+ 	int cpu;
+ 
+ 	for_each_possible_cpu(cpu) {
+-		if (info->size <= PAGE_SIZE)
++		if (info->alloc_size <= PAGE_SIZE)
+ 			kfree(info->entries[cpu]);
+ 		else
+ 			vfree(info->entries[cpu]);
+@@ -645,6 +687,13 @@ xt_replace_table(struct xt_table *table,
+ 		return NULL;
+ 	}
+ 	oldinfo = private;
++
++	if (recharge_xtables(num_counters != 0, newinfo, oldinfo)) {
++		write_unlock_bh(&table->lock);
++		*error = -ENOMEM;
++		return NULL;
++	}
++
+ 	table->private = newinfo;
+ 	newinfo->initial_entries = oldinfo->initial_entries;
+ 	write_unlock_bh(&table->lock);
+@@ -714,6 +763,7 @@ void *xt_unregister_table(struct xt_table *table)
+ 	list_del(&table->list);
+ 	mutex_unlock(&xt[table->af].mutex);
+ 	kfree(table);
++	uncharge_xtables(private, private->number);
+ 
+ 	return private;
+ }
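
The beancounter glue charges UB_NUMXTENT per rule against the topmost
beancounter, and xt_replace_table() charges only the difference, so
the counter tracks the live rule total rather than churn. (The new
alloc_size field presumably exists because ->size can be adjusted
after allocation; xt_free_table_info() now keys its kfree()-vs-vfree()
choice off the size actually allocated.) The delta logic, runnable in
isolation (charged stands in for the beancounter; failure handling
elided):

	#include <stdio.h>

	static long charged;	/* stand-in for the UB_NUMXTENT counter */

	static void recharge(long new_rules, long old_rules)
	{
		long change = new_rules - old_rules;

		if (change > 0)
			charged += change;	/* charge_beancounter(); may fail */
		else if (change < 0)
			charged -= -change;	/* uncharge_beancounter() */
	}

	int main(void)
	{
		recharge(100, 0);	/* initial ruleset load */
		recharge(80, 100);	/* replacement shrinks by 20 */
		printf("charged: %ld\n", charged);	/* prints 80 */
		return 0;
	}
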
+diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
+index 97efd74..d0453de 100644
+--- a/net/netfilter/xt_DSCP.c
++++ b/net/netfilter/xt_DSCP.c
+@@ -73,7 +73,7 @@ dscp_tg_check(const char *tablename, const void *e_void,
+ 	const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
+ 
+ 	if (dscp > XT_DSCP_MAX) {
+-		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
++		ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
+ 		return false;
+ 	}
+ 	return true;
+diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
+index f9ce20b..030ba07 100644
+--- a/net/netfilter/xt_MARK.c
++++ b/net/netfilter/xt_MARK.c
+@@ -80,7 +80,7 @@ mark_tg_check_v0(const char *tablename, const void *entry,
+ 	const struct xt_mark_target_info *markinfo = targinfo;
+ 
+ 	if (markinfo->mark > 0xffffffff) {
+-		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
++		ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ 		return false;
+ 	}
+ 	return true;
+@@ -96,12 +96,12 @@ mark_tg_check_v1(const char *tablename, const void *entry,
+ 	if (markinfo->mode != XT_MARK_SET
+ 	    && markinfo->mode != XT_MARK_AND
+ 	    && markinfo->mode != XT_MARK_OR) {
+-		printk(KERN_WARNING "MARK: unknown mode %u\n",
++		ve_printk(VE_LOG, KERN_WARNING "MARK: unknown mode %u\n",
+ 		       markinfo->mode);
+ 		return false;
+ 	}
+ 	if (markinfo->mark > 0xffffffff) {
+-		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
++		ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ 		return false;
+ 	}
+ 	return true;
+diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
+index 217e2b6..1edbe18 100644
+--- a/net/netfilter/xt_TCPMSS.c
++++ b/net/netfilter/xt_TCPMSS.c
+@@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
+ 	   badly. --RR */
+ 	if (tcplen != tcph->doff*4) {
+ 		if (net_ratelimit())
+-			printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
++			ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
+ 			       skb->len);
+ 		return -1;
+ 	}
+@@ -75,7 +75,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
+ 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
+ 		if (dst_mtu(skb->dst) <= minlen) {
+ 			if (net_ratelimit())
+-				printk(KERN_ERR "xt_TCPMSS: "
++				ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: "
+ 				       "unknown or invalid path-MTU (%u)\n",
+ 				       dst_mtu(skb->dst));
+ 			return -1;
+@@ -269,13 +269,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry,
+ 	    (hook_mask & ~((1 << NF_INET_FORWARD) |
+ 			   (1 << NF_INET_LOCAL_OUT) |
+ 			   (1 << NF_INET_POST_ROUTING))) != 0) {
+-		printk("xt_TCPMSS: path-MTU clamping only supported in "
++		ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
+ 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
+ 		return false;
+ 	}
+ 	if (IPT_MATCH_ITERATE(e, find_syn_match))
+ 		return true;
+-	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
++	ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
+ 	return false;
+ }
+ 
+@@ -292,13 +292,13 @@ tcpmss_tg6_check(const char *tablename, const void *entry,
+ 	    (hook_mask & ~((1 << NF_INET_FORWARD) |
+ 			   (1 << NF_INET_LOCAL_OUT) |
+ 			   (1 << NF_INET_POST_ROUTING))) != 0) {
+-		printk("xt_TCPMSS: path-MTU clamping only supported in "
++		ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
+ 		       "FORWARD, OUTPUT and POSTROUTING hooks\n");
+ 		return false;
+ 	}
+ 	if (IP6T_MATCH_ITERATE(e, find_syn_match))
+ 		return true;
+-	printk("xt_TCPMSS: Only works on TCP SYN packets\n");
++	ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
+ 	return false;
+ }
+ #endif
+diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
+index 6809af5..da1f086 100644
+--- a/net/netfilter/xt_hashlimit.c
++++ b/net/netfilter/xt_hashlimit.c
+@@ -15,6 +15,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
++#include <linux/nsproxy.h>
+ #include <linux/list.h>
+ #include <linux/skbuff.h>
+ #include <linux/mm.h>
+@@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit");
+ MODULE_ALIAS("ip6t_hashlimit");
+ 
+ /* need to declare this at the top */
++#ifdef CONFIG_VE_IPTABLES
++#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4)
++#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6)
++#else
+ static struct proc_dir_entry *hashlimit_procdir4;
+ static struct proc_dir_entry *hashlimit_procdir6;
++#endif
+ static const struct file_operations dl_file_ops;
+ 
+ /* hash table crap */
+@@ -99,9 +105,16 @@ struct xt_hashlimit_htable {
+ 
+ static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
+ static DEFINE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
++#ifdef CONFIG_VE_IPTABLES
++#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables)
++#else
+ static HLIST_HEAD(hashlimit_htables);
++#endif
+ static struct kmem_cache *hashlimit_cachep __read_mostly;
+ 
++static int init_xt_hashlimit(void);
++static void fini_xt_hashlimit(void);
++
+ static inline bool dst_cmp(const struct dsthash_ent *ent,
+ 			   const struct dsthash_dst *b)
+ {
+@@ -702,6 +715,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf,
+ 	if (r->name[sizeof(r->name) - 1] != '\0')
+ 		return false;
+ 
++	if (init_xt_hashlimit())
++		return false;
++
+ 	/* This is the best we've got: We cannot release and re-grab lock,
+ 	 * since checkentry() is called before x_tables.c grabs xt_mutex.
+ 	 * We also cannot grab the hashtable spinlock, since htable_create will
+@@ -748,6 +764,9 @@ hashlimit_mt_check(const char *tablename, const void *inf,
+ 			return false;
+ 	}
+ 
++	if (init_xt_hashlimit())
++		return false;
++
+ 	/* This is the best we've got: We cannot release and re-grab lock,
+ 	 * since checkentry() is called before x_tables.c grabs xt_mutex.
+ 	 * We also cannot grab the hashtable spinlock, since htable_create will
+@@ -770,6 +789,8 @@ hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo)
+ 	const struct xt_hashlimit_info *r = matchinfo;
+ 
+ 	htable_put(r->hinfo);
++	if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
++		fini_xt_hashlimit();
+ }
+ 
+ static void
+@@ -778,6 +799,8 @@ hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo)
+ 	const struct xt_hashlimit_mtinfo1 *info = matchinfo;
+ 
+ 	htable_put(info->hinfo);
++	if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
++		fini_xt_hashlimit();
+ }
+ 
+ #ifdef CONFIG_COMPAT
+@@ -980,6 +1003,76 @@ static const struct file_operations dl_file_ops = {
+ 	.release = seq_release
+ };
+ 
++static inline struct proc_dir_entry *proc_from_netns(void)
++{
++#if defined(CONFIG_VE)
++	return get_exec_env()->ve_netns->proc_net;
++#else
++	return init_net.proc_net;
++#endif
++}
++
++static int init_xt_hashlimit(void)
++{
++	struct proc_dir_entry *proc_net = proc_from_netns();
++
++#if defined(CONFIG_VE_IPTABLES)
++	struct ve_struct *ve = get_exec_env();
++
++	if (ve->_xt_hashlimit)
++		return 0;
++
++	ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL);
++	if (!ve->_xt_hashlimit)
++		goto err1;
++#endif
++	INIT_HLIST_HEAD(&hashlimit_htables);
++
++	hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net);
++	if (!hashlimit_procdir4) {
++		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
++				"entry\n");
++		goto err2;
++	}
++#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
++	hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net);
++	if (!hashlimit_procdir6) {
++		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
++				"entry\n");
++		goto err3;
++	}
++#endif
++
++	return 0;
++
++err3:
++	remove_proc_entry("ipt_hashlimit", proc_net);
++err2:
++#if defined(CONFIG_VE_IPTABLES)
++	kfree(ve->_xt_hashlimit);
++	ve->_xt_hashlimit = NULL;
++err1:
++#endif
++	return -ENOMEM;
++}
++
++static void fini_xt_hashlimit(void)
++{
++	struct proc_dir_entry *proc_net = proc_from_netns();
++#ifdef CONFIG_VE_IPTABLES
++	struct ve_struct *ve = get_exec_env();
++#endif
++#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
++	remove_proc_entry("ip6t_hashlimit", proc_net);
++#endif
++	remove_proc_entry("ipt_hashlimit", proc_net);
++
++#if defined(CONFIG_VE_IPTABLES)
++	kfree(ve->_xt_hashlimit);
++	ve->_xt_hashlimit = NULL;
++#endif
++}
++
+ static int __init hashlimit_mt_init(void)
+ {
+ 	int err;
+@@ -997,24 +1090,11 @@ static int __init hashlimit_mt_init(void)
+ 		printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n");
+ 		goto err2;
+ 	}
+-	hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net);
+-	if (!hashlimit_procdir4) {
+-		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+-				"entry\n");
++	err = init_xt_hashlimit();
++	if (err)
+ 		goto err3;
+-	}
+-	err = 0;
+-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+-	hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net);
+-	if (!hashlimit_procdir6) {
+-		printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+-				"entry\n");
+-		err = -ENOMEM;
+-	}
+-#endif
+ 	if (!err)
+ 		return 0;
+-	remove_proc_entry("ipt_hashlimit", init_net.proc_net);
+ err3:
+ 	kmem_cache_destroy(hashlimit_cachep);
+ err2:
+@@ -1026,10 +1106,7 @@ err1:
+ 
+ static void __exit hashlimit_mt_exit(void)
+ {
+-	remove_proc_entry("ipt_hashlimit", init_net.proc_net);
+-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+-	remove_proc_entry("ip6t_hashlimit", init_net.proc_net);
+-#endif
++	fini_xt_hashlimit();
+ 	kmem_cache_destroy(hashlimit_cachep);
+ 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
+ }
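
xt_hashlimit moves from load-time proc setup to lazy per-VE setup:
init_xt_hashlimit() runs on the first checkentry() in a VE, and
fini_xt_hashlimit() tears the state down when the last hashtable in a
non-root VE disappears. The lifecycle in miniature (ve_state is a toy
stand-in for ve->_xt_hashlimit):

	#include <stdlib.h>

	struct ve_state { int htables; };
	static struct ve_state *ve_state;	/* per-VE in the real patch */

	static int lazy_init(void)
	{
		if (ve_state != NULL)
			return 0;		/* already initialized */
		ve_state = calloc(1, sizeof(*ve_state));
		return ve_state ? 0 : -1;	/* -ENOMEM */
	}

	static void htable_put(void)
	{
		if (--ve_state->htables == 0) {	/* last table in this VE */
			free(ve_state);		/* skipped for VE0 above */
			ve_state = NULL;
		}
	}
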
+diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
+index aad9ab8..91570c7 100644
+--- a/net/netfilter/xt_limit.c
++++ b/net/netfilter/xt_limit.c
+@@ -105,7 +105,7 @@ limit_mt_check(const char *tablename, const void *inf,
+ 	/* Check for overflow. */
+ 	if (r->burst == 0
+ 	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+-		printk("Overflow in xt_limit, try lower: %u/%u\n",
++		ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n",
+ 		       r->avg, r->burst);
+ 		return false;
+ 	}
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index 349aba1..d30766c 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -60,29 +60,14 @@
+ #include <net/sock.h>
+ #include <net/scm.h>
+ #include <net/netlink.h>
++#include <net/netlink_sock.h>
++
++#include <bc/beancounter.h>
++#include <bc/net.h>
+ 
+ #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
+ #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
+ 
+-struct netlink_sock {
+-	/* struct sock has to be the first member of netlink_sock */
+-	struct sock		sk;
+-	u32			pid;
+-	u32			dst_pid;
+-	u32			dst_group;
+-	u32			flags;
+-	u32			subscriptions;
+-	u32			ngroups;
+-	unsigned long		*groups;
+-	unsigned long		state;
+-	wait_queue_head_t	wait;
+-	struct netlink_callback	*cb;
+-	struct mutex		*cb_mutex;
+-	struct mutex		cb_def_mutex;
+-	void			(*netlink_rcv)(struct sk_buff *skb);
+-	struct module		*module;
+-};
+-
+ #define NETLINK_KERNEL_SOCKET	0x1
+ #define NETLINK_RECV_PKTINFO	0x2
+ 
+@@ -401,6 +386,8 @@ static int __netlink_create(struct net *net, struct socket *sock,
+ 	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+ 	if (!sk)
+ 		return -ENOMEM;
++	if (ub_other_sock_charge(sk))
++		goto out_free;
+ 
+ 	sock_init_data(sock, sk);
+ 
+@@ -416,6 +403,10 @@ static int __netlink_create(struct net *net, struct socket *sock,
+ 	sk->sk_destruct = netlink_sock_destruct;
+ 	sk->sk_protocol = protocol;
+ 	return 0;
++
++out_free:
++	sk_free(sk);
++	return -ENOMEM;
+ }
+ 
+ static int netlink_create(struct net *net, struct socket *sock, int protocol)
+@@ -522,7 +513,7 @@ static int netlink_autobind(struct socket *sock)
+ 	struct hlist_head *head;
+ 	struct sock *osk;
+ 	struct hlist_node *node;
+-	s32 pid = current->tgid;
++	s32 pid = task_tgid_vnr(current);
+ 	int err;
+ 	static s32 rover = -4097;
+ 
+@@ -558,7 +549,7 @@ retry:
+ static inline int netlink_capable(struct socket *sock, unsigned int flag)
+ {
+ 	return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||
+-	       capable(CAP_NET_ADMIN);
++	       capable(CAP_VE_NET_ADMIN);
+ }
+ 
+ static void
+@@ -763,12 +754,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+ 		      long *timeo, struct sock *ssk)
+ {
+ 	struct netlink_sock *nlk;
++	unsigned long chargesize;
++	int no_ubc;
+ 
+ 	nlk = nlk_sk(sk);
+ 
+-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
++	chargesize = skb_charge_fullsize(skb);
++	no_ubc = ub_sock_getwres_other(sk, chargesize);
++	if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ 	    test_bit(0, &nlk->state)) {
+ 		DECLARE_WAITQUEUE(wait, current);
++
++		if (!no_ubc)
++			ub_sock_retwres_other(sk, chargesize,
++					      SOCK_MIN_UBCSPACE_CH);
+ 		if (!*timeo) {
+ 			if (!ssk || netlink_is_kernel(ssk))
+ 				netlink_overrun(sk);
+@@ -780,13 +779,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+ 		__set_current_state(TASK_INTERRUPTIBLE);
+ 		add_wait_queue(&nlk->wait, &wait);
+ 
++		/* this if can't be moved upper because ub_sock_snd_queue_add()
++		 * may change task state to TASK_RUNNING */
++		if (no_ubc)
++			ub_sock_sndqueueadd_other(sk, chargesize);
++
+ 		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+-		     test_bit(0, &nlk->state)) &&
++		     test_bit(0, &nlk->state) || no_ubc) &&
+ 		    !sock_flag(sk, SOCK_DEAD))
+ 			*timeo = schedule_timeout(*timeo);
+ 
+ 		__set_current_state(TASK_RUNNING);
+ 		remove_wait_queue(&nlk->wait, &wait);
++		if (no_ubc)
++			ub_sock_sndqueuedel(sk);
+ 		sock_put(sk);
+ 
+ 		if (signal_pending(current)) {
+@@ -796,6 +802,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+ 		return 1;
+ 	}
+ 	skb_set_owner_r(skb, sk);
++	ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
+ 	return 0;
+ }
+ 
+@@ -961,6 +968,9 @@ static inline int do_one_broadcast(struct sock *sk,
+ 	    !test_bit(p->group - 1, nlk->groups))
+ 		goto out;
+ 
++	if (!ve_accessible_strict(get_exec_env(), sk->owner_env))
++		goto out;
++
+ 	if (!net_eq(sock_net(sk), p->net))
+ 		goto out;
+ 
+@@ -1530,6 +1540,10 @@ static int netlink_dump(struct sock *sk)
+ 	skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
+ 	if (!skb)
+ 		goto errout;
++	if (ub_nlrcvbuf_charge(skb, sk) < 0) {
++		kfree_skb(skb);
++		return -EACCES;
++	}
+ 
+ 	mutex_lock(nlk->cb_mutex);
+ 
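
netlink_attachskb() now has three reasons to park a sender instead of
queueing: the UBC charge failed, the receive buffer is over limit, or
the socket is congested; a nonblocking sender gets -EAGAIN. The
decision tree, flattened into a standalone helper (error numbers
hard-coded for illustration):

	/* 0: the charge sticks and the skb is queued; 1: the caller
	 * sleeps on the wait queue(s) and retries; negative:
	 * nonblocking failure. */
	static int attach_decision(int charge_failed, int rmem_over,
				   int congested, long timeo)
	{
		if (charge_failed || rmem_over || congested) {
			if (timeo == 0)
				return -11;	/* -EAGAIN */
			return 1;
		}
		return 0;
	}
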
+diff --git a/net/netlink/attr.c b/net/netlink/attr.c
+index 2d106cf..d9846a4 100644
+--- a/net/netlink/attr.c
++++ b/net/netlink/attr.c
+@@ -164,7 +164,7 @@ int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+ 	}
+ 
+ 	if (unlikely(rem > 0))
+-		printk(KERN_WARNING "netlink: %d bytes leftover after parsing "
++		ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing "
+ 		       "attributes.\n", rem);
+ 
+ 	err = 0;
+diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
+index 3e1191c..f5c0578 100644
+--- a/net/netlink/genetlink.c
++++ b/net/netlink/genetlink.c
+@@ -437,7 +437,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+ 		return -EOPNOTSUPP;
+ 
+ 	if ((ops->flags & GENL_ADMIN_PERM) &&
+-	    security_netlink_recv(skb, CAP_NET_ADMIN))
++	    security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
+index 2cee87d..1b7fbf4 100644
+--- a/net/packet/af_packet.c
++++ b/net/packet/af_packet.c
+@@ -80,6 +80,8 @@
+ #include <linux/module.h>
+ #include <linux/init.h>
+ 
++#include <bc/net.h>
++
+ #ifdef CONFIG_INET
+ #include <net/inet_common.h>
+ #endif
+@@ -454,6 +456,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
+ 	if (dev_net(dev) != sock_net(sk))
+ 		goto drop;
+ 
++	skb_orphan(skb);
++
+ 	skb->dev = dev;
+ 
+ 	if (dev->header_ops) {
+@@ -517,6 +521,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
+ 	if (pskb_trim(skb, snaplen))
+ 		goto drop_n_acct;
+ 
++	if (ub_sockrcvbuf_charge(sk, skb))
++		goto drop_n_acct;
++
+ 	skb_set_owner_r(skb, sk);
+ 	skb->dev = NULL;
+ 	dst_release(skb->dst);
+@@ -571,6 +578,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
+ 	if (dev_net(dev) != sock_net(sk))
+ 		goto drop;
+ 
++	skb_orphan(skb);
++
+ 	if (dev->header_ops) {
+ 		if (sk->sk_type != SOCK_DGRAM)
+ 			skb_push(skb, skb->data - skb_mac_header(skb));
+@@ -617,6 +626,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
+ 			snaplen = 0;
+ 	}
+ 
++	if (copy_skb &&
++	    ub_sockrcvbuf_charge(sk, copy_skb)) {
++		spin_lock(&sk->sk_receive_queue.lock);
++		goto ring_is_full;
++	}
++
+ 	spin_lock(&sk->sk_receive_queue.lock);
+ 	h = packet_lookup_frame(po, po->head);
+ 
+@@ -982,6 +997,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
+ 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+ 	if (sk == NULL)
+ 		goto out;
++	if (ub_other_sock_charge(sk))
++		goto out_free;
+ 
+ 	sock->ops = &packet_ops;
+ 	if (sock->type == SOCK_PACKET)
+@@ -1019,6 +1036,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
+ 	sk_add_node(sk, &net->packet.sklist);
+ 	write_unlock_bh(&net->packet.sklist_lock);
+ 	return(0);
++
++out_free:
++	sk_free(sk);
+ out:
+ 	return err;
+ }
+diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
+index 2a3c97f..9450c72 100644
+--- a/net/sched/sch_cbq.c
++++ b/net/sched/sch_cbq.c
+@@ -905,8 +905,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
+ 
+ 			if (cl->deficit <= 0) {
+ 				q->active[prio] = cl;
+-				cl = cl->next_alive;
+ 				cl->deficit += cl->quantum;
++				cl = cl->next_alive;
+ 			}
+ 			return skb;
+ 
+@@ -1078,17 +1078,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+ 
+ 	for (h=0; h<16; h++) {
+ 		for (cl = q->classes[h]; cl; cl = cl->next) {
++			long mtu;
+ 			/* BUGGGG... Beware! This expression suffer of
+ 			   arithmetic overflows!
+ 			 */
+ 			if (cl->priority == prio) {
+-				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+-					q->quanta[prio];
+-			}
+-			if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
+-				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
+-				cl->quantum = cl->qdisc->dev->mtu/2 + 1;
++				cl->quantum = (cl->weight * cl->allot) /
++					(q->quanta[prio] / q->nclasses[prio]);
+ 			}
++			mtu = cl->qdisc->dev->mtu;
++			if (cl->quantum <= mtu/2)
++				cl->quantum = mtu/2 + 1;
++			else if (cl->quantum > 32*mtu)
++				cl->quantum = 32*mtu;
+ 		}
+ 	}
+ }
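
Two independent CBQ fixes ride along here. In cbq_dequeue_prio() the
deficit is now credited to the class being rotated out rather than
its successor (the old code advanced cl first). In
cbq_normalize_quanta() the formula is rearranged so nclasses divides
quanta instead of multiplying the numerator, shrinking the
intermediate product, and out-of-range quanta are clamped instead of
being reset to mtu/2+1 with a warning. The clamp, extracted:

	/* Force a quantum into (mtu/2, 32*mtu], the bounds the old
	 * code only warned about. */
	static long clamp_quantum(long quantum, long mtu)
	{
		if (quantum <= mtu / 2)
			return mtu / 2 + 1;
		if (quantum > 32 * mtu)
			return 32 * mtu;
		return quantum;
	}
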
+diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
+index 13afa72..1365604 100644
+--- a/net/sched/sch_generic.c
++++ b/net/sched/sch_generic.c
+@@ -141,11 +141,13 @@ static inline int qdisc_restart(struct net_device *dev)
+ 	struct Qdisc *q = dev->qdisc;
+ 	struct sk_buff *skb;
+ 	int ret = NETDEV_TX_BUSY;
++	struct ve_struct *old_ve;
+ 
+ 	/* Dequeue packet */
+ 	if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL))
+ 		return 0;
+ 
++	old_ve = set_exec_env(skb->owner_env);
+ 
+ 	/* And release queue */
+ 	spin_unlock(&dev->queue_lock);
+@@ -179,6 +181,8 @@ static inline int qdisc_restart(struct net_device *dev)
+ 		break;
+ 	}
+ 
++	(void)set_exec_env(old_ve);
++
+ 	return ret;
+ }
+ 
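
qdisc_restart() wraps the actual transmit in
set_exec_env(skb->owner_env) so per-VE accounting and netfilter state
resolve against the packet's owner rather than whoever kicked the
queue. Every such hunk in this patch follows the same
save/switch/restore idiom (set_exec_env() is stubbed here; the
invariant is that the saved value, never a constant, is restored on
every exit path):

	struct ve;			/* opaque for the sketch */
	static struct ve *current_ve;

	static struct ve *set_exec_env(struct ve *ve)
	{
		struct ve *old = current_ve;	/* returning it allows nesting */
		current_ve = ve;
		return old;
	}

	static void xmit_in_owner_context(struct ve *owner)
	{
		struct ve *old = set_exec_env(owner);
		/* ... dev->hard_start_xmit() equivalent runs here ... */
		(void)set_exec_env(old);	/* restore on every path */
	}
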
+diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
+index 0444fd0..57e0989 100644
+--- a/net/sched/sch_teql.c
++++ b/net/sched/sch_teql.c
+@@ -174,6 +174,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
+ 	struct teql_master *m = (struct teql_master*)sch->ops;
+ 	struct teql_sched_data *q = qdisc_priv(sch);
+ 
++	if (!capable(CAP_NET_ADMIN))
++		return -EPERM;
++
+ 	if (dev->hard_header_len > m->dev->hard_header_len)
+ 		return -EINVAL;
+ 
+diff --git a/net/socket.c b/net/socket.c
+index 66c4a8c..58a9495 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -84,6 +84,7 @@
+ #include <linux/kmod.h>
+ #include <linux/audit.h>
+ #include <linux/wireless.h>
++#include <linux/in.h>
+ #include <linux/nsproxy.h>
+ 
+ #include <asm/uaccess.h>
+@@ -159,15 +160,6 @@ static DEFINE_PER_CPU(int, sockets_in_use) = 0;
+  * divide and look after the messy bits.
+  */
+ 
+-#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
+-					   16 for IP, 16 for IPX,
+-					   24 for IPv6,
+-					   about 80 for AX.25
+-					   must be at least one bigger than
+-					   the AF_UNIX size (see net/unix/af_unix.c
+-					   :unix_mkname()).
+-					 */
+-
+ /**
+  *	move_addr_to_kernel	-	copy a socket address into kernel space
+  *	@uaddr: Address in user space
+@@ -189,6 +181,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
+ 		return -EFAULT;
+ 	return audit_sockaddr(ulen, kaddr);
+ }
++EXPORT_SYMBOL(move_addr_to_kernel);
+ 
+ /**
+  *	move_addr_to_user	-	copy an address to user space
+@@ -496,6 +489,8 @@ static struct socket *sock_alloc(void)
+ 	return sock;
+ }
+ 
++EXPORT_SYMBOL(sock_alloc);
++
+ /*
+  *	In theory you can't get an open on this inode, but /proc provides
+  *	a back door. Remember to keep it shut otherwise you'll let the
+@@ -1090,6 +1085,49 @@ call_kill:
+ 	return 0;
+ }
+ 
++int vz_security_family_check(int family)
++{
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env()))
++		return 0;
++
++	switch (family) {
++	case PF_UNSPEC:
++	case PF_PACKET:
++	case PF_NETLINK:
++	case PF_UNIX:
++	case PF_INET:
++	case PF_INET6:
++		break;
++	default:
++		return -EAFNOSUPPORT;
++	}
++#endif
++	return 0;
++}
++EXPORT_SYMBOL_GPL(vz_security_family_check);
++
++int vz_security_protocol_check(int protocol)
++{
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env()))
++		return 0;
++
++	switch (protocol) {
++	case  IPPROTO_IP:
++	case  IPPROTO_TCP:
++	case  IPPROTO_UDP:
++	case  IPPROTO_RAW:
++	case  IPPROTO_DCCP:
++		break;
++	default:
++		return -EAFNOSUPPORT;
++	}
++#endif
++	return 0;
++}
++EXPORT_SYMBOL_GPL(vz_security_protocol_check);
++
+ static int __sock_create(struct net *net, int family, int type, int protocol,
+ 			 struct socket **res, int kern)
+ {
+@@ -1120,6 +1158,11 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
+ 		family = PF_PACKET;
+ 	}
+ 
++	/* VZ compatibility layer */
++	err = vz_security_family_check(family);
++	if (err < 0)
++		return err;
++
+ 	err = security_socket_create(family, type, protocol, kern);
+ 	if (err)
+ 		return err;
+@@ -2314,9 +2357,12 @@ int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
+ {
+ 	mm_segment_t oldfs = get_fs();
+ 	int err;
++	struct ve_struct *old_env;
+ 
+ 	set_fs(KERNEL_DS);
++	old_env = set_exec_env(get_ve0());
+ 	err = sock->ops->ioctl(sock, cmd, arg);
++	(void)set_exec_env(old_env);
+ 	set_fs(oldfs);
+ 
+ 	return err;
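
These two whitelists are the VE boundary for socket creation: inside
a container only the listed families and protocols are admitted, and
VE0 bypasses the check entirely. (Both helpers return -EAFNOSUPPORT,
including the protocol variant; kernel_sock_ioctl(), used by
kernel-internal callers, is instead pinned to VE0.) The shape of the
check, with the numeric PF_* values inlined so the sketch builds
standalone:

	/* Returns 0 if the family may be used in this context. */
	static int family_allowed(int in_container, int family)
	{
		if (!in_container)	/* VE0: everything goes */
			return 0;

		switch (family) {
		case 0:			/* PF_UNSPEC  */
		case 1:			/* PF_UNIX    */
		case 2:			/* PF_INET    */
		case 10:		/* PF_INET6   */
		case 16:		/* PF_NETLINK */
		case 17:		/* PF_PACKET  */
			return 0;
		default:
			return -97;	/* -EAFNOSUPPORT */
		}
	}
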
+diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
+index 6eab9bf..4fba93a 100644
+--- a/net/sunrpc/sched.c
++++ b/net/sunrpc/sched.c
+@@ -615,7 +615,9 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
+ static void __rpc_execute(struct rpc_task *task)
+ {
+ 	int		status = 0;
++	struct ve_struct *env;
+ 
++	env = set_exec_env(get_ve0());
+ 	dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
+ 			task->tk_pid, task->tk_flags);
+ 
+@@ -693,6 +695,7 @@ static void __rpc_execute(struct rpc_task *task)
+ 			task->tk_status);
+ 	/* Release all resources associated with the task */
+ 	rpc_release_task(task);
++	(void)set_exec_env(env);
+ }
+ 
+ /*
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 3e65719..029c673 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -178,6 +178,9 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+ 	unsigned int	pglen = xdr->page_len;
+ 	unsigned int	flags = MSG_MORE;
+ 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
++
++	old_env = set_exec_env(get_ve0());
+ 
+ 	slen = xdr->len;
+ 
+@@ -238,6 +241,8 @@ out:
+ 		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
+ 		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
+ 
++	(void)set_exec_env(old_env);
++
+ 	return len;
+ }
+ 
+@@ -316,11 +321,14 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+ 		.msg_flags	= MSG_DONTWAIT,
+ 	};
+ 	int len;
++	struct ve_struct *old_env;
+ 
+ 	rqstp->rq_xprt_hlen = 0;
+ 
++	old_env = set_exec_env(get_ve0());
+ 	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
+ 				msg.msg_flags);
++	(void)set_exec_env(old_env);
+ 
+ 	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ 		svsk, iov[0].iov_base, iov[0].iov_len, len);
+@@ -719,11 +727,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+ 	struct svc_sock	*newsvsk;
+ 	int		err, slen;
+ 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
+ 
+ 	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ 	if (!sock)
+ 		return NULL;
+ 
++	old_env = set_exec_env(get_ve0());
+ 	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ 	err = kernel_accept(sock, &newsock, O_NONBLOCK);
+ 	if (err < 0) {
+@@ -733,7 +743,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+ 		else if (err != -EAGAIN && net_ratelimit())
+ 			printk(KERN_WARNING "%s: accept failed (err %d)!\n",
+ 				   serv->sv_name, -err);
+-		return NULL;
++		goto restore;
+ 	}
+ 	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ 
+@@ -774,6 +784,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+ 	}
+ 	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+ 
++	(void)set_exec_env(old_env);
++
+ 	if (serv->sv_stats)
+ 		serv->sv_stats->nettcpconn++;
+ 
+@@ -781,6 +793,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+ 
+ failed:
+ 	sock_release(newsock);
++restore:
++	(void)set_exec_env(old_env);
+ 	return NULL;
+ }
+ 
+@@ -1211,6 +1225,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+ 	struct sockaddr *newsin = (struct sockaddr *)&addr;
+ 	int		newlen;
+ 	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
+ 
+ 	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
+ 			serv->sv_program->pg_name, protocol,
+@@ -1223,9 +1238,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+ 	}
+ 	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+ 
++	old_env = set_exec_env(get_ve0());
+ 	error = sock_create_kern(sin->sa_family, type, protocol, &sock);
+ 	if (error < 0)
+-		return ERR_PTR(error);
++		goto restore;
+ 
+ 	svc_reclassify_socket(sock);
+ 
+@@ -1247,12 +1263,15 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+ 
+ 	if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
+ 		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
++		(void)set_exec_env(old_env);
+ 		return (struct svc_xprt *)svsk;
+ 	}
+ 
+ bummer:
+ 	dprintk("svc: svc_create_socket error = %d\n", -error);
+ 	sock_release(sock);
++restore:
++	(void)set_exec_env(old_env);
+ 	return ERR_PTR(error);
+ }
+ 
+@@ -1267,6 +1286,8 @@ static void svc_sock_detach(struct svc_xprt *xprt)
+ 
+ 	dprintk("svc: svc_sock_detach(%p)\n", svsk);
+ 
++	/* XXX: serialization? */
++	sk->sk_user_data = NULL;
+ 	/* put back the old socket callbacks */
+ 	sk->sk_state_change = svsk->sk_ostate;
+ 	sk->sk_data_ready = svsk->sk_odata;
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 783317d..39d2173 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -117,6 +117,9 @@
+ #include <net/checksum.h>
+ #include <linux/security.h>
+ 
++#include <bc/net.h>
++#include <bc/beancounter.h>
++
+ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+ static DEFINE_SPINLOCK(unix_table_lock);
+ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+@@ -593,6 +596,8 @@ static struct sock * unix_create1(struct net *net, struct socket *sock)
+ 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+ 	if (!sk)
+ 		goto out;
++	if (ub_other_sock_charge(sk))
++		goto out_sk_free;
+ 
+ 	sock_init_data(sock,sk);
+ 	lockdep_set_class(&sk->sk_receive_queue.lock,
+@@ -614,6 +619,9 @@ out:
+ 	if (sk == NULL)
+ 		atomic_dec(&unix_nr_socks);
+ 	return sk;
++out_sk_free:
++	sk_free(sk);
++	return NULL;
+ }
+ 
+ static int unix_create(struct net *net, struct socket *sock, int protocol)
+@@ -1015,6 +1023,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ 	int st;
+ 	int err;
+ 	long timeo;
++	unsigned long chargesize;
+ 
+ 	err = unix_mkname(sunaddr, addr_len, &hash);
+ 	if (err < 0)
+@@ -1043,6 +1052,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+ 	if (skb == NULL)
+ 		goto out;
++	chargesize = skb_charge_fullsize(skb);
++	if (ub_sock_getwres_other(newsk, chargesize) < 0)
++			goto out;
++	ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
+ 
+ restart:
+ 	/*  Find listening sock. */
+@@ -1290,7 +1303,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ 		unix_notinflight(scm->fp->fp[i]);
+ }
+ 
+-static void unix_destruct_fds(struct sk_buff *skb)
++void unix_destruct_fds(struct sk_buff *skb)
+ {
+ 	struct scm_cookie scm;
+ 	memset(&scm, 0, sizeof(scm));
+@@ -1301,6 +1314,7 @@ static void unix_destruct_fds(struct sk_buff *skb)
+ 	scm_destroy(&scm);
+ 	sock_wfree(skb);
+ }
++EXPORT_SYMBOL_GPL(unix_destruct_fds);
+ 
+ static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+@@ -1512,6 +1526,16 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+ 
+ 		size = len-sent;
+ 
++		if (msg->msg_flags & MSG_DONTWAIT)
++			ub_sock_makewres_other(sk, skb_charge_size(size));
++		if (sock_bc(sk) != NULL &&
++				sock_bc(sk)->poll_reserv >=
++					SOCK_MIN_UBCSPACE &&
++				skb_charge_size(size) >
++					sock_bc(sk)->poll_reserv)
++			size = skb_charge_datalen(sock_bc(sk)->poll_reserv);
++
++
+ 		/* Keep two messages in the pipe so it schedules better */
+ 		if (size > ((sk->sk_sndbuf >> 1) - 64))
+ 			size = (sk->sk_sndbuf >> 1) - 64;
+@@ -1523,7 +1547,9 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+ 		 *	Grab a buffer
+ 		 */
+ 
+-		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
++
++		skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE,
++				msg->msg_flags&MSG_DONTWAIT, &err);
+ 
+ 		if (skb==NULL)
+ 			goto out_err;
+@@ -1963,6 +1989,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+ {
+ 	struct sock *sk = sock->sk;
+ 	unsigned int mask;
++	int no_ub_res;
+ 
+ 	poll_wait(file, sk->sk_sleep, wait);
+ 	mask = 0;
+@@ -1975,6 +2002,10 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+ 	if (sk->sk_shutdown & RCV_SHUTDOWN)
+ 		mask |= POLLRDHUP;
+ 
++	no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++	if (no_ub_res)
++		ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++
+ 	/* readable? */
+ 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ 	    (sk->sk_shutdown & RCV_SHUTDOWN))
+@@ -1988,7 +2019,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+ 	 * we set writable also when the other side has shut down the
+ 	 * connection. This prevents stuck sockets.
+ 	 */
+-	if (unix_writable(sk))
++	if (!no_ub_res && unix_writable(sk))
+ 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ 
+ 	return mask;
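
The af_unix hunks make UBC pressure visible at both ends of a stream
socket: unix_poll() reports writability only if the SOCK_MIN_UBCSPACE
reserve can be made, and unix_stream_sendmsg() shrinks a chunk to
what the reserve can back instead of blocking. The trimming rule,
simplified (the real code converts between charge size and data
length; an identity conversion is assumed here):

	/* Shrink a write to the beancounter reserve when the reserve
	 * is usable but smaller than the request. */
	static long trim_to_reserve(long want, long reserve, long min_reserve)
	{
		if (reserve >= min_reserve && want > reserve)
			return reserve;
		return want;
	}
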
+diff --git a/net/unix/garbage.c b/net/unix/garbage.c
+index ebdff3d..1ed511f 100644
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -80,6 +80,7 @@
+ #include <linux/file.h>
+ #include <linux/proc_fs.h>
+ #include <linux/mutex.h>
++#include <linux/module.h>
+ 
+ #include <net/sock.h>
+ #include <net/af_unix.h>
+@@ -151,6 +152,7 @@ void unix_notinflight(struct file *fp)
+ 		spin_unlock(&unix_gc_lock);
+ 	}
+ }
++EXPORT_SYMBOL_GPL(unix_notinflight);
+ 
+ static inline struct sk_buff *sock_queue_head(struct sock *sk)
+ {
+diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
+index cae9fd8..420b756 100644
+--- a/net/xfrm/xfrm_policy.c
++++ b/net/xfrm/xfrm_policy.c
+@@ -2360,9 +2360,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
+ {
+ 	struct net_device *dev = ptr;
+ 
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+ 	switch (event) {
+ 	case NETDEV_DOWN:
+ 		xfrm_flush_bundles();
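
Dropping the init_net short-circuit means a NETDEV_DOWN in any network
namespace now flushes the xfrm bundles, which matters once each VE runs
its devices in a namespace of its own. After the hunk the handler
reduces to roughly the following sketch (the tail is reconstructed; the
return statement is not visible in the hunk and is assumed):

    static int xfrm_dev_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
    {
            struct net_device *dev = ptr;  /* kept by the patch, now unused */

            switch (event) {
            case NETDEV_DOWN:
                    xfrm_flush_bundles();
            }
            return NOTIFY_DONE;            /* assumed, not shown above */
    }
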
+diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
+index 04c4150..aa0bad6 100644
+--- a/net/xfrm/xfrm_user.c
++++ b/net/xfrm/xfrm_user.c
+@@ -1947,7 +1947,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+ 	link = &xfrm_dispatch[type];
+ 
+ 	/* All operations require privileges, even GET */
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+ 		return -EPERM;
+ 
+ 	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
+diff --git a/security/Kconfig b/security/Kconfig
+index 49b51f9..f2a0ec8 100644
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS
+ 
+ config SECURITY
+ 	bool "Enable different security models"
+-	depends on SYSFS
++	depends on SYSFS && !VE
+ 	help
+ 	  This allows you to choose different security modules to be
+ 	  configured into your kernel.
+diff --git a/security/commoncap.c b/security/commoncap.c
+index 33d3433..b42b99a 100644
+--- a/security/commoncap.c
++++ b/security/commoncap.c
+@@ -35,6 +35,10 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
+ 
+ int cap_netlink_recv(struct sk_buff *skb, int cap)
+ {
++	if (likely(cap == CAP_VE_NET_ADMIN) &&
++			cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
++		return 0;
++
+ 	if (!cap_raised(NETLINK_CB(skb).eff_cap, cap))
+ 		return -EPERM;
+ 	return 0;
+@@ -399,7 +403,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
+ 		return 0;
+ 	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ 		     sizeof(XATTR_SECURITY_PREFIX) - 1)  &&
+-	    !capable(CAP_SYS_ADMIN))
++	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+ 		return -EPERM;
+ 	return 0;
+ }
+@@ -412,7 +416,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
+ 		return 0;
+ 	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ 		     sizeof(XATTR_SECURITY_PREFIX) - 1)  &&
+-	    !capable(CAP_SYS_ADMIN))
++	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+ 		return -EPERM;
+ 	return 0;
+ }
+@@ -675,7 +679,7 @@ void cap_task_reparent_to_init (struct task_struct *p)
+ 
+ int cap_syslog (int type)
+ {
+-	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
++	if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN))
+ 		return -EPERM;
+ 	return 0;
+ }
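
The CAP_VE_NET_ADMIN plumbing works in two halves: xfrm_user (above)
now asks for CAP_VE_NET_ADMIN instead of CAP_NET_ADMIN, and
cap_netlink_recv() grants such requests to any sender holding plain
CAP_NET_ADMIN, which container root retains inside its VE. The same
pattern recurs with CAP_VE_ADMIN for security xattrs and
CAP_VE_SYS_ADMIN for syslog. Reassembled from the first hunk, the
resulting check reads:

    int cap_netlink_recv(struct sk_buff *skb, int cap)
    {
            /* A VE-grade request is satisfied by ordinary CAP_NET_ADMIN,
             * so container root may manage its own xfrm state... */
            if (likely(cap == CAP_VE_NET_ADMIN) &&
                            cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
                    return 0;

            /* ...while everything else still needs the exact capability. */
            if (!cap_raised(NETLINK_CB(skb).eff_cap, cap))
                    return -EPERM;
            return 0;
    }
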
+diff --git a/security/device_cgroup.c b/security/device_cgroup.c
+index ddd92ce..d1da90a 100644
+--- a/security/device_cgroup.c
++++ b/security/device_cgroup.c
+@@ -10,11 +10,23 @@
+ #include <linux/list.h>
+ #include <linux/uaccess.h>
+ #include <linux/seq_file.h>
++#include <linux/ve.h>
++#include <linux/vzcalluser.h>
++#include <linux/major.h>
+ 
+ #define ACC_MKNOD 1
+ #define ACC_READ  2
+ #define ACC_WRITE 4
+-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
++#define ACC_QUOTA 8
++#define ACC_HIDDEN 16
++#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA)
++
++static inline int convert_bits(int acc)
++{
++	/* ...10x <-> ...01x: swap the read and write bits unless both or neither is set */
++	return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^06) &
++		(ACC_READ | ACC_WRITE | ACC_QUOTA);
++}
+ 
+ #define DEV_BLOCK 1
+ #define DEV_CHAR  2
+@@ -73,6 +85,37 @@ static int devcgroup_can_attach(struct cgroup_subsys *ss,
+ /*
+  * called under cgroup_lock()
+  */
++#ifdef CONFIG_VE
++static struct dev_whitelist_item default_whitelist_items[] = {
++	{ ~0,                     ~0, DEV_ALL,  ACC_MKNOD },
++	{ UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ PTY_MASTER_MAJOR,       ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ PTY_SLAVE_MAJOR,        ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR,	/* null */ 3, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR,    /* zero */ 5, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR,    /* full */ 7, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ TTYAUX_MAJOR,  /* tty */ 0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ TTYAUX_MAJOR, /* ptmx */ 2, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR,  /* random */ 8, DEV_CHAR, ACC_READ },
++	{ MEM_MAJOR, /* urandom */ 9, DEV_CHAR, ACC_READ },
++};
++
++static LIST_HEAD(default_perms);
++#define parent_whitelist(p)	(&default_perms)
++static void prepare_def_perms(void)
++{
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(default_whitelist_items); i++) {
++		default_whitelist_items[i].access |= ACC_HIDDEN;
++		list_add(&default_whitelist_items[i].list, &default_perms);
++	}
++}
++#else
++#define prepare_def_perms()	do { } while(0)
++#define parent_whitelist(p)	(&parent_dev_cgroup->whitelist)
++#endif
++
+ static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
+ {
+ 	struct dev_whitelist_item *wh, *tmp, *new;
+@@ -188,12 +232,14 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss,
+ 		}
+ 		wh->minor = wh->major = ~0;
+ 		wh->type = DEV_ALL;
+-		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE;
++		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA;
+ 		list_add(&wh->list, &dev_cgroup->whitelist);
++
++		prepare_def_perms();
+ 	} else {
+ 		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
+ 		ret = dev_whitelist_copy(&dev_cgroup->whitelist,
+-				&parent_dev_cgroup->whitelist);
++				parent_whitelist(parent_dev_cgroup));
+ 		if (ret) {
+ 			kfree(dev_cgroup);
+ 			return ERR_PTR(ret);
+@@ -269,8 +315,14 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ 		set_access(acc, wh->access);
+ 		set_majmin(maj, wh->major);
+ 		set_majmin(min, wh->minor);
+-		seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
+-			   maj, min, acc);
++
++		if (cft != NULL)
++			seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
++					maj, min, acc);
++		else if (!(wh->access & ACC_HIDDEN))
++			seq_printf(m, "%10u %c %03o %s:%s\n", (int)m->private,
++					type_to_char(wh->type),
++					convert_bits(wh->access), maj, min);
+ 	}
+ 	spin_unlock(&devcgroup->lock);
+ 
+@@ -520,16 +572,13 @@ struct cgroup_subsys devices_subsys = {
+ 	.subsys_id = devices_subsys_id,
+ };
+ 
+-int devcgroup_inode_permission(struct inode *inode, int mask)
++static int __devcgroup_inode_permission(int blk, dev_t device, int mask)
+ {
+ 	struct dev_cgroup *dev_cgroup;
+ 	struct dev_whitelist_item *wh;
+ 
+-	dev_t device = inode->i_rdev;
+ 	if (!device)
+ 		return 0;
+-	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
+-		return 0;
+ 	dev_cgroup = css_to_devcgroup(task_subsys_state(current,
+ 				devices_subsys_id));
+ 	if (!dev_cgroup)
+@@ -539,19 +588,21 @@ int devcgroup_inode_permission(struct inode *inode, int mask)
+ 	list_for_each_entry(wh, &dev_cgroup->whitelist, list) {
+ 		if (wh->type & DEV_ALL)
+ 			goto acc_check;
+-		if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
++		if ((wh->type & DEV_BLOCK) && !blk)
+ 			continue;
+-		if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
++		if ((wh->type & DEV_CHAR) && blk)
+ 			continue;
+-		if (wh->major != ~0 && wh->major != imajor(inode))
++		if (wh->major != ~0 && wh->major != MAJOR(device))
+ 			continue;
+-		if (wh->minor != ~0 && wh->minor != iminor(inode))
++		if (wh->minor != ~0 && wh->minor != MINOR(device))
+ 			continue;
+ acc_check:
+ 		if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
+ 			continue;
+ 		if ((mask & MAY_READ) && !(wh->access & ACC_READ))
+ 			continue;
++		if ((mask & MAY_QUOTACTL) && !(wh->access & ACC_QUOTA))
++			continue;
+ 		spin_unlock(&dev_cgroup->lock);
+ 		return 0;
+ 	}
+@@ -560,6 +611,15 @@ acc_check:
+ 	return -EPERM;
+ }
+ 
++int devcgroup_inode_permission(struct inode *inode, int mask)
++{
++	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
++		return 0;
++
++	return __devcgroup_inode_permission(S_ISBLK(inode->i_mode),
++			inode->i_rdev, mask);
++}
++
+ int devcgroup_inode_mknod(int mode, dev_t dev)
+ {
+ 	struct dev_cgroup *dev_cgroup;
+@@ -591,3 +651,75 @@ acc_check:
+ 	spin_unlock(&dev_cgroup->lock);
+ 	return -EPERM;
+ }
++
++#ifdef CONFIG_VE
++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
++{
++	int mask = 0;
++
++	mask |= (access_mode & FMODE_READ ? MAY_READ : 0);
++	mask |= (access_mode & FMODE_WRITE ? MAY_WRITE : 0);
++	mask |= (access_mode & FMODE_QUOTACTL ? MAY_QUOTACTL : 0);
++
++	return __devcgroup_inode_permission(dev_type == S_IFBLK, dev, mask);
++}
++EXPORT_SYMBOL(get_device_perms_ve);
++
++int set_device_perms_ve(struct ve_struct *ve,
++		unsigned type, dev_t dev, unsigned mask)
++{
++	int err = -EINVAL;
++	struct dev_whitelist_item *new;
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (new == NULL)
++		return -ENOMEM;
++
++	if ((type & S_IFMT) == S_IFBLK)
++		new->type = DEV_BLOCK;
++	else if ((type & S_IFMT) == S_IFCHR)
++		new->type = DEV_CHAR;
++	else
++		goto out;
++
++	new->access = convert_bits(mask);
++	new->major = new->minor = ~0;
++
++	switch (type & VE_USE_MASK) {
++	default:
++		new->minor = MINOR(dev);
++	case VE_USE_MAJOR:
++		new->major = MAJOR(dev);
++	case 0:
++		;
++	}
++
++	err = dev_whitelist_add(cgroup_to_devcgroup(ve->ve_cgroup), new);
++out:
++	if (err < 0)
++		kfree(new);
++	return err;
++}
++EXPORT_SYMBOL(set_device_perms_ve);
++
++#ifdef CONFIG_PROC_FS
++int devperms_seq_show(struct seq_file *m, void *v)
++{
++	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
++
++	if (m->private == (void *)0) {
++		seq_printf(m, "Version: 2.7\n");
++		m->private = (void *)-1;
++	}
++
++	if (ve_is_super(ve)) {
++		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
++		return 0;
++	}
++
++	m->private = (void *)ve->veid;
++	return devcgroup_seq_read(ve->ve_cgroup, NULL, m);
++}
++EXPORT_SYMBOL(devperms_seq_show);
++#endif
++#endif
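
convert_bits() swaps the two middle permission bits when exactly one of
them is set, translating between the cgroup encoding (ACC_READ=2,
ACC_WRITE=4) and the VE devperms encoding, which uses the opposite
order; ACC_QUOTA passes through and ACC_MKNOD is masked off. The
default whitelist entries are tagged ACC_HIDDEN so the devperms listing
skips them. A standalone check of the bit swap (hypothetical test
harness, not part of the patch; the helper body is copied verbatim):

    #include <assert.h>

    #define ACC_READ  2
    #define ACC_WRITE 4
    #define ACC_QUOTA 8

    static int convert_bits(int acc)
    {
            /* ...10x <-> ...01x: swap bits 1 and 2 unless both or
             * neither is set, then keep only read/write/quota. */
            return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^ 06) &
                    (ACC_READ | ACC_WRITE | ACC_QUOTA);
    }

    int main(void)
    {
            assert(convert_bits(ACC_READ)  == ACC_WRITE);  /* 2 -> 4 */
            assert(convert_bits(ACC_WRITE) == ACC_READ);   /* 4 -> 2 */
            assert(convert_bits(ACC_READ | ACC_WRITE) ==
                            (ACC_READ | ACC_WRITE));       /* 6 -> 6 */
            assert(convert_bits(ACC_QUOTA) == ACC_QUOTA);  /* untouched */
            return 0;
    }

With a NULL cft, devcgroup_seq_read() switches to the /proc/vz/devperms
format driven by devperms_seq_show(); for ve0 the output is the fixed
pair "0 b 016 *:*" and "0 c 006 *:*", with the veid padded to a
ten-column field.
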
+diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
+index a436d1c..130a8be 100644
+--- a/security/selinux/Kconfig
++++ b/security/selinux/Kconfig
+@@ -1,6 +1,6 @@
+ config SECURITY_SELINUX
+ 	bool "NSA SELinux Support"
+-	depends on SECURITY_NETWORK && AUDIT && NET && INET
++	depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
+ 	select NETWORK_SECMARK
+ 	default n
+ 	help
+diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
+index 1c864c0..263b6db 100644
+--- a/security/selinux/hooks.c
++++ b/security/selinux/hooks.c
+@@ -5186,12 +5186,12 @@ static int selinux_setprocattr(struct task_struct *p,
+ 			struct task_struct *g, *t;
+ 			struct mm_struct *mm = p->mm;
+ 			read_lock(&tasklist_lock);
+-			do_each_thread(g, t)
++			do_each_thread_ve(g, t)
+ 				if (t->mm == mm && t != p) {
+ 					read_unlock(&tasklist_lock);
+ 					return -EPERM;
+ 				}
+-			while_each_thread(g, t);
++			while_each_thread_ve(g, t);
+ 			read_unlock(&tasklist_lock);
+ 		}
+ 

Modified: dists/trunk/linux-2.6/debian/patches/series/1~experimental.1-extra
==============================================================================
--- dists/trunk/linux-2.6/debian/patches/series/1~experimental.1-extra	(original)
+++ dists/trunk/linux-2.6/debian/patches/series/1~experimental.1-extra	Mon Jul 21 14:07:51 2008
@@ -8,6 +8,7 @@
 + features/all/xen/xenctrl-xenbus.patch featureset=xen
 + features/all/xen/xenctrl-sys-hypervisor.patch featureset=xen
 
++ features/all/openvz/openvz.patch featureset=openvz
 
 # m68k patches
 


