r2487 - in trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian: . patches patches/series
Joshua Kwan
joshk at costa.debian.org
Fri Oct 19 10:58:24 UTC 2007
Author: joshk
Date: 2005-02-15 01:36:04 +0100 (Tue, 15 Feb 2005)
New Revision: 2487
Added:
trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/097_ipsec.diff
Modified:
trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/changelog
trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/series/2.4.29-1
Log:
add ipsec patch!
Modified: trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/changelog
===================================================================
--- trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/changelog 2005-02-14 17:51:54 UTC (rev 2486)
+++ trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/changelog 2005-02-15 00:36:04 UTC (rev 2487)
@@ -7,8 +7,10 @@
- 077_isofs_ignore_volseqno.diff (irrelevant)
- 093_tty_lockup.diff (backport)
- 114-binfmt_aout-CAN-2004-1074.diff (backport)
+ * Patches added
+ - 097_ipsec.diff (Herbert's backport)
- -- Joshua Kwan <joshk at triplehelix.org> Sat, 5 Feb 2005 16:28:05 -0800
+ -- Joshua Kwan <joshk at triplehelix.org> Mon, 14 Feb 2005 16:35:49 -0800
kernel-source-2.4.28 (2.4.28-1) unstable; urgency=low
Added: trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/097_ipsec.diff
===================================================================
--- trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/097_ipsec.diff 2005-02-14 17:51:54 UTC (rev 2486)
+++ trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/097_ipsec.diff 2005-02-15 00:36:04 UTC (rev 2487)
@@ -0,0 +1,33556 @@
+# origin: http://gondor.apana.org.au/~herbert/ipsec-2.4/files/ipsec-2.4.29-20050213-1.bz2
+# cset: n/a
+# description: Backport of 2.6 IPsec to 2.4.29
+# revision date: 2005-02-14
+
+diff -Nru a/Documentation/Configure.help b/Documentation/Configure.help
+--- a/Documentation/Configure.help 2005-02-13 21:25:10 +11:00
++++ b/Documentation/Configure.help 2005-02-13 21:25:10 +11:00
+@@ -5992,6 +5992,14 @@
+ and you should also say Y to "Kernel/User network link driver",
+ below. If unsure, say N.
+
++PF_KEY sockets
++CONFIG_NET_KEY
++ PF_KEYv2 socket family, compatible to KAME ones.
++ They are required if you are going to use IPsec tools ported
++ from KAME.
++
++ Say Y unless you know what you are doing.
++
+ TCP/IP networking
+ CONFIG_INET
+ These are the protocols used on the Internet and on most local
+@@ -6251,6 +6259,39 @@
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
++IP: AH transformation
++CONFIG_INET_AH
++ Support for IPsec AH.
++
++ If unsure, say Y.
++
++IP: ESP transformation
++CONFIG_INET_ESP
++ Support for IPsec ESP.
++
++ If unsure, say Y.
++
++IP: IPComp transformation
++CONFIG_INET_IPCOMP
+ Support for IP Payload Compression (RFC3173), typically needed
++ for IPsec.
++
++ If unsure, say Y.
++
++IP: tunnel transformation
++CONFIG_INET_TUNNEL
++ Support for generic IP tunnel transformation, which is required by
++ the IP tunneling module as well as tunnel mode IPComp.
++
++ If unsure, say Y.
++
++IP: IPsec user configuration interface
++CONFIG_XFRM_USER
++ Support for IPsec user configuration interface used
++ by native Linux tools.
++
++ If unsure, say Y.
++
+ Unix domain sockets
+ CONFIG_UNIX
+ If you say Y here, you will include support for Unix domain sockets;
+@@ -6295,6 +6336,28 @@
+ as a module, say M here and read <file:Documentation/modules.txt>.
+
+ It is safe to say N here for now.
++
++IPv6: Privacy Extensions (RFC 3041) support
++CONFIG_IPV6_PRIVACY
++ Privacy Extensions for Stateless Address Autoconfiguration in IPv6
+ support. With this option, additional periodically-altered
+ pseudo-random global-scope unicast address(es) will be assigned to
++ your interface(s).
++
+ By default, the kernel does not generate temporary addresses.
++ To use temporary addresses, do
++
++ echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr
++
++ See <file:Documentation/networking/ip-sysctl.txt> for details.
++
++IPv6: tunnel transformation
++CONFIG_INET6_TUNNEL
++ Support for generic IPv6-in-IPv6 tunnel transformation, which is
++ required by the IPv6-in-IPv6 tunneling module as well as tunnel mode
++ IPComp.
++
++ If unsure, say Y.
+
+ The SCTP Protocol (EXPERIMENTAL)
+ CONFIG_IP_SCTP
+diff -Nru a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
+--- a/Documentation/networking/ip-sysctl.txt 2005-02-13 21:25:09 +11:00
++++ b/Documentation/networking/ip-sysctl.txt 2005-02-13 21:25:09 +11:00
+@@ -708,6 +708,37 @@
+ 0 to disable any limiting, otherwise the maximal rate in jiffies(1)
+ Default: 100
+
++use_tempaddr - INTEGER
++ Preference for Privacy Extensions (RFC3041).
++ <= 0 : disable Privacy Extensions
++ == 1 : enable Privacy Extensions, but prefer public
++ addresses over temporary addresses.
++ > 1 : enable Privacy Extensions and prefer temporary
++ addresses over public addresses.
++ Default: 0 (for most devices)
++ -1 (for point-to-point devices and loopback devices)
++
++temp_valid_lft - INTEGER
++ valid lifetime (in seconds) for temporary addresses.
++ Default: 604800 (7 days)
++
++temp_prefered_lft - INTEGER
+ Preferred lifetime (in seconds) for temporary addresses.
++ Default: 86400 (1 day)
++
++max_desync_factor - INTEGER
++ Maximum value for DESYNC_FACTOR, which is a random value
++ that ensures that clients don't synchronize with each
+ other and generate new addresses at exactly the same time.
++ value is in seconds.
++ Default: 600
++
++regen_max_retry - INTEGER
+ Number of attempts before giving up on attempting to generate
++ valid temporary addresses.
++ Default: 5
++
++
+ IPv6 Update by:
+ Pekka Savola <pekkas at netcore.fi>
+ YOSHIFUJI Hideaki / USAGI Project <yoshfuji at linux-ipv6.org>
+diff -Nru a/arch/alpha/defconfig b/arch/alpha/defconfig
+--- a/arch/alpha/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/alpha/defconfig 2005-02-13 21:25:09 +11:00
+@@ -127,6 +127,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/arm/defconfig b/arch/arm/defconfig
+--- a/arch/arm/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/arm/defconfig 2005-02-13 21:25:09 +11:00
+@@ -170,6 +170,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/cris/defconfig b/arch/cris/defconfig
+--- a/arch/cris/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/cris/defconfig 2005-02-13 21:25:09 +11:00
+@@ -214,6 +214,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/i386/defconfig b/arch/i386/defconfig
+--- a/arch/i386/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/i386/defconfig 2005-02-13 21:25:09 +11:00
+@@ -184,6 +184,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/ia64/defconfig b/arch/ia64/defconfig
+--- a/arch/ia64/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/ia64/defconfig 2005-02-13 21:25:09 +11:00
+@@ -101,6 +101,7 @@
+ # CONFIG_NETFILTER is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/m68k/defconfig b/arch/m68k/defconfig
+--- a/arch/m68k/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/m68k/defconfig 2005-02-13 21:25:09 +11:00
+@@ -82,6 +82,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/mips/defconfig b/arch/mips/defconfig
+--- a/arch/mips/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/mips/defconfig 2005-02-13 21:25:09 +11:00
+@@ -207,6 +207,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/mips64/defconfig b/arch/mips64/defconfig
+--- a/arch/mips64/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/mips64/defconfig 2005-02-13 21:25:09 +11:00
+@@ -212,6 +212,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/parisc/defconfig b/arch/parisc/defconfig
+--- a/arch/parisc/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/parisc/defconfig 2005-02-13 21:25:09 +11:00
+@@ -116,6 +116,7 @@
+ # CONFIG_NETFILTER is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/ppc/defconfig b/arch/ppc/defconfig
+--- a/arch/ppc/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/ppc/defconfig 2005-02-13 21:25:09 +11:00
+@@ -134,6 +134,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/ppc/kernel/head_8xx.S b/arch/ppc/kernel/head_8xx.S
+--- a/arch/ppc/kernel/head_8xx.S 2005-02-13 21:25:09 +11:00
++++ b/arch/ppc/kernel/head_8xx.S 2005-02-13 21:25:09 +11:00
+@@ -338,13 +338,13 @@
+ 3:
+ lwz r21, 0(r20) /* Get the level 1 entry */
+ rlwinm. r20, r21,0,0,19 /* Extract page descriptor page address */
+- beq 2f /* If zero, don't try to find a pte */
+
+ /* We have a pte table, so load the MI_TWC with the attributes
+ * for this "segment."
+ */
+ tophys(r21,r21)
+ ori r21,r21,1 /* Set valid bit */
++ beq- 2f /* If zero, don't try to find a pte */
+ #ifdef CONFIG_8xx_CPU6
+ li r3, 0x2b80
+ stw r3, 12(r0)
+@@ -369,7 +369,7 @@
+ * set. All other Linux PTE bits control the behavior
+ * of the MMU.
+ */
+- li r21, 0x00f0
++2: li r21, 0x00f0
+ rlwimi r20, r21, 0, 24, 28 /* Set 24-27, clear 28 */
+
+ #ifdef CONFIG_8xx_CPU6
+@@ -388,15 +388,6 @@
+ #endif
+ rfi
+
+-2: mfspr r20, M_TW /* Restore registers */
+- lwz r21, 0(r0)
+- mtcr r21
+- lwz r21, 4(r0)
+-#ifdef CONFIG_8xx_CPU6
+- lwz r3, 8(r0)
+-#endif
+- b InstructionAccess
+-
+ . = 0x1200
+ DataStoreTLBMiss:
+ #ifdef CONFIG_8xx_CPU6
+@@ -422,12 +413,12 @@
+ 3:
+ lwz r21, 0(r20) /* Get the level 1 entry */
+ rlwinm. r20, r21,0,0,19 /* Extract page descriptor page address */
+- beq 2f /* If zero, don't try to find a pte */
+
+ /* We have a pte table, so load fetch the pte from the table.
+ */
+ tophys(r21, r21)
+ ori r21, r21, 1 /* Set valid bit in physical L2 page */
++ beq- 2f /* If zero, don't try to find a pte */
+ #ifdef CONFIG_8xx_CPU6
+ li r3, 0x3b80
+ stw r3, 12(r0)
+@@ -461,7 +452,7 @@
+ * set. All other Linux PTE bits control the behavior
+ * of the MMU.
+ */
+- li r21, 0x00f0
++2: li r21, 0x00f0
+ rlwimi r20, r21, 0, 24, 28 /* Set 24-27, clear 28 */
+
+ #ifdef CONFIG_8xx_CPU6
+@@ -479,24 +470,6 @@
+ lwz r3, 8(r0)
+ #endif
+ rfi
+-
+-2:
+- /* Copy 20 msb from MD_EPN to DAR since the dcxx instructions fail
+- * to update DAR when they cause a DTLB miss.
+- */
+- mfspr r21, MD_EPN
+- mfspr r20, DAR
+- rlwimi r20, r21, 0, 0, 19
+- mtspr DAR, r20
+-
+- mfspr r20, M_TW /* Restore registers */
+- lwz r21, 0(r0)
+- mtcr r21
+- lwz r21, 4(r0)
+-#ifdef CONFIG_8xx_CPU6
+- lwz r3, 8(r0)
+-#endif
+- b DataAccess
+
+ /* This is an instruction TLB error on the MPC8xx. This could be due
+ * to many reasons, such as executing guarded memory or illegal instruction
+diff -Nru a/arch/ppc64/defconfig b/arch/ppc64/defconfig
+--- a/arch/ppc64/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/ppc64/defconfig 2005-02-13 21:25:09 +11:00
+@@ -110,6 +110,7 @@
+ # CONFIG_NETFILTER is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/s390/defconfig b/arch/s390/defconfig
+--- a/arch/s390/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/s390/defconfig 2005-02-13 21:25:09 +11:00
+@@ -150,6 +150,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/s390x/defconfig b/arch/s390x/defconfig
+--- a/arch/s390x/defconfig 2005-02-13 21:25:08 +11:00
++++ b/arch/s390x/defconfig 2005-02-13 21:25:08 +11:00
+@@ -150,6 +150,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/sh64/defconfig b/arch/sh64/defconfig
+--- a/arch/sh64/defconfig 2005-02-13 21:25:10 +11:00
++++ b/arch/sh64/defconfig 2005-02-13 21:25:10 +11:00
+@@ -113,6 +113,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/sparc/defconfig b/arch/sparc/defconfig
+--- a/arch/sparc/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/sparc/defconfig 2005-02-13 21:25:09 +11:00
+@@ -144,6 +144,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/sparc64/defconfig b/arch/sparc64/defconfig
+--- a/arch/sparc64/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/sparc64/defconfig 2005-02-13 21:25:09 +11:00
+@@ -203,6 +203,7 @@
+ # CONFIG_NETFILTER_DEBUG is not set
+ CONFIG_FILTER=y
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ CONFIG_IP_MULTICAST=y
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/arch/x86_64/defconfig b/arch/x86_64/defconfig
+--- a/arch/x86_64/defconfig 2005-02-13 21:25:09 +11:00
++++ b/arch/x86_64/defconfig 2005-02-13 21:25:09 +11:00
+@@ -144,6 +144,7 @@
+ # CONFIG_NETFILTER is not set
+ # CONFIG_FILTER is not set
+ CONFIG_UNIX=y
++CONFIG_NET_KEY=y
+ CONFIG_INET=y
+ # CONFIG_IP_MULTICAST is not set
+ # CONFIG_IP_ADVANCED_ROUTER is not set
+diff -Nru a/crypto/Config.in b/crypto/Config.in
+--- a/crypto/Config.in 2005-02-13 21:25:09 +11:00
++++ b/crypto/Config.in 2005-02-13 21:25:09 +11:00
+@@ -11,7 +11,8 @@
+ "$CONFIG_INET6_AH" = "y" -o \
+ "$CONFIG_INET6_AH" = "m" -o \
+ "$CONFIG_INET6_ESP" = "y" -o \
+- "$CONFIG_INET6_ESP" = "m" ]; then
++ "$CONFIG_INET6_ESP" = "m" -o \
++ "$CONFIG_IPV6_PRIVACY" = "y" ]; then
+ define_bool CONFIG_CRYPTO y
+ else
+ bool 'Cryptographic API' CONFIG_CRYPTO
+@@ -25,7 +26,8 @@
+ "$CONFIG_INET6_AH" = "y" -o \
+ "$CONFIG_INET6_AH" = "m" -o \
+ "$CONFIG_INET6_ESP" = "y" -o \
+- "$CONFIG_INET6_ESP" = "m" ]; then
++ "$CONFIG_INET6_ESP" = "m" -o \
++ "$CONFIG_IPV6_PRIVACY" = "y" ]; then
+ define_bool CONFIG_CRYPTO_HMAC y
+ else
+ bool ' HMAC support' CONFIG_CRYPTO_HMAC
+@@ -33,39 +35,56 @@
+ tristate ' NULL algorithms' CONFIG_CRYPTO_NULL
+ tristate ' MD4 digest algorithm' CONFIG_CRYPTO_MD4
+ if [ "$CONFIG_INET_AH" = "y" -o \
+- "$CONFIG_INET_AH" = "m" -o \
+ "$CONFIG_INET_ESP" = "y" -o \
+- "$CONFIG_INET_ESP" = "m" -o \
+ "$CONFIG_INET6_AH" = "y" -o \
+- "$CONFIG_INET6_AH" = "m" -o \
+- "$CONFIG_INET6_ESP" = "y" -o \
+- "$CONFIG_INET6_ESP" = "m" ]; then
+- define_bool CONFIG_CRYPTO_MD5 y
++ "$CONFIG_INET6_ESP" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_MD5 y
+ else
+- tristate ' MD5 digest algorithm' CONFIG_CRYPTO_MD5
++ if [ "$CONFIG_IPV6" = "y" -a "$CONFIG_IPV6_PRIVACY" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_MD5 y
++ else
++ if [ "$CONFIG_INET_AH" = "m" -o \
++ "$CONFIG_INET_ESP" = "m" -o \
++ "$CONFIG_INET6_AH" = "m" -o \
++ "$CONFIG_INET6_ESP" = "m" ]; then
++ define_tristate CONFIG_CRYPTO_MD5 m
++ else
++ if [ "$CONFIG_IPV6" = "m" -a "$CONFIG_IPV6_PRIVACY" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_MD5 m
++ else
++ tristate ' MD5 digest algorithm' CONFIG_CRYPTO_MD5
++ fi
++ fi
++ fi
+ fi
+ if [ "$CONFIG_INET_AH" = "y" -o \
+- "$CONFIG_INET_AH" = "m" -o \
+ "$CONFIG_INET_ESP" = "y" -o \
+- "$CONFIG_INET_ESP" = "m" -o \
+ "$CONFIG_INET6_AH" = "y" -o \
+- "$CONFIG_INET6_AH" = "m" -o \
+- "$CONFIG_INET6_ESP" = "y" -o \
+- "$CONFIG_INET6_ESP" = "m" ]; then
+- define_bool CONFIG_CRYPTO_SHA1 y
++ "$CONFIG_INET6_ESP" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_SHA1 y
+ else
+- tristate ' SHA1 digest algorithm' CONFIG_CRYPTO_SHA1
++ if [ "$CONFIG_INET_AH" = "m" -o \
++ "$CONFIG_INET_ESP" = "m" -o \
++ "$CONFIG_INET6_AH" = "m" -o \
++ "$CONFIG_INET6_ESP" = "m" ]; then
++ define_tristate CONFIG_CRYPTO_SHA1 m
++ else
++ tristate ' SHA1 digest algorithm' CONFIG_CRYPTO_SHA1
++ fi
+ fi
+ tristate ' SHA256 digest algorithm' CONFIG_CRYPTO_SHA256
+ tristate ' SHA384 and SHA512 digest algorithms' CONFIG_CRYPTO_SHA512
+ tristate ' Whirlpool digest algorithms' CONFIG_CRYPTO_WP512
+ if [ "$CONFIG_INET_ESP" = "y" -o \
+- "$CONFIG_INET_ESP" = "m" -o \
+- "$CONFIG_INET6_ESP" = "y" -o \
+- "$CONFIG_INET6_ESP" = "m" ]; then
+- define_bool CONFIG_CRYPTO_DES y
++ "$CONFIG_INET6_ESP" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_DES y
+ else
+- tristate ' DES and Triple DES EDE cipher algorithms' CONFIG_CRYPTO_DES
++ if [ "$CONFIG_INET_ESP" = "m" -o \
++ "$CONFIG_INET6_ESP" = "m" ]; then
++ define_tristate CONFIG_CRYPTO_DES m
++ else
++ tristate ' DES and Triple DES EDE cipher algorithms' CONFIG_CRYPTO_DES
++ fi
+ fi
+ tristate ' Blowfish cipher algorithm' CONFIG_CRYPTO_BLOWFISH
+ tristate ' Twofish cipher algorithm' CONFIG_CRYPTO_TWOFISH
+@@ -78,12 +97,15 @@
+ tristate ' Anubis cipher algorithm' CONFIG_CRYPTO_ANUBIS
+ tristate ' ARC4 cipher algorithm' CONFIG_CRYPTO_ARC4
+ if [ "$CONFIG_INET_IPCOMP" = "y" -o \
+- "$CONFIG_INET_IPCOMP" = "m" -o \
+- "$CONFIG_INET6_IPCOMP" = "y" -o \
+- "$CONFIG_INET6_IPCOMP" = "m" ]; then
+- define_bool CONFIG_CRYPTO_DEFLATE y
++ "$CONFIG_INET6_IPCOMP" = "y" ]; then
++ define_tristate CONFIG_CRYPTO_DEFLATE y
+ else
+- tristate ' Deflate compression algorithm' CONFIG_CRYPTO_DEFLATE
++ if [ "$CONFIG_INET_IPCOMP" = "m" -o \
++ "$CONFIG_INET6_IPCOMP" = "m" ]; then
++ define_tristate CONFIG_CRYPTO_DEFLATE m
++ else
++ tristate ' Deflate compression algorithm' CONFIG_CRYPTO_DEFLATE
++ fi
+ fi
+ tristate ' Michael MIC keyed digest algorithm' CONFIG_CRYPTO_MICHAEL_MIC
+ tristate ' Testing module' CONFIG_CRYPTO_TEST
+diff -Nru a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
+--- a/drivers/net/ppp_generic.c 2005-02-13 21:25:10 +11:00
++++ b/drivers/net/ppp_generic.c 2005-02-13 21:25:10 +11:00
+@@ -57,7 +57,9 @@
+ #define NP_IPV6 1 /* Internet Protocol V6 */
+ #define NP_IPX 2 /* IPX protocol */
+ #define NP_AT 3 /* Appletalk protocol */
+-#define NUM_NP 4 /* Number of NPs. */
++#define NP_MPLS_UC 4 /* MPLS unicast */
++#define NP_MPLS_MC 5 /* MPLS multicast */
++#define NUM_NP 6 /* Number of NPs. */
+
+ #define MPHDRLEN 6 /* multilink protocol header length */
+ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */
+@@ -281,6 +283,10 @@
+ return NP_IPX;
+ case PPP_AT:
+ return NP_AT;
++ case PPP_MPLS_UC:
++ return NP_MPLS_UC;
++ case PPP_MPLS_MC:
++ return NP_MPLS_MC;
+ }
+ return -EINVAL;
+ }
+@@ -291,6 +297,8 @@
+ PPP_IPV6,
+ PPP_IPX,
+ PPP_AT,
++ PPP_MPLS_UC,
++ PPP_MPLS_MC,
+ };
+
+ /* Translates an ethertype into an NP index */
+@@ -306,6 +314,10 @@
+ case ETH_P_PPPTALK:
+ case ETH_P_ATALK:
+ return NP_AT;
++ case ETH_P_MPLS_UC:
++ return NP_MPLS_UC;
++ case ETH_P_MPLS_MC:
++ return NP_MPLS_MC;
+ }
+ return -1;
+ }
+@@ -316,6 +328,8 @@
+ ETH_P_IPV6,
+ ETH_P_IPX,
+ ETH_P_PPPTALK,
++ ETH_P_MPLS_UC,
++ ETH_P_MPLS_MC,
+ };
+
+ /*
+diff -Nru a/drivers/scsi/megaraid2.c b/drivers/scsi/megaraid2.c
+--- a/drivers/scsi/megaraid2.c 2005-02-13 21:25:09 +11:00
++++ b/drivers/scsi/megaraid2.c 2005-02-13 21:25:09 +11:00
+@@ -2819,7 +2819,7 @@
+ }
+
+ if( iter++ < MBOX_ABORT_SLEEP*1000 ) {
+- mdelay(1);
++ msleep(1);
+ }
+ else {
+ printk(KERN_WARNING
+@@ -2899,7 +2899,7 @@
+ }
+
+ if( iter++ < MBOX_RESET_SLEEP*1000 ) {
+- mdelay(1);
++ msleep(1);
+ }
+ else {
+ printk(KERN_WARNING
+@@ -4040,10 +4040,10 @@
+ printk(KERN_INFO "megaraid: cache flush delay: ");
+ for( i = 9; i >= 0; i-- ) {
+ printk("\b\b\b[%d]", i);
+- mdelay(1000);
++ msleep(1000);
+ }
+ printk("\b\b\b[done]\n");
+- mdelay(1000);
++ msleep(1000);
+
+ return NOTIFY_DONE;
+ }
+diff -Nru a/include/asm-alpha/scatterlist.h b/include/asm-alpha/scatterlist.h
+--- a/include/asm-alpha/scatterlist.h 2005-02-13 21:25:09 +11:00
++++ b/include/asm-alpha/scatterlist.h 2005-02-13 21:25:09 +11:00
+@@ -2,6 +2,7 @@
+ #define _ALPHA_SCATTERLIST_H
+
+ #include <asm/page.h>
++#include <linux/types.h>
+
+ struct scatterlist {
+ /* This will disappear in 2.5.x */
+diff -Nru a/include/linux/if_arp.h b/include/linux/if_arp.h
+--- a/include/linux/if_arp.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/if_arp.h 2005-02-13 21:25:09 +11:00
+@@ -60,7 +60,7 @@
+ #define ARPHRD_RAWHDLC 518 /* Raw HDLC */
+
+ #define ARPHRD_TUNNEL 768 /* IPIP tunnel */
+-#define ARPHRD_TUNNEL6 769 /* IPIP6 tunnel */
++#define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */
+ #define ARPHRD_FRAD 770 /* Frame Relay Access Device */
+ #define ARPHRD_SKIP 771 /* SKIP vif */
+ #define ARPHRD_LOOPBACK 772 /* Loopback device */
+diff -Nru a/include/linux/in.h b/include/linux/in.h
+--- a/include/linux/in.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/in.h 2005-02-13 21:25:09 +11:00
+@@ -18,6 +18,7 @@
+ #ifndef _LINUX_IN_H
+ #define _LINUX_IN_H
+
++#include <linux/socket.h>
+ #include <linux/types.h>
+ #include <linux/socket.h>
+
+@@ -69,6 +70,8 @@
+ #define IP_RECVTOS 13
+ #define IP_MTU 14
+ #define IP_FREEBIND 15
++#define IP_IPSEC_POLICY 16
++#define IP_XFRM_POLICY 17
+
+ /* BSD compatibility */
+ #define IP_RECVRETOPTS IP_RETOPTS
+diff -Nru a/include/linux/in6.h b/include/linux/in6.h
+--- a/include/linux/in6.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/in6.h 2005-02-13 21:25:09 +11:00
+@@ -180,5 +180,8 @@
+ #define IPV6_FLOWLABEL_MGR 32
+ #define IPV6_FLOWINFO_SEND 33
+
++#define IPV6_IPSEC_POLICY 34
++#define IPV6_XFRM_POLICY 35
++
+
+ #endif
+diff -Nru a/include/linux/inetdevice.h b/include/linux/inetdevice.h
+--- a/include/linux/inetdevice.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/inetdevice.h 2005-02-13 21:25:09 +11:00
+@@ -21,6 +21,8 @@
+ int arp_announce;
+ int arp_ignore;
+ int medium_id;
++ int no_xfrm;
++ int no_policy;
+ int force_igmp_version;
+ void *sysctl;
+ };
+diff -Nru a/include/linux/ip.h b/include/linux/ip.h
+--- a/include/linux/ip.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/ip.h 2005-02-13 21:25:09 +11:00
+@@ -18,8 +18,6 @@
+ #define _LINUX_IP_H
+ #include <asm/byteorder.h>
+
+-/* SOL_IP socket options */
+-
+ #define IPTOS_TOS_MASK 0x1E
+ #define IPTOS_TOS(tos) ((tos)&IPTOS_TOS_MASK)
+ #define IPTOS_LOWDELAY 0x10
+@@ -67,14 +65,6 @@
+ #define MAXTTL 255
+ #define IPDEFTTL 64
+
+-/* struct timestamp, struct route and MAX_ROUTES are removed.
+-
+- REASONS: it is clear that nobody used them because:
+- - MAX_ROUTES value was wrong.
+- - "struct route" was wrong.
+- - "struct timestamp" had fatally misaligned bitfields and was completely unusable.
+- */
+-
+ #define IPOPT_OPTVAL 0
+ #define IPOPT_OLEN 1
+ #define IPOPT_OFFSET 2
+@@ -133,6 +123,27 @@
+ __u32 saddr;
+ __u32 daddr;
+ /*The options start here. */
++};
++
++struct ip_auth_hdr {
++ __u8 nexthdr;
++ __u8 hdrlen; /* This one is measured in 32 bit units! */
++ __u16 reserved;
++ __u32 spi;
++ __u32 seq_no; /* Sequence number */
++ __u8 auth_data[0]; /* Variable len but >=4. Mind the 64 bit alignment! */
++};
++
++struct ip_esp_hdr {
++ __u32 spi;
++ __u32 seq_no; /* Sequence number */
++ __u8 enc_data[0]; /* Variable len but >=8. Mind the 64 bit alignment! */
++};
++
++struct ip_comp_hdr {
++ __u8 nexthdr;
++ __u8 flags;
++ __u16 cpi;
+ };
+
+ #endif /* _LINUX_IP_H */
+diff -Nru a/include/linux/ip6_tunnel.h b/include/linux/ip6_tunnel.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/linux/ip6_tunnel.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,32 @@
++/*
++ * $Id$
++ */
++
++#ifndef _IP6_TUNNEL_H
++#define _IP6_TUNNEL_H
++
++#define IPV6_TLV_TNL_ENCAP_LIMIT 4
++#define IPV6_DEFAULT_TNL_ENCAP_LIMIT 4
++
++/* don't add encapsulation limit if one isn't present in inner packet */
++#define IP6_TNL_F_IGN_ENCAP_LIMIT 0x1
++/* copy the traffic class field from the inner packet */
++#define IP6_TNL_F_USE_ORIG_TCLASS 0x2
++/* copy the flowlabel from the inner packet */
++#define IP6_TNL_F_USE_ORIG_FLOWLABEL 0x4
++/* being used for Mobile IPv6 */
++#define IP6_TNL_F_MIP6_DEV 0x8
++
++struct ip6_tnl_parm {
++ char name[IFNAMSIZ]; /* name of tunnel device */
++ int link; /* ifindex of underlying L2 interface */
++ __u8 proto; /* tunnel protocol */
++ __u8 encap_limit; /* encapsulation limit for tunnel */
++ __u8 hop_limit; /* hop limit for tunnel */
++ __u32 flowinfo; /* traffic class and flowlabel for tunnel */
++ __u32 flags; /* tunnel flags */
++ struct in6_addr laddr; /* local tunnel end-point address */
++ struct in6_addr raddr; /* remote tunnel end-point address */
++};
++
++#endif
+diff -Nru a/include/linux/ipsec.h b/include/linux/ipsec.h
+--- a/include/linux/ipsec.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/ipsec.h 2005-02-13 21:25:09 +11:00
+@@ -1,69 +1,46 @@
+-/*
+- * Definitions for the SECurity layer
+- *
+- * Author:
+- * Robert Muchsel <muchsel at acm.org>
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- */
+-
+ #ifndef _LINUX_IPSEC_H
+ #define _LINUX_IPSEC_H
+
+-#include <linux/config.h>
+-#include <linux/socket.h>
+-#include <net/sock.h>
+-#include <linux/skbuff.h>
+-
+-/* Values for the set/getsockopt calls */
+-
+-/* These defines are compatible with NRL IPv6, however their semantics
+- is different */
+-
+-#define IPSEC_LEVEL_NONE -1 /* send plaintext, accept any */
+-#define IPSEC_LEVEL_DEFAULT 0 /* encrypt/authenticate if possible */
+- /* the default MUST be 0, because a */
+- /* socket is initialized with 0's */
+-#define IPSEC_LEVEL_USE 1 /* use outbound, don't require inbound */
+-#define IPSEC_LEVEL_REQUIRE 2 /* require both directions */
+-#define IPSEC_LEVEL_UNIQUE 2 /* for compatibility only */
+-
+-#ifdef __KERNEL__
+-
+-/* skb bit flags set on packet input processing */
+-
+-#define RCV_SEC 0x0f /* options on receive */
+-#define RCV_AUTH 0x01 /* was authenticated */
+-#define RCV_CRYPT 0x02 /* was encrypted */
+-#define RCV_TUNNEL 0x04 /* was tunneled */
+-#define SND_SEC 0xf0 /* options on send, these are */
+-#define SND_AUTH 0x10 /* currently unused */
+-#define SND_CRYPT 0x20
+-#define SND_TUNNEL 0x40
+-
+-/*
+- * FIXME: ignores network encryption for now..
+- */
+-
+-#ifdef CONFIG_NET_SECURITY
+-static __inline__ int ipsec_sk_policy(struct sock *sk, struct sk_buff *skb)
+-{
+- return ((sk->authentication < IPSEC_LEVEL_REQUIRE) ||
+- (skb->security & RCV_AUTH)) &&
+- ((sk->encryption < IPSEC_LEVEL_REQUIRE) ||
+- (skb->security & RCV_CRYPT));
+-}
+-
+-#else
+-
+-static __inline__ int ipsec_sk_policy(struct sock *sk, struct sk_buff *skb)
+-{
+- return 1;
+-}
+-#endif /* CONFIG */
++/* The definitions, required to talk to KAME racoon IKE. */
++
++#include <linux/pfkeyv2.h>
++
++#define IPSEC_PORT_ANY 0
++#define IPSEC_ULPROTO_ANY 255
++#define IPSEC_PROTO_ANY 255
++
++enum {
++ IPSEC_MODE_ANY = 0, /* We do not support this for SA */
++ IPSEC_MODE_TRANSPORT = 1,
++ IPSEC_MODE_TUNNEL = 2
++};
++
++enum {
++ IPSEC_DIR_ANY = 0,
++ IPSEC_DIR_INBOUND = 1,
++ IPSEC_DIR_OUTBOUND = 2,
++ IPSEC_DIR_FWD = 3, /* It is our own */
++ IPSEC_DIR_MAX = 4,
++ IPSEC_DIR_INVALID = 5
++};
++
++enum {
++ IPSEC_POLICY_DISCARD = 0,
++ IPSEC_POLICY_NONE = 1,
++ IPSEC_POLICY_IPSEC = 2,
++ IPSEC_POLICY_ENTRUST = 3,
++ IPSEC_POLICY_BYPASS = 4
++};
++
++enum {
++ IPSEC_LEVEL_DEFAULT = 0,
++ IPSEC_LEVEL_USE = 1,
++ IPSEC_LEVEL_REQUIRE = 2,
++ IPSEC_LEVEL_UNIQUE = 3
++};
++
++#define IPSEC_MANUAL_REQID_MAX 0x3fff
++
++#define IPSEC_REPLAYWSIZE 32
+
+-#endif /* __KERNEL__ */
+ #endif /* _LINUX_IPSEC_H */
+diff -Nru a/include/linux/ipv6.h b/include/linux/ipv6.h
+--- a/include/linux/ipv6.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/ipv6.h 2005-02-13 21:25:09 +11:00
+@@ -73,6 +73,27 @@
+ #define rt0_type rt_hdr.type
+ };
+
++struct ipv6_auth_hdr {
++ __u8 nexthdr;
++ __u8 hdrlen; /* This one is measured in 32 bit units! */
++ __u16 reserved;
++ __u32 spi;
++ __u32 seq_no; /* Sequence number */
++ __u8 auth_data[0]; /* Length variable but >=4. Mind the 64 bit alignment! */
++};
++
++struct ipv6_esp_hdr {
++ __u32 spi;
++ __u32 seq_no; /* Sequence number */
++ __u8 enc_data[0]; /* Length variable but >=8. Mind the 64 bit alignment! */
++};
++
++struct ipv6_comp_hdr {
++ __u8 nexthdr;
++ __u8 flags;
++ __u16 cpi;
++};
++
+ /*
+ * IPv6 fixed header
+ *
+diff -Nru a/include/linux/ipv6_route.h b/include/linux/ipv6_route.h
+--- a/include/linux/ipv6_route.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/ipv6_route.h 2005-02-13 21:25:09 +11:00
+@@ -13,15 +13,6 @@
+ #ifndef _LINUX_IPV6_ROUTE_H
+ #define _LINUX_IPV6_ROUTE_H
+
+-enum
+-{
+- RTA_IPV6_UNSPEC,
+- RTA_IPV6_HOPLIMIT,
+-};
+-
+-#define RTA_IPV6_MAX RTA_IPV6_HOPLIMIT
+-
+-
+ #define RTF_DEFAULT 0x00010000 /* default - learned via ND */
+ #define RTF_ALLONLINK 0x00020000 /* fallback, no routers on link */
+ #define RTF_ADDRCONF 0x00040000 /* addrconf route - RA */
+@@ -33,6 +24,7 @@
+ #define RTF_CACHE 0x01000000 /* cache entry */
+ #define RTF_FLOW 0x02000000 /* flow significant route */
+ #define RTF_POLICY 0x04000000 /* policy route */
++#define RTF_NDISC 0x08000000 /* ndisc route */
+
+ #define RTF_LOCAL 0x80000000
+
+diff -Nru a/include/linux/kernel.h b/include/linux/kernel.h
+--- a/include/linux/kernel.h 2005-02-13 21:25:08 +11:00
++++ b/include/linux/kernel.h 2005-02-13 21:25:08 +11:00
+@@ -133,6 +133,16 @@
+ ((unsigned char *)&addr)[2], \
+ ((unsigned char *)&addr)[3]
+
++#define NIP6(addr) \
++ ntohs((addr).s6_addr16[0]), \
++ ntohs((addr).s6_addr16[1]), \
++ ntohs((addr).s6_addr16[2]), \
++ ntohs((addr).s6_addr16[3]), \
++ ntohs((addr).s6_addr16[4]), \
++ ntohs((addr).s6_addr16[5]), \
++ ntohs((addr).s6_addr16[6]), \
++ ntohs((addr).s6_addr16[7])
++
+ #if defined(__LITTLE_ENDIAN)
+ #define HIPQUAD(addr) \
+ ((unsigned char *)&addr)[3], \
+diff -Nru a/include/linux/list.h b/include/linux/list.h
+--- a/include/linux/list.h 2005-02-13 21:25:08 +11:00
++++ b/include/linux/list.h 2005-02-13 21:25:08 +11:00
+@@ -3,6 +3,7 @@
+
+ #if defined(__KERNEL__) || defined(_LVM_H_INCLUDE)
+
++#include <linux/stddef.h>
+ #include <linux/prefetch.h>
+
+ /*
+@@ -254,6 +255,152 @@
+ pos = list_entry(pos->member.next, typeof(*pos), member), \
+ prefetch(pos->member.next))
+
++/*
++ * Double linked lists with a single pointer list head.
++ * Mostly useful for hash tables where the two pointer list head is
++ * too wasteful.
++ * You lose the ability to access the tail in O(1).
++ */
++
++struct hlist_head {
++ struct hlist_node *first;
++};
++
++struct hlist_node {
++ struct hlist_node *next, **pprev;
++};
++
++#define HLIST_HEAD_INIT { .first = NULL }
++#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
++#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
++#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
++
++static inline int hlist_unhashed(const struct hlist_node *h)
++{
++ return !h->pprev;
++}
++
++static inline int hlist_empty(const struct hlist_head *h)
++{
++ return !h->first;
++}
++
++static inline void __hlist_del(struct hlist_node *n)
++{
++ struct hlist_node *next = n->next;
++ struct hlist_node **pprev = n->pprev;
++ *pprev = next;
++ if (next)
++ next->pprev = pprev;
++}
++
++static inline void hlist_del(struct hlist_node *n)
++{
++ __hlist_del(n);
++ n->next = NULL;
++ n->pprev = NULL;
++}
++
++static inline void hlist_del_init(struct hlist_node *n)
++{
++ if (n->pprev) {
++ __hlist_del(n);
++ INIT_HLIST_NODE(n);
++ }
++}
++
++static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
++{
++ struct hlist_node *first = h->first;
++ n->next = first;
++ if (first)
++ first->pprev = &n->next;
++ h->first = n;
++ n->pprev = &h->first;
++}
++
++/* next must be != NULL */
++static inline void hlist_add_before(struct hlist_node *n,
++ struct hlist_node *next)
++{
++ n->pprev = next->pprev;
++ n->next = next;
++ next->pprev = &n->next;
++ *(n->pprev) = n;
++}
++
++static inline void hlist_add_after(struct hlist_node *n,
++ struct hlist_node *next)
++{
++ next->next = n->next;
++ n->next = next;
++ next->pprev = &n->next;
++
++ if(next->next)
++ next->next->pprev = &next->next;
++}
++
++#define hlist_entry(ptr, type, member) \
++ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
++
++/* Cannot easily do prefetch unfortunately */
++#define hlist_for_each(pos, head) \
++ for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
++ pos = pos->next)
++
++#define hlist_for_each_safe(pos, n, head) \
++ for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
++ pos = n)
++
++/**
++ * hlist_for_each_entry - iterate over list of given type
++ * @tpos: the type * to use as a loop counter.
++ * @pos: the &struct hlist_node to use as a loop counter.
++ * @head: the head for your list.
++ * @member: the name of the hlist_node within the struct.
++ */
++#define hlist_for_each_entry(tpos, pos, head, member) \
++ for (pos = (head)->first; \
++ pos && ({ prefetch(pos->next); 1;}) && \
++ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
++ pos = pos->next)
++
++/**
++ * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
++ * @tpos: the type * to use as a loop counter.
++ * @pos: the &struct hlist_node to use as a loop counter.
++ * @member: the name of the hlist_node within the struct.
++ */
++#define hlist_for_each_entry_continue(tpos, pos, member) \
++ for (pos = (pos)->next; \
++ pos && ({ prefetch(pos->next); 1;}) && \
++ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
++ pos = pos->next)
++
++/**
++ * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
++ * @tpos: the type * to use as a loop counter.
++ * @pos: the &struct hlist_node to use as a loop counter.
++ * @member: the name of the hlist_node within the struct.
++ */
++#define hlist_for_each_entry_from(tpos, pos, member) \
++ for (; pos && ({ prefetch(pos->next); 1;}) && \
++ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
++ pos = pos->next)
++
++/**
++ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
++ * @tpos: the type * to use as a loop counter.
++ * @pos: the &struct hlist_node to use as a loop counter.
++ * @n: another &struct hlist_node to use as temporary storage
++ * @head: the head for your list.
++ * @member: the name of the hlist_node within the struct.
++ */
++#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \
++ for (pos = (head)->first; \
++ pos && ({ n = pos->next; 1; }) && \
++ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
++ pos = n)
+ #endif /* __KERNEL__ || _LVM_H_INCLUDE */
+
+ #endif
+diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
+--- a/include/linux/netdevice.h 2005-02-13 21:25:10 +11:00
++++ b/include/linux/netdevice.h 2005-02-13 21:25:10 +11:00
+@@ -96,6 +96,11 @@
+ #define MAX_HEADER (LL_MAX_HEADER + 48)
+ #endif
+
++/* Reserve 16byte aligned hard_header_len, but at least 16.
++ * Alternative is: dev->hard_header_len ? (dev->hard_header_len + 15)&~15 : 0
++ */
++#define LL_RESERVED_SPACE(dev) (((dev)->hard_header_len&~15) + 16)
++
+ /*
+ * Network device statistics. Akin to the 2.0 ether stats but
+ * with byte counters.
+@@ -499,6 +504,7 @@
+ extern int dev_queue_xmit(struct sk_buff *skb);
+ extern int register_netdevice(struct net_device *dev);
+ extern int unregister_netdevice(struct net_device *dev);
++extern void synchronize_net(void);
+ extern int register_netdevice_notifier(struct notifier_block *nb);
+ extern int unregister_netdevice_notifier(struct notifier_block *nb);
+ extern int dev_new_index(void);
+diff -Nru a/include/linux/netlink.h b/include/linux/netlink.h
+--- a/include/linux/netlink.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/netlink.h 2005-02-13 21:25:09 +11:00
+@@ -7,6 +7,7 @@
+ #define NETLINK_FIREWALL 3 /* Firewalling hook */
+ #define NETLINK_TCPDIAG 4 /* TCP socket monitoring */
+ #define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */
++#define NETLINK_XFRM 6 /* ipsec */
+ #define NETLINK_ARPD 8
+ #define NETLINK_ROUTE6 11 /* af_inet6 route comm channel */
+ #define NETLINK_IP6_FW 13
+@@ -87,6 +88,8 @@
+
+ #ifdef __KERNEL__
+
++#include <linux/capability.h>
++
+ struct netlink_skb_parms
+ {
+ struct ucred creds; /* Skb credentials */
+@@ -108,8 +111,8 @@
+ extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len));
+ extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
+ extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
+-extern void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
+- __u32 group, int allocation);
++extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
++ __u32 group, int allocation);
+ extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
+ extern int netlink_register_notifier(struct notifier_block *nb);
+ extern int netlink_unregister_notifier(struct notifier_block *nb);
+diff -Nru a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/linux/pfkeyv2.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,335 @@
++/* PF_KEY user interface, this is defined by rfc2367 so
++ * do not make arbitrary modifications or else this header
++ * file will not be compliant.
++ */
++
++#ifndef _LINUX_PFKEY2_H
++#define _LINUX_PFKEY2_H
++
++#include <linux/types.h>
++
++#define PF_KEY_V2 2
++#define PFKEYV2_REVISION 199806L
++
++struct sadb_msg {
++ uint8_t sadb_msg_version;
++ uint8_t sadb_msg_type;
++ uint8_t sadb_msg_errno;
++ uint8_t sadb_msg_satype;
++ uint16_t sadb_msg_len;
++ uint16_t sadb_msg_reserved;
++ uint32_t sadb_msg_seq;
++ uint32_t sadb_msg_pid;
++} __attribute__((packed));
++/* sizeof(struct sadb_msg) == 16 */
++
++struct sadb_ext {
++ uint16_t sadb_ext_len;
++ uint16_t sadb_ext_type;
++} __attribute__((packed));
++/* sizeof(struct sadb_ext) == 4 */
++
++struct sadb_sa {
++ uint16_t sadb_sa_len;
++ uint16_t sadb_sa_exttype;
++ uint32_t sadb_sa_spi;
++ uint8_t sadb_sa_replay;
++ uint8_t sadb_sa_state;
++ uint8_t sadb_sa_auth;
++ uint8_t sadb_sa_encrypt;
++ uint32_t sadb_sa_flags;
++} __attribute__((packed));
++/* sizeof(struct sadb_sa) == 16 */
++
++struct sadb_lifetime {
++ uint16_t sadb_lifetime_len;
++ uint16_t sadb_lifetime_exttype;
++ uint32_t sadb_lifetime_allocations;
++ uint64_t sadb_lifetime_bytes;
++ uint64_t sadb_lifetime_addtime;
++ uint64_t sadb_lifetime_usetime;
++} __attribute__((packed));
++/* sizeof(struct sadb_lifetime) == 32 */
++
++struct sadb_address {
++ uint16_t sadb_address_len;
++ uint16_t sadb_address_exttype;
++ uint8_t sadb_address_proto;
++ uint8_t sadb_address_prefixlen;
++ uint16_t sadb_address_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_address) == 8 */
++
++struct sadb_key {
++ uint16_t sadb_key_len;
++ uint16_t sadb_key_exttype;
++ uint16_t sadb_key_bits;
++ uint16_t sadb_key_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_key) == 8 */
++
++struct sadb_ident {
++ uint16_t sadb_ident_len;
++ uint16_t sadb_ident_exttype;
++ uint16_t sadb_ident_type;
++ uint16_t sadb_ident_reserved;
++ uint64_t sadb_ident_id;
++} __attribute__((packed));
++/* sizeof(struct sadb_ident) == 16 */
++
++struct sadb_sens {
++ uint16_t sadb_sens_len;
++ uint16_t sadb_sens_exttype;
++ uint32_t sadb_sens_dpd;
++ uint8_t sadb_sens_sens_level;
++ uint8_t sadb_sens_sens_len;
++ uint8_t sadb_sens_integ_level;
++ uint8_t sadb_sens_integ_len;
++ uint32_t sadb_sens_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_sens) == 16 */
++
++/* followed by:
++ uint64_t sadb_sens_bitmap[sens_len];
++ uint64_t sadb_integ_bitmap[integ_len]; */
++
++struct sadb_prop {
++ uint16_t sadb_prop_len;
++ uint16_t sadb_prop_exttype;
++ uint8_t sadb_prop_replay;
++ uint8_t sadb_prop_reserved[3];
++} __attribute__((packed));
++/* sizeof(struct sadb_prop) == 8 */
++
++/* followed by:
++ struct sadb_comb sadb_combs[(sadb_prop_len +
++ sizeof(uint64_t) - sizeof(struct sadb_prop)) /
++ sizeof(struct sadb_comb)]; */
++
++struct sadb_comb {
++ uint8_t sadb_comb_auth;
++ uint8_t sadb_comb_encrypt;
++ uint16_t sadb_comb_flags;
++ uint16_t sadb_comb_auth_minbits;
++ uint16_t sadb_comb_auth_maxbits;
++ uint16_t sadb_comb_encrypt_minbits;
++ uint16_t sadb_comb_encrypt_maxbits;
++ uint32_t sadb_comb_reserved;
++ uint32_t sadb_comb_soft_allocations;
++ uint32_t sadb_comb_hard_allocations;
++ uint64_t sadb_comb_soft_bytes;
++ uint64_t sadb_comb_hard_bytes;
++ uint64_t sadb_comb_soft_addtime;
++ uint64_t sadb_comb_hard_addtime;
++ uint64_t sadb_comb_soft_usetime;
++ uint64_t sadb_comb_hard_usetime;
++} __attribute__((packed));
++/* sizeof(struct sadb_comb) == 72 */
++
++struct sadb_supported {
++ uint16_t sadb_supported_len;
++ uint16_t sadb_supported_exttype;
++ uint32_t sadb_supported_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_supported) == 8 */
++
++/* followed by:
++ struct sadb_alg sadb_algs[(sadb_supported_len +
++ sizeof(uint64_t) - sizeof(struct sadb_supported)) /
++ sizeof(struct sadb_alg)]; */
++
++struct sadb_alg {
++ uint8_t sadb_alg_id;
++ uint8_t sadb_alg_ivlen;
++ uint16_t sadb_alg_minbits;
++ uint16_t sadb_alg_maxbits;
++ uint16_t sadb_alg_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_alg) == 8 */
++
++struct sadb_spirange {
++ uint16_t sadb_spirange_len;
++ uint16_t sadb_spirange_exttype;
++ uint32_t sadb_spirange_min;
++ uint32_t sadb_spirange_max;
++ uint32_t sadb_spirange_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_spirange) == 16 */
++
++struct sadb_x_kmprivate {
++ uint16_t sadb_x_kmprivate_len;
++ uint16_t sadb_x_kmprivate_exttype;
++ u_int32_t sadb_x_kmprivate_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_x_kmprivate) == 8 */
++
++struct sadb_x_sa2 {
++ uint16_t sadb_x_sa2_len;
++ uint16_t sadb_x_sa2_exttype;
++ uint8_t sadb_x_sa2_mode;
++ uint8_t sadb_x_sa2_reserved1;
++ uint16_t sadb_x_sa2_reserved2;
++ uint32_t sadb_x_sa2_sequence;
++ uint32_t sadb_x_sa2_reqid;
++} __attribute__((packed));
++/* sizeof(struct sadb_x_sa2) == 16 */
++
++struct sadb_x_policy {
++ uint16_t sadb_x_policy_len;
++ uint16_t sadb_x_policy_exttype;
++ uint16_t sadb_x_policy_type;
++ uint8_t sadb_x_policy_dir;
++ uint8_t sadb_x_policy_reserved;
++ uint32_t sadb_x_policy_id;
++ uint32_t sadb_x_policy_priority;
++} __attribute__((packed));
++/* sizeof(struct sadb_x_policy) == 16 */
++
++struct sadb_x_ipsecrequest {
++ uint16_t sadb_x_ipsecrequest_len;
++ uint16_t sadb_x_ipsecrequest_proto;
++ uint8_t sadb_x_ipsecrequest_mode;
++ uint8_t sadb_x_ipsecrequest_level;
++ uint16_t sadb_x_ipsecrequest_reserved1;
++ uint32_t sadb_x_ipsecrequest_reqid;
++ uint32_t sadb_x_ipsecrequest_reserved2;
++} __attribute__((packed));
++/* sizeof(struct sadb_x_ipsecrequest) == 16 */
++
++/* This defines the TYPE of Nat Traversal in use. Currently only one
++ * type of NAT-T is supported, draft-ietf-ipsec-udp-encaps-06
++ */
++struct sadb_x_nat_t_type {
++ uint16_t sadb_x_nat_t_type_len;
++ uint16_t sadb_x_nat_t_type_exttype;
++ uint8_t sadb_x_nat_t_type_type;
++ uint8_t sadb_x_nat_t_type_reserved[3];
++} __attribute__((packed));
++/* sizeof(struct sadb_x_nat_t_type) == 8 */
++
++/* Pass a NAT Traversal port (Source or Dest port) */
++struct sadb_x_nat_t_port {
++ uint16_t sadb_x_nat_t_port_len;
++ uint16_t sadb_x_nat_t_port_exttype;
++ uint16_t sadb_x_nat_t_port_port;
++ uint16_t sadb_x_nat_t_port_reserved;
++} __attribute__((packed));
++/* sizeof(struct sadb_x_nat_t_port) == 8 */
++
++/* Message types */
++#define SADB_RESERVED 0
++#define SADB_GETSPI 1
++#define SADB_UPDATE 2
++#define SADB_ADD 3
++#define SADB_DELETE 4
++#define SADB_GET 5
++#define SADB_ACQUIRE 6
++#define SADB_REGISTER 7
++#define SADB_EXPIRE 8
++#define SADB_FLUSH 9
++#define SADB_DUMP 10
++#define SADB_X_PROMISC 11
++#define SADB_X_PCHANGE 12
++#define SADB_X_SPDUPDATE 13
++#define SADB_X_SPDADD 14
++#define SADB_X_SPDDELETE 15
++#define SADB_X_SPDGET 16
++#define SADB_X_SPDACQUIRE 17
++#define SADB_X_SPDDUMP 18
++#define SADB_X_SPDFLUSH 19
++#define SADB_X_SPDSETIDX 20
++#define SADB_X_SPDEXPIRE 21
++#define SADB_X_SPDDELETE2 22
++#define SADB_X_NAT_T_NEW_MAPPING 23
++#define SADB_MAX 23
++
++/* Security Association flags */
++#define SADB_SAFLAGS_PFS 1
++#define SADB_SAFLAGS_NOECN 0x80000000
++
++/* Security Association states */
++#define SADB_SASTATE_LARVAL 0
++#define SADB_SASTATE_MATURE 1
++#define SADB_SASTATE_DYING 2
++#define SADB_SASTATE_DEAD 3
++#define SADB_SASTATE_MAX 3
++
++/* Security Association types */
++#define SADB_SATYPE_UNSPEC 0
++#define SADB_SATYPE_AH 2
++#define SADB_SATYPE_ESP 3
++#define SADB_SATYPE_RSVP 5
++#define SADB_SATYPE_OSPFV2 6
++#define SADB_SATYPE_RIPV2 7
++#define SADB_SATYPE_MIP 8
++#define SADB_X_SATYPE_IPCOMP 9
++#define SADB_SATYPE_MAX 9
++
++/* Authentication algorithms */
++#define SADB_AALG_NONE 0
++#define SADB_AALG_MD5HMAC 2
++#define SADB_AALG_SHA1HMAC 3
++#define SADB_X_AALG_SHA2_256HMAC 5
++#define SADB_X_AALG_SHA2_384HMAC 6
++#define SADB_X_AALG_SHA2_512HMAC 7
++#define SADB_X_AALG_RIPEMD160HMAC 8
++#define SADB_X_AALG_NULL 251 /* kame */
++#define SADB_AALG_MAX 251
++
++/* Encryption algorithms */
++#define SADB_EALG_NONE 0
++#define SADB_EALG_DESCBC 2
++#define SADB_EALG_3DESCBC 3
++#define SADB_X_EALG_CASTCBC 6
++#define SADB_X_EALG_BLOWFISHCBC 7
++#define SADB_EALG_NULL 11
++#define SADB_X_EALG_AESCBC 12
++#define SADB_EALG_MAX 253 /* last EALG */
++/* private allocations should use 249-255 (RFC2407) */
++#define SADB_X_EALG_SERPENTCBC 252 /* draft-ietf-ipsec-ciph-aes-cbc-00 */
++#define SADB_X_EALG_TWOFISHCBC 253 /* draft-ietf-ipsec-ciph-aes-cbc-00 */
++
++/* Compression algorithms */
++#define SADB_X_CALG_NONE 0
++#define SADB_X_CALG_OUI 1
++#define SADB_X_CALG_DEFLATE 2
++#define SADB_X_CALG_LZS 3
++#define SADB_X_CALG_LZJH 4
++#define SADB_X_CALG_MAX 4
++
++/* Extension Header values */
++#define SADB_EXT_RESERVED 0
++#define SADB_EXT_SA 1
++#define SADB_EXT_LIFETIME_CURRENT 2
++#define SADB_EXT_LIFETIME_HARD 3
++#define SADB_EXT_LIFETIME_SOFT 4
++#define SADB_EXT_ADDRESS_SRC 5
++#define SADB_EXT_ADDRESS_DST 6
++#define SADB_EXT_ADDRESS_PROXY 7
++#define SADB_EXT_KEY_AUTH 8
++#define SADB_EXT_KEY_ENCRYPT 9
++#define SADB_EXT_IDENTITY_SRC 10
++#define SADB_EXT_IDENTITY_DST 11
++#define SADB_EXT_SENSITIVITY 12
++#define SADB_EXT_PROPOSAL 13
++#define SADB_EXT_SUPPORTED_AUTH 14
++#define SADB_EXT_SUPPORTED_ENCRYPT 15
++#define SADB_EXT_SPIRANGE 16
++#define SADB_X_EXT_KMPRIVATE 17
++#define SADB_X_EXT_POLICY 18
++#define SADB_X_EXT_SA2 19
++/* The next four entries are for setting up NAT Traversal */
++#define SADB_X_EXT_NAT_T_TYPE 20
++#define SADB_X_EXT_NAT_T_SPORT 21
++#define SADB_X_EXT_NAT_T_DPORT 22
++#define SADB_X_EXT_NAT_T_OA 23
++#define SADB_EXT_MAX 23
++
++/* Identity Extension values */
++#define SADB_IDENTTYPE_RESERVED 0
++#define SADB_IDENTTYPE_PREFIX 1
++#define SADB_IDENTTYPE_FQDN 2
++#define SADB_IDENTTYPE_USERFQDN 3
++#define SADB_IDENTTYPE_MAX 3
++
++#endif /* !(_LINUX_PFKEY2_H) */
+diff -Nru a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h
+--- a/include/linux/ppp_defs.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/ppp_defs.h 2005-02-13 21:25:09 +11:00
+@@ -74,12 +74,15 @@
+ #define PPP_IPV6 0x57 /* Internet Protocol Version 6 */
+ #define PPP_COMPFRAG 0xfb /* fragment compressed below bundle */
+ #define PPP_COMP 0xfd /* compressed packet */
++#define PPP_MPLS_UC 0x0281 /* Multi Protocol Label Switching - Unicast */
++#define PPP_MPLS_MC 0x0283 /* Multi Protocol Label Switching - Multicast */
+ #define PPP_IPCP 0x8021 /* IP Control Protocol */
+ #define PPP_ATCP 0x8029 /* AppleTalk Control Protocol */
+ #define PPP_IPXCP 0x802b /* IPX Control Protocol */
+ #define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */
+ #define PPP_CCPFRAG 0x80fb /* CCP at link level (below MP bundle) */
+ #define PPP_CCP 0x80fd /* Compression Control Protocol */
++#define PPP_MPLSCP 0x80fd /* MPLS Control Protocol */
+ #define PPP_LCP 0xc021 /* Link Control Protocol */
+ #define PPP_PAP 0xc023 /* Password Authentication Protocol */
+ #define PPP_LQR 0xc025 /* Link Quality Report protocol */
+diff -Nru a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
+--- a/include/linux/rtnetlink.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/rtnetlink.h 2005-02-13 21:25:09 +11:00
+@@ -204,6 +204,7 @@
+ RTA_PROTOINFO,
+ RTA_FLOW,
+ RTA_CACHEINFO,
++ RTA_SESSION,
+ __RTA_MAX
+ };
+
+@@ -286,11 +287,40 @@
+ #define RTAX_ADVMSS RTAX_ADVMSS
+ RTAX_REORDERING,
+ #define RTAX_REORDERING RTAX_REORDERING
++ RTAX_HOPLIMIT,
++#define RTAX_HOPLIMIT RTAX_HOPLIMIT
++ RTAX_INITCWND,
++#define RTAX_INITCWND RTAX_INITCWND
++ RTAX_FEATURES,
++#define RTAX_FEATURES RTAX_FEATURES
+ __RTAX_MAX
+ };
+
+ #define RTAX_MAX (__RTAX_MAX - 1)
+
++#define RTAX_FEATURE_ECN 0x00000001
++#define RTAX_FEATURE_SACK 0x00000002
++#define RTAX_FEATURE_TIMESTAMP 0x00000004
++
++struct rta_session
++{
++ __u8 proto;
++
++ union {
++ struct {
++ __u16 sport;
++ __u16 dport;
++ } ports;
++
++ struct {
++ __u8 type;
++ __u8 code;
++ __u16 ident;
++ } icmpt;
++
++ __u32 spi;
++ } u;
++};
+
+
+ /*********************************************************
+@@ -323,6 +353,7 @@
+ /* ifa_flags */
+
+ #define IFA_F_SECONDARY 0x01
++#define IFA_F_TEMPORARY IFA_F_SECONDARY
+
+ #define IFA_F_DEPRECATED 0x20
+ #define IFA_F_TENTATIVE 0x40
+@@ -585,7 +616,7 @@
+ extern struct rtnetlink_link * rtnetlink_links[NPROTO];
+ extern int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb);
+ extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
+-extern int rtnetlink_put_metrics(struct sk_buff *skb, unsigned *metrics);
++extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
+
+ extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
+
+diff -Nru a/include/linux/skbuff.h b/include/linux/skbuff.h
+--- a/include/linux/skbuff.h 2005-02-13 21:25:08 +11:00
++++ b/include/linux/skbuff.h 2005-02-13 21:25:08 +11:00
+@@ -148,6 +148,7 @@
+ struct icmphdr *icmph;
+ struct igmphdr *igmph;
+ struct iphdr *ipiph;
++ struct ipv6hdr *ipv6h;
+ struct spxhdr *spxh;
+ unsigned char *raw;
+ } h;
+@@ -169,7 +170,8 @@
+ unsigned char *raw;
+ } mac;
+
+- struct dst_entry *dst;
++ struct dst_entry *dst;
++ struct sec_path *sp;
+
+ /*
+ * This is the control buffer. It is free to use for every
+@@ -181,8 +183,9 @@
+
+ unsigned int len; /* Length of actual data */
+ unsigned int data_len;
++ unsigned int mac_len; /* Length of link layer header */
+ unsigned int csum; /* Checksum */
+- unsigned char __unused, /* Dead field, may be reused */
++ unsigned char local_df,
+ cloned, /* head may be cloned (check refcnt to be sure). */
+ pkt_type, /* Packet class */
+ ip_summed; /* Driver fed us an IP checksum */
+@@ -756,6 +759,24 @@
+ static inline unsigned int skb_headlen(const struct sk_buff *skb)
+ {
+ return skb->len - skb->data_len;
++}
++
++static inline int skb_pagelen(const struct sk_buff *skb)
++{
++ int i, len = 0;
++
++ for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
++ len += skb_shinfo(skb)->frags[i].size;
++ return len + skb_headlen(skb);
++}
++
++static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
++{
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
++ frag->page = page;
++ frag->page_offset = off;
++ frag->size = size;
++ skb_shinfo(skb)->nr_frags = i+1;
+ }
+
+ #define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) out_of_line_bug(); } while (0)
+diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
+--- a/include/linux/sysctl.h 2005-02-13 21:25:08 +11:00
++++ b/include/linux/sysctl.h 2005-02-13 21:25:08 +11:00
+@@ -373,6 +373,8 @@
+ NET_IPV4_CONF_TAG=12,
+ NET_IPV4_CONF_ARPFILTER=13,
+ NET_IPV4_CONF_MEDIUM_ID=14,
++ NET_IPV4_CONF_NOXFRM=15,
++ NET_IPV4_CONF_NOPOLICY=16,
+ NET_IPV4_CONF_FORCE_IGMP_VERSION=17,
+ NET_IPV4_CONF_ARP_ANNOUNCE=18,
+ NET_IPV4_CONF_ARP_IGNORE=19,
+@@ -429,7 +431,12 @@
+ NET_IPV6_DAD_TRANSMITS=7,
+ NET_IPV6_RTR_SOLICITS=8,
+ NET_IPV6_RTR_SOLICIT_INTERVAL=9,
+- NET_IPV6_RTR_SOLICIT_DELAY=10
++ NET_IPV6_RTR_SOLICIT_DELAY=10,
++ NET_IPV6_USE_TEMPADDR=11,
++ NET_IPV6_TEMP_VALID_LFT=12,
++ NET_IPV6_TEMP_PREFERED_LFT=13,
++ NET_IPV6_REGEN_MAX_RETRY=14,
++ NET_IPV6_MAX_DESYNC_FACTOR=15
+ };
+
+ /* /proc/sys/net/ipv6/icmp */
+diff -Nru a/include/linux/timer.h b/include/linux/timer.h
+--- a/include/linux/timer.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/timer.h 2005-02-13 21:25:09 +11:00
+@@ -3,6 +3,7 @@
+
+ #include <linux/config.h>
+ #include <linux/list.h>
++#include <linux/stddef.h>
+
+ /*
+ * In Linux 2.4, static timers have been removed from the kernel.
+diff -Nru a/include/linux/udp.h b/include/linux/udp.h
+--- a/include/linux/udp.h 2005-02-13 21:25:09 +11:00
++++ b/include/linux/udp.h 2005-02-13 21:25:09 +11:00
+@@ -17,6 +17,7 @@
+ #ifndef _LINUX_UDP_H
+ #define _LINUX_UDP_H
+
++#include <linux/types.h>
+
+ struct udphdr {
+ __u16 source;
+@@ -25,5 +26,12 @@
+ __u16 check;
+ };
+
++/* UDP socket options */
++#define UDP_CORK 1 /* Never send partially complete segments */
++#define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */
++
++/* UDP encapsulation types */
++#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
++#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */
+
+ #endif /* _LINUX_UDP_H */
+diff -Nru a/include/linux/xfrm.h b/include/linux/xfrm.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/linux/xfrm.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,257 @@
++#ifndef _LINUX_XFRM_H
++#define _LINUX_XFRM_H
++
++#include <linux/types.h>
++
++/* All of the structures in this file may not change size as they are
++ * passed into the kernel from userspace via netlink sockets.
++ */
++
++/* Structure to encapsulate addresses. I do not want to use
++ * "standard" structure. My apologies.
++ */
++typedef union
++{
++ __u32 a4;
++ __u32 a6[4];
++} xfrm_address_t;
++
++/* Ident of a specific xfrm_state. It is used on input to lookup
++ * the state by (spi,daddr,ah/esp) or to store information about
++ * spi, protocol and tunnel address on output.
++ */
++struct xfrm_id
++{
++ xfrm_address_t daddr;
++ __u32 spi;
++ __u8 proto;
++};
++
++/* Selector, used as selector both on policy rules (SPD) and SAs. */
++
++struct xfrm_selector
++{
++ xfrm_address_t daddr;
++ xfrm_address_t saddr;
++ __u16 dport;
++ __u16 dport_mask;
++ __u16 sport;
++ __u16 sport_mask;
++ __u16 family;
++ __u8 prefixlen_d;
++ __u8 prefixlen_s;
++ __u8 proto;
++ int ifindex;
++ uid_t user;
++};
++
++#define XFRM_INF (~(__u64)0)
++
++struct xfrm_lifetime_cfg
++{
++ __u64 soft_byte_limit;
++ __u64 hard_byte_limit;
++ __u64 soft_packet_limit;
++ __u64 hard_packet_limit;
++ __u64 soft_add_expires_seconds;
++ __u64 hard_add_expires_seconds;
++ __u64 soft_use_expires_seconds;
++ __u64 hard_use_expires_seconds;
++};
++
++struct xfrm_lifetime_cur
++{
++ __u64 bytes;
++ __u64 packets;
++ __u64 add_time;
++ __u64 use_time;
++};
++
++struct xfrm_replay_state
++{
++ __u32 oseq;
++ __u32 seq;
++ __u32 bitmap;
++};
++
++struct xfrm_algo {
++ char alg_name[64];
++ int alg_key_len; /* in bits */
++ char alg_key[0];
++};
++
++struct xfrm_stats {
++ __u32 replay_window;
++ __u32 replay;
++ __u32 integrity_failed;
++};
++
++enum
++{
++ XFRM_POLICY_IN = 0,
++ XFRM_POLICY_OUT = 1,
++ XFRM_POLICY_FWD = 2,
++ XFRM_POLICY_MAX = 3
++};
++
++enum
++{
++ XFRM_SHARE_ANY, /* No limitations */
++ XFRM_SHARE_SESSION, /* For this session only */
++ XFRM_SHARE_USER, /* For this user only */
++ XFRM_SHARE_UNIQUE /* Use once */
++};
++
++/* Netlink configuration messages. */
++enum {
++ XFRM_MSG_BASE = 0x10,
++
++ XFRM_MSG_NEWSA = 0x10,
++#define XFRM_MSG_NEWSA XFRM_MSG_NEWSA
++ XFRM_MSG_DELSA,
++#define XFRM_MSG_DELSA XFRM_MSG_DELSA
++ XFRM_MSG_GETSA,
++#define XFRM_MSG_GETSA XFRM_MSG_GETSA
++
++ XFRM_MSG_NEWPOLICY,
++#define XFRM_MSG_NEWPOLICY XFRM_MSG_NEWPOLICY
++ XFRM_MSG_DELPOLICY,
++#define XFRM_MSG_DELPOLICY XFRM_MSG_DELPOLICY
++ XFRM_MSG_GETPOLICY,
++#define XFRM_MSG_GETPOLICY XFRM_MSG_GETPOLICY
++
++ XFRM_MSG_ALLOCSPI,
++#define XFRM_MSG_ALLOCSPI XFRM_MSG_ALLOCSPI
++ XFRM_MSG_ACQUIRE,
++#define XFRM_MSG_ACQUIRE XFRM_MSG_ACQUIRE
++ XFRM_MSG_EXPIRE,
++#define XFRM_MSG_EXPIRE XFRM_MSG_EXPIRE
++
++ XFRM_MSG_UPDPOLICY,
++#define XFRM_MSG_UPDPOLICY XFRM_MSG_UPDPOLICY
++ XFRM_MSG_UPDSA,
++#define XFRM_MSG_UPDSA XFRM_MSG_UPDSA
++
++ XFRM_MSG_POLEXPIRE,
++#define XFRM_MSG_POLEXPIRE XFRM_MSG_POLEXPIRE
++
++ XFRM_MSG_FLUSHSA,
++#define XFRM_MSG_FLUSHSA XFRM_MSG_FLUSHSA
++ XFRM_MSG_FLUSHPOLICY,
++#define XFRM_MSG_FLUSHPOLICY XFRM_MSG_FLUSHPOLICY
++
++ XFRM_MSG_MAX
++};
++
++struct xfrm_user_tmpl {
++ struct xfrm_id id;
++ __u16 family;
++ xfrm_address_t saddr;
++ __u32 reqid;
++ __u8 mode;
++ __u8 share;
++ __u8 optional;
++ __u32 aalgos;
++ __u32 ealgos;
++ __u32 calgos;
++};
++
++struct xfrm_encap_tmpl {
++ __u16 encap_type;
++ __u16 encap_sport;
++ __u16 encap_dport;
++ xfrm_address_t encap_oa;
++};
++
++/* Netlink message attributes. */
++enum xfrm_attr_type_t {
++ XFRMA_UNSPEC,
++ XFRMA_ALG_AUTH, /* struct xfrm_algo */
++ XFRMA_ALG_CRYPT, /* struct xfrm_algo */
++ XFRMA_ALG_COMP, /* struct xfrm_algo */
++ XFRMA_ENCAP, /* struct xfrm_algo + struct xfrm_encap_tmpl */
++ XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */
++ __XFRMA_MAX
++
++#define XFRMA_MAX (__XFRMA_MAX - 1)
++};
++
++struct xfrm_usersa_info {
++ struct xfrm_selector sel;
++ struct xfrm_id id;
++ xfrm_address_t saddr;
++ struct xfrm_lifetime_cfg lft;
++ struct xfrm_lifetime_cur curlft;
++ struct xfrm_stats stats;
++ __u32 seq;
++ __u32 reqid;
++ __u16 family;
++ __u8 mode; /* 0=transport,1=tunnel */
++ __u8 replay_window;
++ __u8 flags;
++#define XFRM_STATE_NOECN 1
++};
++
++struct xfrm_usersa_id {
++ xfrm_address_t daddr;
++ __u32 spi;
++ __u16 family;
++ __u8 proto;
++};
++
++struct xfrm_userspi_info {
++ struct xfrm_usersa_info info;
++ __u32 min;
++ __u32 max;
++};
++
++struct xfrm_userpolicy_info {
++ struct xfrm_selector sel;
++ struct xfrm_lifetime_cfg lft;
++ struct xfrm_lifetime_cur curlft;
++ __u32 priority;
++ __u32 index;
++ __u8 dir;
++ __u8 action;
++#define XFRM_POLICY_ALLOW 0
++#define XFRM_POLICY_BLOCK 1
++ __u8 flags;
++#define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */
++ __u8 share;
++};
++
++struct xfrm_userpolicy_id {
++ struct xfrm_selector sel;
++ __u32 index;
++ __u8 dir;
++};
++
++struct xfrm_user_acquire {
++ struct xfrm_id id;
++ xfrm_address_t saddr;
++ struct xfrm_selector sel;
++ struct xfrm_userpolicy_info policy;
++ __u32 aalgos;
++ __u32 ealgos;
++ __u32 calgos;
++ __u32 seq;
++};
++
++struct xfrm_user_expire {
++ struct xfrm_usersa_info state;
++ __u8 hard;
++};
++
++struct xfrm_user_polexpire {
++ struct xfrm_userpolicy_info pol;
++ __u8 hard;
++};
++
++struct xfrm_usersa_flush {
++ __u8 proto;
++};
++
++#define XFRMGRP_ACQUIRE 1
++#define XFRMGRP_EXPIRE 2
++
++#endif /* _LINUX_XFRM_H */
+diff -Nru a/include/net/addrconf.h b/include/net/addrconf.h
+--- a/include/net/addrconf.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/addrconf.h 2005-02-13 21:25:09 +11:00
+@@ -6,6 +6,13 @@
+ #define MAX_RTR_SOLICITATIONS 3
+ #define RTR_SOLICITATION_INTERVAL (4*HZ)
+
++#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */
++
++#define TEMP_VALID_LIFETIME (7*86400)
++#define TEMP_PREFERRED_LIFETIME (86400)
++#define REGEN_MAX_RETRY (5)
++#define MAX_DESYNC_FACTOR (600)
++
+ #define ADDR_CHECK_FREQUENCY (120*HZ)
+
+ struct prefix_info {
+diff -Nru a/include/net/ah.h b/include/net/ah.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/net/ah.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,35 @@
++#ifndef _NET_AH_H
++#define _NET_AH_H
++
++#include <net/xfrm.h>
++
++/* This is the maximum truncated ICV length that we know of. */
++#define MAX_AH_AUTH_LEN 12
++
++struct ah_data
++{
++ u8 *key;
++ int key_len;
++ u8 *work_icv;
++ int icv_full_len;
++ int icv_trunc_len;
++
++ void (*icv)(struct ah_data*,
++ struct sk_buff *skb, u8 *icv);
++
++ struct crypto_tfm *tfm;
++};
++
++static inline void
++ah_hmac_digest(struct ah_data *ahp, struct sk_buff *skb, u8 *auth_data)
++{
++ struct crypto_tfm *tfm = ahp->tfm;
++
++ memset(auth_data, 0, ahp->icv_trunc_len);
++ crypto_hmac_init(tfm, ahp->key, &ahp->key_len);
++ skb_icv_walk(skb, tfm, 0, skb->len, crypto_hmac_update);
++ crypto_hmac_final(tfm, ahp->key, &ahp->key_len, ahp->work_icv);
++ memcpy(auth_data, ahp->work_icv, ahp->icv_trunc_len);
++}
++
++#endif
+diff -Nru a/include/net/dn_fib.h b/include/net/dn_fib.h
+--- a/include/net/dn_fib.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/dn_fib.h 2005-02-13 21:25:09 +11:00
+@@ -7,6 +7,9 @@
+
+ #include <linux/rtnetlink.h>
+
++/* WARNING: The ordering of these elements must match ordering
++ * of RTA_* rtnetlink attribute numbers.
++ */
+ struct dn_kern_rta
+ {
+ void *rta_dst;
+@@ -19,8 +22,9 @@
+ struct rtattr *rta_mx;
+ struct rtattr *rta_mp;
+ unsigned char *rta_protoinfo;
+- unsigned char *rta_flow;
++ u32 *rta_flow;
+ struct rta_cacheinfo *rta_ci;
++ struct rta_session *rta_sess;
+ };
+
+ struct dn_fib_key {
+diff -Nru a/include/net/dn_route.h b/include/net/dn_route.h
+--- a/include/net/dn_route.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/dn_route.h 2005-02-13 21:25:09 +11:00
+@@ -122,7 +122,7 @@
+ if ((dst = sk->dst_cache) && !dst->obsolete) {
+ try_again:
+ skb->dst = dst_clone(dst);
+- dst->output(skb);
++ dst_output(skb);
+ return;
+ }
+
+diff -Nru a/include/net/dst.h b/include/net/dst.h
+--- a/include/net/dst.h 2005-02-13 21:25:10 +11:00
++++ b/include/net/dst.h 2005-02-13 21:25:10 +11:00
+@@ -9,6 +9,8 @@
+ #define _NET_DST_H
+
+ #include <linux/config.h>
++#include <linux/rtnetlink.h>
++#include <linux/netdevice.h>
+ #include <net/neighbour.h>
+
+ /*
+@@ -22,6 +24,13 @@
+ #define DST_GC_INC (HZ/2)
+ #define DST_GC_MAX (120*HZ)
+
++/* Each dst_entry has reference count and sits in some parent list(s).
++ * When it is removed from parent list, it is "freed" (dst_free).
++ * After this it enters dead state (dst->obsolete > 0) and if its refcnt
++ * is zero, it can be destroyed immediately, otherwise it is added
++ * to gc list and garbage collector periodically checks the refcnt.
++ */
++
+ struct sk_buff;
+
+ struct dst_entry
+@@ -29,22 +38,22 @@
+ struct dst_entry *next;
+ atomic_t __refcnt; /* client references */
+ int __use;
++ struct dst_entry *child;
+ struct net_device *dev;
+ int obsolete;
+ int flags;
+ #define DST_HOST 1
++#define DST_NOXFRM 2
++#define DST_NOPOLICY 4
++#define DST_NOHASH 8
+ unsigned long lastuse;
+ unsigned long expires;
+
+- unsigned mxlock;
+- unsigned pmtu;
+- unsigned window;
+- unsigned rtt;
+- unsigned rttvar;
+- unsigned ssthresh;
+- unsigned cwnd;
+- unsigned advmss;
+- unsigned reordering;
++ unsigned short header_len; /* more space at head required */
++ unsigned short trailer_len; /* space to reserve at tail */
++
++ u32 metrics[RTAX_MAX];
++ struct dst_entry *path;
+
+ unsigned long rate_last; /* rate limiting for ICMP */
+ unsigned long rate_tokens;
+@@ -53,6 +62,7 @@
+
+ struct neighbour *neighbour;
+ struct hh_cache *hh;
++ struct xfrm_state *xfrm;
+
+ int (*input)(struct sk_buff*);
+ int (*output)(struct sk_buff*);
+@@ -75,11 +85,11 @@
+
+ int (*gc)(void);
+ struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
+- struct dst_entry * (*reroute)(struct dst_entry *,
+- struct sk_buff *);
+ void (*destroy)(struct dst_entry *);
+ struct dst_entry * (*negative_advice)(struct dst_entry *);
+ void (*link_failure)(struct sk_buff *);
++ void (*update_pmtu)(struct dst_entry *dst, u32 mtu);
++ int (*get_mss)(struct dst_entry *dst, u32 mtu);
+ int entry_size;
+
+ atomic_t entries;
+@@ -88,6 +98,33 @@
+
+ #ifdef __KERNEL__
+
++static inline u32
++dst_metric(struct dst_entry *dst, int metric)
++{
++ return dst->metrics[metric-1];
++}
++
++static inline u32
++dst_path_metric(struct dst_entry *dst, int metric)
++{
++ return dst->path->metrics[metric-1];
++}
++
++static inline u32
++dst_pmtu(struct dst_entry *dst)
++{
++ u32 mtu = dst_path_metric(dst, RTAX_MTU);
++ /* Yes, _exactly_. This is paranoia. */
++ barrier();
++ return mtu;
++}
++
++static inline int
++dst_metric_locked(struct dst_entry *dst, int metric)
++{
++ return dst_metric(dst, RTAX_LOCK) & (1<<metric);
++}
++
+ static inline void dst_hold(struct dst_entry * dst)
+ {
+ atomic_inc(&dst->__refcnt);
+@@ -104,22 +141,40 @@
+ static inline
+ void dst_release(struct dst_entry * dst)
+ {
+- if (dst)
++ if (dst) {
++ if (atomic_read(&dst->__refcnt) < 1) {
++ printk("BUG: dst underflow %d: %p\n",
++ atomic_read(&dst->__refcnt),
++ current_text_addr());
++ }
+ atomic_dec(&dst->__refcnt);
++ }
++}
++
++/* Children define the path of the packet through the
++ * Linux networking. Thus, destinations are stackable.
++ */
++
++static inline struct dst_entry *dst_pop(struct dst_entry *dst)
++{
++ struct dst_entry *child = dst_clone(dst->child);
++
++ dst_release(dst);
++ return child;
+ }
+
+ extern void * dst_alloc(struct dst_ops * ops);
+ extern void __dst_free(struct dst_entry * dst);
+-extern void dst_destroy(struct dst_entry * dst);
++extern struct dst_entry *dst_destroy(struct dst_entry * dst);
+
+-static inline
+-void dst_free(struct dst_entry * dst)
++static inline void dst_free(struct dst_entry * dst)
+ {
+ if (dst->obsolete > 1)
+ return;
+ if (!atomic_read(&dst->__refcnt)) {
+- dst_destroy(dst);
+- return;
++ dst = dst_destroy(dst);
++ if (!dst)
++ return;
+ }
+ __dst_free(dst);
+ }
+@@ -155,8 +210,50 @@
+ dst->expires = expires;
+ }
+
++/* Output packet to network from transport. */
++static inline int dst_output(struct sk_buff *skb)
++{
++ int err;
++
++ for (;;) {
++ err = skb->dst->output(skb);
++
++ if (likely(err == 0))
++ return err;
++ if (unlikely(err != NET_XMIT_BYPASS))
++ return err;
++ }
++}
++
++/* Input packet from network to transport. */
++static inline int dst_input(struct sk_buff *skb)
++{
++ int err;
++
++ for (;;) {
++ err = skb->dst->input(skb);
++
++ if (likely(err == 0))
++ return err;
++ /* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
++ if (unlikely(err != NET_XMIT_BYPASS))
++ return err;
++ }
++}
++
+ extern void dst_init(void);
+
++struct flowi;
++#ifndef CONFIG_XFRM
++static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
++ struct sock *sk, int flags)
++{
++ return 0;
++}
++#else
++extern int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
++ struct sock *sk, int flags);
++#endif
+ #endif
+
+ #endif /* _NET_DST_H */
+diff -Nru a/include/net/esp.h b/include/net/esp.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/net/esp.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,59 @@
++#ifndef _NET_ESP_H
++#define _NET_ESP_H
++
++#include <net/xfrm.h>
++#include <asm/scatterlist.h>
++
++#define ESP_NUM_FAST_SG 4
++
++struct esp_data
++{
++ struct scatterlist sgbuf[ESP_NUM_FAST_SG];
++
++ /* Confidentiality */
++ struct {
++ u8 *key; /* Key */
++ int key_len; /* Key length */
++ u8 *ivec; /* ivec buffer */
++ /* ivlen is offset from enc_data, where encrypted data start.
++ * It is logically different of crypto_tfm_alg_ivsize(tfm).
++ * We assume that it is either zero (no ivec), or
++ * >= crypto_tfm_alg_ivsize(tfm). */
++ int ivlen;
++ int padlen; /* 0..255 */
++ struct crypto_tfm *tfm; /* crypto handle */
++ } conf;
++
++ /* Integrity. It is active when icv_full_len != 0 */
++ struct {
++ u8 *key; /* Key */
++ int key_len; /* Length of the key */
++ u8 *work_icv;
++ int icv_full_len;
++ int icv_trunc_len;
++ void (*icv)(struct esp_data*,
++ struct sk_buff *skb,
++ int offset, int len, u8 *icv);
++ struct crypto_tfm *tfm;
++ } auth;
++};
++
++extern int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len);
++extern int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
++extern void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
++
++static inline void
++esp_hmac_digest(struct esp_data *esp, struct sk_buff *skb, int offset,
++ int len, u8 *auth_data)
++{
++ struct crypto_tfm *tfm = esp->auth.tfm;
++ char *icv = esp->auth.work_icv;
++
++ memset(auth_data, 0, esp->auth.icv_trunc_len);
++ crypto_hmac_init(tfm, esp->auth.key, &esp->auth.key_len);
++ skb_icv_walk(skb, tfm, offset, len, crypto_hmac_update);
++ crypto_hmac_final(tfm, esp->auth.key, &esp->auth.key_len, icv);
++ memcpy(auth_data, icv, esp->auth.icv_trunc_len);
++}
++
++#endif
+diff -Nru a/include/net/flow.h b/include/net/flow.h
+--- a/include/net/flow.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/flow.h 2005-02-13 21:25:09 +11:00
+@@ -1,24 +1,31 @@
+ /*
+ *
+- * Flow based forwarding rules (usage: firewalling, etc)
++ * Generic internet FLOW.
+ *
+ */
+
+ #ifndef _NET_FLOW_H
+ #define _NET_FLOW_H
+
++#include <linux/in6.h>
++#include <asm/atomic.h>
++
+ struct flowi {
+- int proto; /* {TCP, UDP, ICMP} */
++ int oif;
++ int iif;
+
+ union {
+ struct {
+ __u32 daddr;
+ __u32 saddr;
++ __u32 fwmark;
++ __u8 tos;
++ __u8 scope;
+ } ip4_u;
+
+ struct {
+- struct in6_addr * daddr;
+- struct in6_addr * saddr;
++ struct in6_addr daddr;
++ struct in6_addr saddr;
+ __u32 flowlabel;
+ } ip6_u;
+ } nl_u;
+@@ -27,9 +34,12 @@
+ #define fl6_flowlabel nl_u.ip6_u.flowlabel
+ #define fl4_dst nl_u.ip4_u.daddr
+ #define fl4_src nl_u.ip4_u.saddr
++#define fl4_fwmark nl_u.ip4_u.fwmark
++#define fl4_tos nl_u.ip4_u.tos
++#define fl4_scope nl_u.ip4_u.scope
+
+- int oif;
+-
++ __u8 proto;
++ __u8 flags;
+ union {
+ struct {
+ __u16 sport;
+@@ -41,61 +51,27 @@
+ __u8 code;
+ } icmpt;
+
+- unsigned long data;
++ __u32 spi;
+ } uli_u;
+-};
+-
+-#define FLOWR_NODECISION 0 /* rule not appliable to flow */
+-#define FLOWR_SELECT 1 /* flow must follow this rule */
+-#define FLOWR_CLEAR 2 /* priority level clears flow */
+-#define FLOWR_ERROR 3
+-
+-struct fl_acc_args {
+- int type;
+-
+-
+-#define FL_ARG_FORWARD 1
+-#define FL_ARG_ORIGIN 2
+-
+- union {
+- struct sk_buff *skb;
+- struct {
+- struct sock *sk;
+- struct flowi *flow;
+- } fl_o;
+- } fl_u;
+-};
+-
+-
+-struct pkt_filter {
+- atomic_t refcnt;
+- unsigned int offset;
+- __u32 value;
+- __u32 mask;
+- struct pkt_filter *next;
+-};
+-
+-#define FLR_INPUT 1
+-#define FLR_OUTPUT 2
+-
+-struct flow_filter {
+- int type;
+- union {
+- struct pkt_filter *filter;
+- struct sock *sk;
+- } u;
+-};
+-
+-struct flow_rule {
+- struct flow_rule_ops *ops;
+- unsigned char private[0];
+-};
+-
+-struct flow_rule_ops {
+- int (*accept)(struct rt6_info *rt,
+- struct rt6_info *rule,
+- struct fl_acc_args *args,
+- struct rt6_info **nrt);
+-};
++#define fl_ip_sport uli_u.ports.sport
++#define fl_ip_dport uli_u.ports.dport
++#define fl_icmp_type uli_u.icmpt.type
++#define fl_icmp_code uli_u.icmpt.code
++#define fl_ipsec_spi uli_u.spi
++
++ u32 __pad;
++} __attribute__((__aligned__(BITS_PER_LONG/8)));
++
++#define FLOW_DIR_IN 0
++#define FLOW_DIR_OUT 1
++#define FLOW_DIR_FWD 2
++
++typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir,
++ void **objp, atomic_t **obj_refp);
++
++extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
++ flow_resolve_t resolver);
++extern void flow_cache_flush(void);
++extern atomic_t flow_cache_genid;
+
+ #endif
+diff -Nru a/include/net/if_inet6.h b/include/net/if_inet6.h
+--- a/include/net/if_inet6.h 2005-02-13 21:25:10 +11:00
++++ b/include/net/if_inet6.h 2005-02-13 21:25:10 +11:00
+@@ -47,6 +47,12 @@
+ struct inet6_ifaddr *lst_next; /* next addr in addr_lst */
+ struct inet6_ifaddr *if_next; /* next addr in inet6_dev */
+
++#ifdef CONFIG_IPV6_PRIVACY
++ struct inet6_ifaddr *tmp_next; /* next addr in tempaddr_lst */
++ struct inet6_ifaddr *ifpub;
++ int regen_count;
++#endif
++
+ int dead;
+ };
+
+@@ -150,6 +156,15 @@
+ atomic_t refcnt;
+ __u32 if_flags;
+ int dead;
++
++#ifdef CONFIG_IPV6_PRIVACY
++ u8 rndid[8];
++ u8 entropy[8];
++ struct timer_list regen_timer;
++ struct inet6_ifaddr *tempaddr_list;
++ __u8 work_eui64[8];
++ __u8 work_digest[16];
++#endif
+
+ struct neigh_parms *nd_parms;
+ struct inet6_dev *next;
+diff -Nru a/include/net/inet_ecn.h b/include/net/inet_ecn.h
+--- a/include/net/inet_ecn.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/inet_ecn.h 2005-02-13 21:25:09 +11:00
+@@ -1,6 +1,8 @@
+ #ifndef _INET_ECN_H_
+ #define _INET_ECN_H_
+
++#include <linux/ip.h>
++
+ enum {
+ INET_ECN_NOT_ECT = 0,
+ INET_ECN_ECT_1 = 1,
+@@ -52,11 +54,21 @@
+ iph->tos |= 1;
+ }
+
++static inline void IP_ECN_clear(struct iphdr *iph)
++{
++ iph->tos &= ~3;
++}
++
+ struct ipv6hdr;
+
+ static inline void IP6_ECN_set_ce(struct ipv6hdr *iph)
+ {
+ *(u32*)iph |= htonl(1<<20);
++}
++
++static inline void IP6_ECN_clear(struct ipv6hdr *iph)
++{
++ *(u32*)iph &= ~htonl(3<<20);
+ }
+
+ #define ip6_get_dsfield(iph) ((ntohs(*(u16*)(iph)) >> 4) & 0xFF)
+diff -Nru a/include/net/ip.h b/include/net/ip.h
+--- a/include/net/ip.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ip.h 2005-02-13 21:25:09 +11:00
+@@ -29,6 +29,7 @@
+ #include <linux/netdevice.h>
+ #include <linux/inetdevice.h>
+ #include <linux/in_route.h>
++#include <linux/sysctl.h>
+ #include <net/route.h>
+ #include <net/arp.h>
+
+@@ -46,6 +47,7 @@
+ #define IPSKB_MASQUERADED 1
+ #define IPSKB_TRANSLATED 2
+ #define IPSKB_FORWARDED 4
++#define IPSKB_XFRM_TUNNEL_SIZE 8
+ };
+
+ struct ipcm_cookie
+@@ -98,17 +100,19 @@
+ extern void ip_send_check(struct iphdr *ip);
+ extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok);
+ extern void ip_init(void);
+-extern int ip_build_xmit(struct sock *sk,
+- int getfrag (const void *,
+- char *,
+- unsigned int,
+- unsigned int,
+- struct sk_buff *),
+- const void *frag,
+- unsigned length,
+- struct ipcm_cookie *ipc,
+- struct rtable *rt,
+- int flags);
++extern int ip_append_data(struct sock *sk,
++ int getfrag(void *from, char *to, int offset, int len,
++ int odd, struct sk_buff *skb),
++ void *from, int len, int protolen,
++ struct ipcm_cookie *ipc,
++ struct rtable *rt,
++ unsigned int flags);
++extern int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
++extern ssize_t ip_append_page(struct sock *sk, struct page *page,
++ int offset, size_t size, int flags);
++extern int ip_push_pending_frames(struct sock *sk);
++extern void ip_flush_pending_frames(struct sock *sk);
++
+
+ /*
+ * Map a multicast IP onto multicast MAC for type Token Ring.
+@@ -128,8 +132,7 @@
+ }
+
+ struct ip_reply_arg {
+- struct iovec iov[2];
+- int n_iov; /* redundant */
++ struct iovec iov[1];
+ u32 csum;
+ int csumoffset; /* u16 offset of csum in iov[0].iov_base */
+ /* -1 if not needed */
+@@ -161,14 +164,6 @@
+ extern int sysctl_ip_default_ttl;
+
+ #ifdef CONFIG_INET
+-static inline int ip_send(struct sk_buff *skb)
+-{
+- if (skb->len > skb->dst->pmtu)
+- return ip_fragment(skb, ip_finish_output);
+- else
+- return ip_finish_output(skb);
+-}
+-
+ /* The function in 2.2 was invalid, producing wrong result for
+ * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
+ static inline
+@@ -185,7 +180,7 @@
+ {
+ return (sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_DO ||
+ (sk->protinfo.af_inet.pmtudisc == IP_PMTUDISC_WANT &&
+- !(dst->mxlock&(1<<RTAX_MTU))));
++ !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU))));
+ }
+
+ extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst);
+@@ -268,5 +263,16 @@
+ u16 port, u32 info, u8 *payload);
+ extern void ip_local_error(struct sock *sk, int err, u32 daddr, u16 dport,
+ u32 info);
++
++/* sysctl helpers - any sysctl which holds a value that ends up being
++ * fed into the routing cache should use these handlers.
++ */
++int ipv4_doint_and_flush(ctl_table *ctl, int write,
++ struct file* filp, void *buffer,
++ size_t *lenp);
++int ipv4_doint_and_flush_strategy(ctl_table *table, int *name, int nlen,
++ void *oldval, size_t *oldlenp,
++ void *newval, size_t newlen,
++ void **context);
+
+ #endif /* _IP_H */
+diff -Nru a/include/net/ip6_fib.h b/include/net/ip6_fib.h
+--- a/include/net/ip6_fib.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ip6_fib.h 2005-02-13 21:25:09 +11:00
+@@ -67,17 +67,8 @@
+
+ u32 rt6i_flags;
+ u32 rt6i_metric;
+- u8 rt6i_hoplimit;
+ atomic_t rt6i_ref;
+
+- union {
+- struct flow_rule *rt6iu_flowr;
+- struct flow_filter *rt6iu_filter;
+- } flow_u;
+-
+-#define rt6i_flowr flow_u.rt6iu_flowr
+-#define rt6i_filter flow_u.rt6iu_filter
+-
+ struct rt6key rt6i_dst;
+ struct rt6key rt6i_src;
+
+@@ -171,10 +162,12 @@
+
+ extern int fib6_add(struct fib6_node *root,
+ struct rt6_info *rt,
+- struct nlmsghdr *nlh);
++ struct nlmsghdr *nlh,
++ void *rtattr);
+
+ extern int fib6_del(struct rt6_info *rt,
+- struct nlmsghdr *nlh);
++ struct nlmsghdr *nlh,
++ void *rtattr);
+
+ extern void inet6_rt_notify(int event, struct rt6_info *rt,
+ struct nlmsghdr *nlh);
+diff -Nru a/include/net/ip6_fw.h b/include/net/ip6_fw.h
+--- a/include/net/ip6_fw.h 2005-02-13 21:25:09 +11:00
++++ /dev/null Wed Dec 31 16:00:00 196900
+@@ -1,54 +0,0 @@
+-#ifndef __NET_IP6_FW_H
+-#define __NET_IP6_FW_H
+-
+-#define IP6_FW_LISTHEAD 0x1000
+-#define IP6_FW_ACCEPT 0x0001
+-#define IP6_FW_REJECT 0x0002
+-
+-#define IP6_FW_DEBUG 2
+-
+-#define IP6_FW_MSG_ADD 1
+-#define IP6_FW_MSG_DEL 2
+-#define IP6_FW_MSG_REPORT 3
+-
+-
+-/*
+- * Fast "hack" user interface
+- */
+-struct ip6_fw_msg {
+- struct in6_addr dst;
+- struct in6_addr src;
+- int dst_len;
+- int src_len;
+- int action;
+- int policy;
+- int proto;
+- union {
+- struct {
+- __u16 sport;
+- __u16 dport;
+- } transp;
+-
+- unsigned long data;
+-
+- int icmp_type;
+- } u;
+-
+- int msg_len;
+-};
+-
+-#ifdef __KERNEL__
+-
+-#include <net/flow.h>
+-
+-struct ip6_fw_rule {
+- struct flow_rule flowr;
+- struct ip6_fw_rule *next;
+- struct ip6_fw_rule *prev;
+- struct flowi info;
+- unsigned long policy;
+-};
+-
+-#endif
+-
+-#endif
+diff -Nru a/include/net/ip6_route.h b/include/net/ip6_route.h
+--- a/include/net/ip6_route.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ip6_route.h 2005-02-13 21:25:09 +11:00
+@@ -39,12 +39,15 @@
+ extern int ipv6_route_ioctl(unsigned int cmd, void *arg);
+
+ extern int ip6_route_add(struct in6_rtmsg *rtmsg,
+- struct nlmsghdr *);
++ struct nlmsghdr *,
++ void *rtattr);
+ extern int ip6_del_rt(struct rt6_info *,
+- struct nlmsghdr *);
++ struct nlmsghdr *,
++ void *rtattr);
+
+ extern int ip6_rt_addr_add(struct in6_addr *addr,
+- struct net_device *dev);
++ struct net_device *dev,
++ int anycast);
+
+ extern int ip6_rt_addr_del(struct in6_addr *addr,
+ struct net_device *dev);
+@@ -60,6 +63,12 @@
+ struct in6_addr *saddr,
+ int oif, int flags);
+
++extern struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
++ struct neighbour *neigh,
++ int (*output)(struct sk_buff *));
++extern int ndisc_dst_gc(int *more);
++extern void fib6_force_start_gc(void);
++
+ /*
+ * support functions for ND
+ *
+@@ -109,6 +118,13 @@
+ np->daddr_cache = daddr;
+ np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ write_unlock(&sk->dst_lock);
++}
++
++static inline int ipv6_unicast_destination(struct sk_buff *skb)
++{
++ struct rt6_info *rt = (struct rt6_info *) skb->dst;
++
++ return rt->rt6i_flags & RTF_LOCAL;
+ }
+
+ #endif
+diff -Nru a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/net/ip6_tunnel.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,46 @@
++/*
++ * $Id$
++ */
++
++#ifndef _NET_IP6_TUNNEL_H
++#define _NET_IP6_TUNNEL_H
++
++#include <linux/ipv6.h>
++#include <linux/netdevice.h>
++#include <linux/ip6_tunnel.h>
++
++/* capable of sending packets */
++#define IP6_TNL_F_CAP_XMIT 0x10000
++/* capable of receiving packets */
++#define IP6_TNL_F_CAP_RCV 0x20000
++
++#define IP6_TNL_MAX 128
++
++/* IPv6 tunnel */
++
++struct ip6_tnl {
++ struct ip6_tnl *next; /* next tunnel in list */
++ struct net_device *dev; /* virtual device associated with tunnel */
++ struct net_device_stats stat; /* statistics for tunnel device */
++ int recursion; /* depth of hard_start_xmit recursion */
++ struct ip6_tnl_parm parms; /* tunnel configuration paramters */
++ struct flowi fl; /* flowi template for xmit */
++ struct dst_entry *dst_cache; /* cached dst */
++ u32 dst_cookie;
++};
++
++/* Tunnel encapsulation limit destination sub-option */
++
++struct ipv6_tlv_tnl_enc_lim {
++ __u8 type; /* type-code for option */
++ __u8 length; /* option length */
++ __u8 encap_limit; /* tunnel encapsulation limit */
++} __attribute__ ((packed));
++
++#ifdef __KERNEL__
++#ifdef CONFIG_IPV6_TUNNEL
++extern int __init ip6_tunnel_init(void);
++extern void ip6_tunnel_cleanup(void);
++#endif
++#endif
++#endif
+diff -Nru a/include/net/ip_fib.h b/include/net/ip_fib.h
+--- a/include/net/ip_fib.h 2005-02-13 21:25:10 +11:00
++++ b/include/net/ip_fib.h 2005-02-13 21:25:10 +11:00
+@@ -17,7 +17,11 @@
+ #define _NET_IP_FIB_H
+
+ #include <linux/config.h>
++#include <net/flow.h>
+
++/* WARNING: The ordering of these elements must match ordering
++ * of RTA_* rtnetlink attribute numbers.
++ */
+ struct kern_rta
+ {
+ void *rta_dst;
+@@ -30,8 +34,9 @@
+ struct rtattr *rta_mx;
+ struct rtattr *rta_mp;
+ unsigned char *rta_protoinfo;
+- unsigned char *rta_flow;
++ u32 *rta_flow;
+ struct rta_cacheinfo *rta_ci;
++ struct rta_session *rta_sess;
+ };
+
+ struct fib_nh
+@@ -65,7 +70,7 @@
+ int fib_protocol;
+ u32 fib_prefsrc;
+ u32 fib_priority;
+- unsigned fib_metrics[RTAX_MAX];
++ u32 fib_metrics[RTAX_MAX];
+ #define fib_mtu fib_metrics[RTAX_MTU-1]
+ #define fib_window fib_metrics[RTAX_WINDOW-1]
+ #define fib_rtt fib_metrics[RTAX_RTT-1]
+@@ -117,7 +122,7 @@
+ {
+ unsigned char tb_id;
+ unsigned tb_stamp;
+- int (*tb_lookup)(struct fib_table *tb, const struct rt_key *key, struct fib_result *res);
++ int (*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res);
+ int (*tb_insert)(struct fib_table *table, struct rtmsg *r,
+ struct kern_rta *rta, struct nlmsghdr *n,
+ struct netlink_skb_parms *req);
+@@ -130,7 +135,7 @@
+ int (*tb_get_info)(struct fib_table *table, char *buf,
+ int first, int count);
+ void (*tb_select_default)(struct fib_table *table,
+- const struct rt_key *key, struct fib_result *res);
++ const struct flowi *flp, struct fib_result *res);
+
+ unsigned char tb_data[0];
+ };
+@@ -152,18 +157,18 @@
+ return fib_get_table(id);
+ }
+
+-static inline int fib_lookup(const struct rt_key *key, struct fib_result *res)
++static inline int fib_lookup(const struct flowi *flp, struct fib_result *res)
+ {
+- if (local_table->tb_lookup(local_table, key, res) &&
+- main_table->tb_lookup(main_table, key, res))
++ if (local_table->tb_lookup(local_table, flp, res) &&
++ main_table->tb_lookup(main_table, flp, res))
+ return -ENETUNREACH;
+ return 0;
+ }
+
+-static inline void fib_select_default(const struct rt_key *key, struct fib_result *res)
++static inline void fib_select_default(const struct flowi *flp, struct fib_result *res)
+ {
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+- main_table->tb_select_default(main_table, key, res);
++ main_table->tb_select_default(main_table, flp, res);
+ }
+
+ #else /* CONFIG_IP_MULTIPLE_TABLES */
+@@ -171,7 +176,7 @@
+ #define main_table (fib_tables[RT_TABLE_MAIN])
+
+ extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
+-extern int fib_lookup(const struct rt_key *key, struct fib_result *res);
++extern int fib_lookup(const struct flowi *flp, struct fib_result *res);
+ extern struct fib_table *__fib_new_table(int id);
+ extern void fib_rule_put(struct fib_rule *r);
+
+@@ -191,7 +196,7 @@
+ return fib_tables[id] ? : __fib_new_table(id);
+ }
+
+-extern void fib_select_default(const struct rt_key *key, struct fib_result *res);
++extern void fib_select_default(const struct flowi *flp, struct fib_result *res);
+
+ #endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+@@ -204,13 +209,13 @@
+ extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb);
+ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
+ struct net_device *dev, u32 *spec_dst, u32 *itag);
+-extern void fib_select_multipath(const struct rt_key *key, struct fib_result *res);
++extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
+
+ /* Exported by fib_semantics.c */
+ extern int ip_fib_check_default(u32 gw, struct net_device *dev);
+ extern void fib_release_info(struct fib_info *);
+ extern int fib_semantic_match(int type, struct fib_info *,
+- const struct rt_key *, struct fib_result*);
++ const struct flowi *, struct fib_result*);
+ extern struct fib_info *fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
+ const struct nlmsghdr *, int *err);
+ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, struct kern_rta *rta, struct fib_info *fi);
+diff -Nru a/include/net/ip_vs.h b/include/net/ip_vs.h
+--- a/include/net/ip_vs.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ip_vs.h 2005-02-13 21:25:09 +11:00
+@@ -281,6 +281,13 @@
+ #define LeaveFunction(level) do {} while (0)
+ #endif
+
++#define IP_VS_XMIT(skb, rt) \
++do { \
++ skb->nfcache |= NFC_IPVS_PROPERTY; \
++ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
++ (rt)->u.dst.dev, dst_output); \
++} while (0)
++
+
+ /*
+ * The port number of FTP service (in network order).
+@@ -864,7 +871,16 @@
+ spin_lock(&dest->dst_lock);
+ if (!(rt = (struct rtable *)
+ __ip_vs_dst_check(dest, rtos, 0))) {
+- if (ip_route_output(&rt, dest->addr, 0, rtos, 0)) {
++ struct flowi fl = {
++ .oif = 0,
++ .nl_u = {
++ .ip4_u = {
++ .daddr = dest->addr,
++ .saddr = 0,
++ .tos = rtos, } },
++ };
++
++ if (ip_route_output_key(&rt, &fl)) {
+ spin_unlock(&dest->dst_lock);
+ IP_VS_DBG_RL("ip_route_output error, "
+ "dest: %u.%u.%u.%u\n",
+@@ -878,7 +894,16 @@
+ }
+ spin_unlock(&dest->dst_lock);
+ } else {
+- if (ip_route_output(&rt, cp->daddr, 0, rtos, 0)) {
++ struct flowi fl = {
++ .oif = 0,
++ .nl_u = {
++ .ip4_u = {
++ .daddr = cp->daddr,
++ .saddr = 0,
++ .tos = rtos, } },
++ };
++
++ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_route_output error, dest: "
+ "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
+ return NULL;
+diff -Nru a/include/net/ipcomp.h b/include/net/ipcomp.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/net/ipcomp.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,12 @@
++#ifndef _NET_IPCOMP_H
++#define _NET_IPCOMP_H
++
++#define IPCOMP_SCRATCH_SIZE 65400
++
++struct ipcomp_data {
++ u16 threshold;
++ u8 *scratch;
++ struct crypto_tfm *tfm;
++};
++
++#endif
+diff -Nru a/include/net/ipip.h b/include/net/ipip.h
+--- a/include/net/ipip.h 2005-02-13 21:25:10 +11:00
++++ b/include/net/ipip.h 2005-02-13 21:25:10 +11:00
+@@ -34,7 +34,7 @@
+ ip_select_ident(iph, &rt->u.dst, NULL); \
+ ip_send_check(iph); \
+ \
+- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, do_ip_send); \
++ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output);\
+ if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { \
+ stats->tx_bytes += pkt_len; \
+ stats->tx_packets++; \
+diff -Nru a/include/net/ipv6.h b/include/net/ipv6.h
+--- a/include/net/ipv6.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ipv6.h 2005-02-13 21:25:09 +11:00
+@@ -22,6 +22,8 @@
+
+ #define SIN6_LEN_RFC2133 24
+
++#define IPV6_MAXPLEN 65535
++
+ /*
+ * NextHeader field of IPv6 header
+ */
+@@ -48,7 +50,7 @@
+ /*
+ * Addr type
+ *
+- * type - unicast | multicast | anycast
++ * type - unicast | multicast
+ * scope - local | site | global
+ * v4 - compat
+ * v4mapped
+@@ -60,7 +62,6 @@
+
+ #define IPV6_ADDR_UNICAST 0x0001U
+ #define IPV6_ADDR_MULTICAST 0x0002U
+-#define IPV6_ADDR_ANYCAST 0x0004U
+
+ #define IPV6_ADDR_LOOPBACK 0x0010U
+ #define IPV6_ADDR_LINKLOCAL 0x0020U
+@@ -98,6 +99,8 @@
+ __u32 identification;
+ };
+
++#define IP6_MF 0x0001
++
+ #ifdef __KERNEL__
+
+ #include <net/sock.h>
+@@ -199,12 +202,8 @@
+
+ extern int ip6_call_ra_chain(struct sk_buff *skb, int sel);
+
+-extern int ipv6_reassembly(struct sk_buff **skb, int);
+-
+ extern int ipv6_parse_hopopts(struct sk_buff *skb, int);
+
+-extern int ipv6_parse_exthdrs(struct sk_buff **skb, int);
+-
+ extern struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt);
+
+ extern int ip6_frag_nqueues;
+@@ -239,6 +238,23 @@
+ memcpy((void *) a1, (const void *) a2, sizeof(struct in6_addr));
+ }
+
++static inline void ipv6_addr_prefix(struct in6_addr *pfx,
++ const struct in6_addr *addr,
++ int plen)
++{
++ /* caller must guarantee 0 <= plen <= 128 */
++ int o = plen >> 3,
++ b = plen & 0x7;
++
++ memcpy(pfx->s6_addr, addr, o);
++ if (b != 0) {
++ pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b);
++ o++;
++ }
++ if (o < 16)
++ memset(pfx->s6_addr + o, 0, 16 - o);
++}
++
+ #ifndef __HAVE_ARCH_ADDR_SET
+ static inline void ipv6_addr_set(struct in6_addr *addr,
+ __u32 w1, __u32 w2,
+@@ -291,6 +307,26 @@
+ unsigned length,
+ struct ipv6_txoptions *opt,
+ int hlimit, int flags);
++extern int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
++
++extern int ip6_append_data(struct sock *sk,
++ int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
++ void *from,
++ int length,
++ int transhdrlen,
++ int hlimit,
++ struct ipv6_txoptions *opt,
++ struct flowi *fl,
++ struct rt6_info *rt,
++ unsigned int flags);
++
++extern int ip6_push_pending_frames(struct sock *sk);
++
++extern void ip6_flush_pending_frames(struct sock *sk);
++
++extern int ip6_dst_lookup(struct sock *sk,
++ struct dst_entry **dst,
++ struct flowi *fl);
+
+ /*
+ * skb processing functions
+diff -Nru a/include/net/ndisc.h b/include/net/ndisc.h
+--- a/include/net/ndisc.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/ndisc.h 2005-02-13 21:25:09 +11:00
+@@ -56,20 +56,6 @@
+ __u8 nd_opt_len;
+ } __attribute__((__packed__));
+
+-struct ndisc_options {
+- struct nd_opt_hdr *nd_opt_array[7];
+- struct nd_opt_hdr *nd_opt_piend;
+-};
+-
+-#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
+-#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR]
+-#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO]
+-#define nd_opts_pi_end nd_opt_piend
+-#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR]
+-#define nd_opts_mtu nd_opt_array[ND_OPT_MTU]
+-
+-extern struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end);
+-extern struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, struct ndisc_options *ndopts);
+
+ extern int ndisc_init(struct net_proto_family *ops);
+
+diff -Nru a/include/net/protocol.h b/include/net/protocol.h
+--- a/include/net/protocol.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/protocol.h 2005-02-13 21:25:09 +11:00
+@@ -30,7 +30,7 @@
+ #include <linux/ipv6.h>
+ #endif
+
+-#define MAX_INET_PROTOS 32 /* Must be a power of 2 */
++#define MAX_INET_PROTOS 256 /* Must be a power of 2 */
+
+
+ /* This is used to register protocols. */
+@@ -38,29 +38,23 @@
+ {
+ int (*handler)(struct sk_buff *skb);
+ void (*err_handler)(struct sk_buff *skb, u32 info);
+- struct inet_protocol *next;
+- unsigned char protocol;
+- unsigned char copy:1;
+- void *data;
+- const char *name;
++ int no_policy;
+ };
+
+ #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ struct inet6_protocol
+ {
+- int (*handler)(struct sk_buff *skb);
++ int (*handler)(struct sk_buff **skb, unsigned int *nhoffp);
+
+ void (*err_handler)(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ int type, int code, int offset,
+ __u32 info);
+- struct inet6_protocol *next;
+- unsigned char protocol;
+- unsigned char copy:1;
+- void *data;
+- const char *name;
++ unsigned int flags; /* INET6_PROTO_xxx */
+ };
+
++#define INET6_PROTO_NOPOLICY 0x1
++#define INET6_PROTO_FINAL 0x2
+ #endif
+
+ /* This is used to register socket interfaces for IP protocols. */
+@@ -93,14 +87,14 @@
+ extern struct list_head inetsw6[SOCK_MAX];
+ #endif
+
+-extern void inet_add_protocol(struct inet_protocol *prot);
+-extern int inet_del_protocol(struct inet_protocol *prot);
++extern int inet_add_protocol(struct inet_protocol *prot, unsigned char num);
++extern int inet_del_protocol(struct inet_protocol *prot, unsigned char num);
+ extern void inet_register_protosw(struct inet_protosw *p);
+ extern void inet_unregister_protosw(struct inet_protosw *p);
+
+ #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+-extern void inet6_add_protocol(struct inet6_protocol *prot);
+-extern int inet6_del_protocol(struct inet6_protocol *prot);
++extern int inet6_add_protocol(struct inet6_protocol *prot, unsigned char num);
++extern int inet6_del_protocol(struct inet6_protocol *prot, unsigned char num);
+ extern void inet6_register_protosw(struct inet_protosw *p);
+ extern void inet6_unregister_protosw(struct inet_protosw *p);
+ #endif
+diff -Nru a/include/net/raw.h b/include/net/raw.h
+--- a/include/net/raw.h 2005-02-13 21:25:10 +11:00
++++ b/include/net/raw.h 2005-02-13 21:25:10 +11:00
+@@ -37,6 +37,6 @@
+ unsigned long raddr, unsigned long laddr,
+ int dif);
+
+-extern struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
++extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
+
+ #endif /* _RAW_H */
+diff -Nru a/include/net/rawv6.h b/include/net/rawv6.h
+--- a/include/net/rawv6.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/rawv6.h 2005-02-13 21:25:09 +11:00
+@@ -7,9 +7,7 @@
+ extern struct sock *raw_v6_htable[RAWV6_HTABLE_SIZE];
+ extern rwlock_t raw_v6_lock;
+
+-extern struct sock * ipv6_raw_deliver(struct sk_buff *skb,
+- int nexthdr);
+-
++extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr);
+
+ extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
+ struct in6_addr *loc_addr, struct in6_addr *rmt_addr);
+diff -Nru a/include/net/route.h b/include/net/route.h
+--- a/include/net/route.h 2005-02-13 21:25:08 +11:00
++++ b/include/net/route.h 2005-02-13 21:25:08 +11:00
+@@ -27,6 +27,7 @@
+ #include <linux/config.h>
+ #include <net/dst.h>
+ #include <net/inetpeer.h>
++#include <net/flow.h>
+ #include <linux/in_route.h>
+ #include <linux/rtnetlink.h>
+ #include <linux/route.h>
+@@ -45,19 +46,6 @@
+
+ #define RT_CONN_FLAGS(sk) (RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute)
+
+-struct rt_key
+-{
+- __u32 dst;
+- __u32 src;
+- int iif;
+- int oif;
+-#ifdef CONFIG_IP_ROUTE_FWMARK
+- __u32 fwmark;
+-#endif
+- __u8 tos;
+- __u8 scope;
+-};
+-
+ struct inet_peer;
+ struct rtable
+ {
+@@ -78,7 +66,7 @@
+ __u32 rt_gateway;
+
+ /* Cache lookup keys */
+- struct rt_key key;
++ struct flowi fl;
+
+ /* Miscellaneous cached information */
+ __u32 rt_spec_dst; /* RFC1122 specific destination */
+@@ -126,10 +114,11 @@
+ u32 src, u8 tos, struct net_device *dev);
+ extern void ip_rt_advice(struct rtable **rp, int advice);
+ extern void rt_cache_flush(int how);
+-extern int ip_route_output_key(struct rtable **, const struct rt_key *key);
++extern int __ip_route_output_key(struct rtable **, const struct flowi *flp);
++extern int ip_route_output_key(struct rtable **, struct flowi *flp);
++extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
+ extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin);
+ extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu);
+-extern void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu);
+ extern void ip_rt_send_redirect(struct sk_buff *skb);
+
+ extern unsigned inet_addr_type(u32 addr);
+@@ -138,16 +127,6 @@
+ extern void ip_rt_get_source(u8 *src, struct rtable *rt);
+ extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+-/* Deprecated: use ip_route_output_key directly */
+-static inline int ip_route_output(struct rtable **rp,
+- u32 daddr, u32 saddr, u32 tos, int oif)
+-{
+- struct rt_key key = { dst:daddr, src:saddr, oif:oif, tos:tos };
+-
+- return ip_route_output_key(rp, &key);
+-}
+-
+-
+ static inline void ip_rt_put(struct rtable * rt)
+ {
+ if (rt)
+@@ -163,17 +142,47 @@
+ return ip_tos2prio[IPTOS_TOS(tos)>>1];
+ }
+
+-static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif)
+-{
++static inline int ip_route_connect(struct rtable **rp, u32 dst,
++ u32 src, u32 tos, int oif, u8 protocol,
++ u16 sport, u16 dport, struct sock *sk)
++{
++ struct flowi fl = { .oif = oif,
++ .nl_u = { .ip4_u = { .daddr = dst,
++ .saddr = src,
++ .tos = tos } },
++ .proto = protocol,
++ .uli_u = { .ports =
++ { .sport = sport,
++ .dport = dport } } };
++
+ int err;
+- err = ip_route_output(rp, dst, src, tos, oif);
+- if (err || (dst && src))
+- return err;
+- dst = (*rp)->rt_dst;
+- src = (*rp)->rt_src;
+- ip_rt_put(*rp);
+- *rp = NULL;
+- return ip_route_output(rp, dst, src, tos, oif);
++ if (!dst || !src) {
++ err = __ip_route_output_key(rp, &fl);
++ if (err)
++ return err;
++ fl.fl4_dst = (*rp)->rt_dst;
++ fl.fl4_src = (*rp)->rt_src;
++ ip_rt_put(*rp);
++ *rp = NULL;
++ }
++ return ip_route_output_flow(rp, &fl, sk, 0);
++}
++
++static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport,
++ struct sock *sk)
++{
++ if (sport != (*rp)->fl.fl_ip_sport ||
++ dport != (*rp)->fl.fl_ip_dport) {
++ struct flowi fl;
++
++ memcpy(&fl, &(*rp)->fl, sizeof(fl));
++ fl.fl_ip_sport = sport;
++ fl.fl_ip_dport = dport;
++ ip_rt_put(*rp);
++ *rp = NULL;
++ return ip_route_output_flow(rp, &fl, sk, 0);
++ }
++ return 0;
+ }
+
+ extern void rt_bind_peer(struct rtable *rt, int create);
+diff -Nru a/include/net/sctp/compat.h b/include/net/sctp/compat.h
+--- a/include/net/sctp/compat.h 2005-02-13 21:25:08 +11:00
++++ b/include/net/sctp/compat.h 2005-02-13 21:25:08 +11:00
+@@ -55,14 +55,10 @@
+ extern type name[]
+ #define SNMP_DEC_STATS(mib, field) ((mib)[2*smp_processor_id()+!in_softirq()].field--)
+
+-#define inet_sk(__sk) (&(((struct sock *)__sk)->protinfo.af_inet))
+-#define inet6_sk(__sk) (&(((struct sock *)__sk)->net_pinfo.af_inet6))
+-
+ #define virt_addr_valid(x) VALID_PAGE(virt_to_page((x)))
+ #define sock_owned_by_user(sk) ((sk)->lock.users)
+ #define sk_set_owner(x, y)
+ #define __unsafe(x)
+-#define dst_pmtu(x) ((x)->pmtu)
+
+ /*
+ * find last bit set.
+diff -Nru a/include/net/sock.h b/include/net/sock.h
+--- a/include/net/sock.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/sock.h 2005-02-13 21:25:09 +11:00
+@@ -45,6 +45,8 @@
+ #include <net/if_inet6.h> /* struct ipv6_mc_socklist */
+ #endif
+
++#include <net/flow.h>
++
+ #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
+ #include <linux/icmp.h>
+ #endif
+@@ -184,6 +186,12 @@
+
+ struct ipv6_txoptions *opt;
+ struct sk_buff *pktoptions;
++ struct {
++ struct ipv6_txoptions *opt;
++ struct rt6_info *rt;
++ struct flowi fl;
++ int hop_limit;
++ } cork;
+ };
+
+ struct raw6_opt {
+@@ -210,7 +218,7 @@
+ #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
+ struct inet_opt
+ {
+- int ttl; /* TTL setting */
++ int uc_ttl; /* Unicast TTL */
+ int tos; /* TOS */
+ unsigned cmsg_flags;
+ struct ip_options *opt;
+@@ -224,7 +232,24 @@
+ int mc_index; /* Multicast device index */
+ __u32 mc_addr;
+ struct ip_mc_socklist *mc_list; /* Group array */
++ struct page *sndmsg_page; /* Cached page for sendmsg */
++ u32 sndmsg_off; /* Cached offset for sendmsg */
++ /*
++ * Following members are used to retain the infomation to build
++ * an ip header on each ip fragmentation while the socket is corked.
++ */
++ struct {
++ unsigned int flags;
++ unsigned int fragsize;
++ struct ip_options *opt;
++ struct rtable *rt;
++ int length; /* Total length of all frames */
++ u32 addr;
++ } cork;
+ };
++
++#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
++
+ #endif
+
+ #if defined(CONFIG_PPPOE) || defined (CONFIG_PPPOE_MODULE)
+@@ -250,6 +275,14 @@
+ #define pppoe_relay proto.pppoe.relay
+ #endif
+
++#if defined(CONFIG_NET_KEY) || defined(CONFIG_NET_KEY_MODULE)
++struct pfkey_opt {
++ int registered;
++ int promisc;
++};
++#define pfkey_sk(__sk) ((__sk)->protinfo.pf_key)
++#endif
++
+ /* This defines a selective acknowledgement block. */
+ struct tcp_sack_block {
+ __u32 start_seq;
+@@ -314,6 +347,7 @@
+ __u16 mss_cache; /* Cached effective mss, not including SACKS */
+ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
+ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
++ __u16 ext2_header_len;/* Options depending on route */
+ __u8 ca_state; /* State of fast-retransmit machine */
+ __u8 retransmits; /* Number of unrecovered RTO timeouts. */
+
+@@ -354,8 +388,6 @@
+
+ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */
+ struct sk_buff *send_head; /* Front of stuff to transmit */
+- struct page *sndmsg_page; /* Cached page for sendmsg */
+- u32 sndmsg_off; /* Cached offset for sendmsg */
+
+ __u32 rcv_wnd; /* Current receiver window */
+ __u32 rcv_wup; /* rcv_nxt on last window update sent */
+@@ -488,6 +520,20 @@
+ } bictcp;
+ };
+
++struct udp_opt {
++ int pending; /* Any pending frames ? */
++ unsigned int corkflag; /* Cork is required */
++ __u16 encap_type; /* Is this an Encapsulation socket? */
++ /*
++ * Following members retains the infomation to create a UDP header
++ * when the socket is uncorked.
++ */
++ u32 saddr; /* source address */
++ u32 daddr; /* destination address */
++ __u16 sport; /* source port */
++ __u16 dport; /* destination port */
++ __u16 len; /* total length of pending frames */
++};
+
+ /*
+ * This structure really needs to be cleaned up.
+@@ -583,6 +629,7 @@
+ wait_queue_head_t *sleep; /* Sock wait queue */
+ struct dst_entry *dst_cache; /* Destination cache */
+ rwlock_t dst_lock;
++ struct xfrm_policy *policy[2];
+ atomic_t rmem_alloc; /* Receive queue bytes committed */
+ struct sk_buff_head receive_queue; /* Incoming packets */
+ atomic_t wmem_alloc; /* Transmit queue bytes committed */
+@@ -639,10 +686,12 @@
+ union {
+ struct ipv6_pinfo af_inet6;
+ } net_pinfo;
++#define inet6_sk(sk) (&(sk)->net_pinfo.af_inet6)
+ #endif
+
+ union {
+ struct tcp_opt af_tcp;
++ struct udp_opt af_udp;
+ #if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE)
+ struct sctp_opt af_sctp;
+ #endif
+@@ -657,6 +706,10 @@
+ #endif /* CONFIG_SPX */
+
+ } tp_pinfo;
++#define tcp_sk(sk) (&(sk)->tp_pinfo.af_tcp)
++#define udp_sk(sk) (&(sk)->tp_pinfo.af_udp)
++#define raw_sk(sk) (&(sk)->tp_pinfo.tp_raw4)
++#define raw6_sk(sk) (&(sk)->tp_pinfo.tp_raw)
+
+ int err, err_soft; /* Soft holds errors that don't
+ cause failure but are the cause
+@@ -727,8 +780,11 @@
+ #if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE)
+ struct wanpipe_opt *af_wanpipe;
+ #endif
++#if defined(CONFIG_NET_KEY) || defined(CONFIG_NET_KEY_MODULE)
++ struct pfkey_opt *pf_key;
++#endif
+ } protinfo;
+-
++#define inet_sk(sk) (&(sk)->protinfo.af_inet)
+
+ /* This part is used for the timeout functions. */
+ struct timer_list timer; /* This is the sock cleanup timer. */
+@@ -792,6 +848,8 @@
+ int (*recvmsg)(struct sock *sk, struct msghdr *msg,
+ int len, int noblock, int flags,
+ int *addr_len);
++ int (*sendpage)(struct sock *sk, struct page *page,
++ int offset, size_t size, int flags);
+ int (*bind)(struct sock *sk,
+ struct sockaddr *uaddr, int addr_len);
+
+diff -Nru a/include/net/tcp.h b/include/net/tcp.h
+--- a/include/net/tcp.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/tcp.h 2005-02-13 21:25:09 +11:00
+@@ -575,13 +575,6 @@
+ /*
+ * Pointers to address related TCP functions
+ * (i.e. things that depend on the address family)
+- *
+- * BUGGG_FUTURE: all the idea behind this struct is wrong.
+- * It mixes socket frontend with transport function.
+- * With port sharing between IPv6/v4 it gives the only advantage,
+- * only poor IPv6 needs to permanently recheck, that it
+- * is still IPv6 8)8) It must be cleaned up as soon as possible.
+- * --ANK (980802)
+ */
+
+ struct tcp_func {
+@@ -940,9 +933,12 @@
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss_now = tp->mss_cache;
+
+- if (dst && dst->pmtu != tp->pmtu_cookie)
+- mss_now = tcp_sync_mss(sk, dst->pmtu);
+-
++ if (dst) {
++ u32 mtu = dst_pmtu(dst);
++ if (mtu != tp->pmtu_cookie ||
++ tp->ext2_header_len != dst->header_len)
++ mss_now = tcp_sync_mss(sk, mtu);
++ }
+ if (tp->eff_sacks)
+ mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
+@@ -1257,7 +1253,7 @@
+ }
+ }
+
+-extern __u32 tcp_init_cwnd(struct tcp_opt *tp);
++extern __u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst);
+
+ /* Slow start with delack produces 3 packets of burst, so that
+ * it is safe "de facto".
+diff -Nru a/include/net/transp_v6.h b/include/net/transp_v6.h
+--- a/include/net/transp_v6.h 2005-02-13 21:25:09 +11:00
++++ b/include/net/transp_v6.h 2005-02-13 21:25:09 +11:00
+@@ -17,6 +17,13 @@
+
+ extern void ipv6_frag_init(void);
+
++/* extention headers */
++extern void ipv6_rthdr_init(void);
++extern void ipv6_frag_init(void);
++extern void ipv6_nodata_init(void);
++extern void ipv6_destopt_init(void);
++
++/* transport protocols */
+ extern void rawv6_init(void);
+ extern void udpv6_init(void);
+ extern void tcpv6_init(void);
+diff -Nru a/include/net/xfrm.h b/include/net/xfrm.h
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/include/net/xfrm.h 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,905 @@
++#ifndef _NET_XFRM_H
++#define _NET_XFRM_H
++
++#include <linux/xfrm.h>
++#include <linux/spinlock.h>
++#include <linux/list.h>
++#include <linux/skbuff.h>
++#include <linux/netdevice.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <linux/in6.h>
++
++#include <net/sock.h>
++#include <net/dst.h>
++#include <net/route.h>
++#include <net/ipv6.h>
++#include <net/ip6_fib.h>
++
++#define XFRM_ALIGN8(len) (((len) + 7) & ~7)
++
++extern struct semaphore xfrm_cfg_sem;
++
++/* Organization of SPD aka "XFRM rules"
++ ------------------------------------
++
++ Basic objects:
++ - policy rule, struct xfrm_policy (=SPD entry)
++ - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle)
++ - instance of a transformer, struct xfrm_state (=SA)
++ - template to clone xfrm_state, struct xfrm_tmpl
++
++ SPD is plain linear list of xfrm_policy rules, ordered by priority.
++ (To be compatible with existing pfkeyv2 implementations,
++ many rules with priority of 0x7fffffff are allowed to exist and
++ such rules are ordered in an unpredictable way, thanks to bsd folks.)
++
++ Lookup is plain linear search until the first match with selector.
++
++ If "action" is "block", then we prohibit the flow, otherwise:
++ if "xfrms_nr" is zero, the flow passes untransformed. Otherwise,
++ policy entry has list of up to XFRM_MAX_DEPTH transformations,
++ described by templates xfrm_tmpl. Each template is resolved
++ to a complete xfrm_state (see below) and we pack bundle of transformations
++ to a dst_entry returned to requestor.
++
++ dst -. xfrm .-> xfrm_state #1
++ |---. child .-> dst -. xfrm .-> xfrm_state #2
++ |---. child .-> dst -. xfrm .-> xfrm_state #3
++ |---. child .-> NULL
++
++ Bundles are cached at xrfm_policy struct (field ->bundles).
++
++
++ Resolution of xrfm_tmpl
++ -----------------------
++ Template contains:
++ 1. ->mode Mode: transport or tunnel
++ 2. ->id.proto Protocol: AH/ESP/IPCOMP
++ 3. ->id.daddr Remote tunnel endpoint, ignored for transport mode.
++ Q: allow to resolve security gateway?
++ 4. ->id.spi If not zero, static SPI.
++ 5. ->saddr Local tunnel endpoint, ignored for transport mode.
++ 6. ->algos List of allowed algos. Plain bitmask now.
++ Q: ealgos, aalgos, calgos. What a mess...
++ 7. ->share Sharing mode.
++ Q: how to implement private sharing mode? To add struct sock* to
++ flow id?
++
++ Having this template we search through SAD searching for entries
++ with appropriate mode/proto/algo, permitted by selector.
++ If no appropriate entry found, it is requested from key manager.
++
++ PROBLEMS:
++ Q: How to find all the bundles referring to a physical path for
++ PMTU discovery? Seems, dst should contain list of all parents...
++ and enter to infinite locking hierarchy disaster.
++ No! It is easier, we will not search for them, let them find us.
++ We add genid to each dst plus pointer to genid of raw IP route,
++ pmtu disc will update pmtu on raw IP route and increase its genid.
++ dst_check() will see this for top level and trigger resyncing
++ metrics. Plus, it will be made via sk->dst_cache. Solved.
++ */
++
++/* Full description of state of transformer. */
++struct xfrm_state
++{
++ /* Note: bydst is re-used during gc */
++ struct list_head bydst;
++ struct list_head byspi;
++
++ atomic_t refcnt;
++ spinlock_t lock;
++
++ struct xfrm_id id;
++ struct xfrm_selector sel;
++
++ /* Key manger bits */
++ struct {
++ u8 state;
++ u8 dying;
++ u32 seq;
++ } km;
++
++ /* Parameters of this state. */
++ struct {
++ u32 reqid;
++ u8 mode;
++ u8 replay_window;
++ u8 aalgo, ealgo, calgo;
++ u8 flags;
++ u16 family;
++ xfrm_address_t saddr;
++ int header_len;
++ int trailer_len;
++ } props;
++
++ struct xfrm_lifetime_cfg lft;
++
++ /* Data for transformer */
++ struct xfrm_algo *aalg;
++ struct xfrm_algo *ealg;
++ struct xfrm_algo *calg;
++
++ /* Data for encapsulator */
++ struct xfrm_encap_tmpl *encap;
++
++ /* IPComp needs an IPIP tunnel for handling uncompressed packets */
++ struct xfrm_state *tunnel;
++
++ /* If a tunnel, number of users + 1 */
++ atomic_t tunnel_users;
++
++ /* State for replay detection */
++ struct xfrm_replay_state replay;
++
++ /* Statistics */
++ struct xfrm_stats stats;
++
++ struct xfrm_lifetime_cur curlft;
++ struct timer_list timer;
++
++ /* Reference to data common to all the instances of this
++ * transformer. */
++ struct xfrm_type *type;
++
++ /* Private data of this transformer, format is opaque,
++ * interpreted by xfrm_type methods. */
++ void *data;
++};
++
++enum {
++ XFRM_STATE_VOID,
++ XFRM_STATE_ACQ,
++ XFRM_STATE_VALID,
++ XFRM_STATE_ERROR,
++ XFRM_STATE_EXPIRED,
++ XFRM_STATE_DEAD
++};
++
++struct xfrm_type;
++struct xfrm_dst;
++struct xfrm_policy_afinfo {
++ unsigned short family;
++ rwlock_t lock;
++ struct xfrm_type_map *type_map;
++ struct dst_ops *dst_ops;
++ void (*garbage_collect)(void);
++ int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
++ struct dst_entry *(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
++ int (*bundle_create)(struct xfrm_policy *policy,
++ struct xfrm_state **xfrm,
++ int nx,
++ struct flowi *fl,
++ struct dst_entry **dst_p);
++ void (*decode_session)(struct sk_buff *skb,
++ struct flowi *fl);
++};
++
++extern int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo);
++extern int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo);
++
++#define XFRM_ACQ_EXPIRES 30
++
++struct xfrm_tmpl;
++struct xfrm_state_afinfo {
++ unsigned short family;
++ rwlock_t lock;
++ struct list_head *state_bydst;
++ struct list_head *state_byspi;
++ void (*init_tempsel)(struct xfrm_state *x, struct flowi *fl,
++ struct xfrm_tmpl *tmpl,
++ xfrm_address_t *daddr, xfrm_address_t *saddr);
++ struct xfrm_state *(*state_lookup)(xfrm_address_t *daddr, u32 spi, u8 proto);
++ struct xfrm_state *(*find_acq)(u8 mode, u32 reqid, u8 proto,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ int create);
++};
++
++extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
++extern int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
++
++extern void xfrm_state_delete_tunnel(struct xfrm_state *x);
++
++struct xfrm_decap_state;
++struct xfrm_type
++{
++ char *description;
++ struct module *owner;
++ __u8 proto;
++
++ int (*init_state)(struct xfrm_state *x, void *args);
++ void (*destructor)(struct xfrm_state *);
++ int (*input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb);
++ int (*post_input)(struct xfrm_state *, struct xfrm_decap_state *, struct sk_buff *skb);
++ int (*output)(struct sk_buff *skb);
++ /* Estimate maximal size of result of transformation of a dgram */
++ u32 (*get_max_size)(struct xfrm_state *, int size);
++};
++
++struct xfrm_type_map {
++ rwlock_t lock;
++ struct xfrm_type *map[256];
++};
++
++extern int xfrm_register_type(struct xfrm_type *type, unsigned short family);
++extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family);
++extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family);
++extern void xfrm_put_type(struct xfrm_type *type);
++
++struct xfrm_tmpl
++{
++/* id in template is interpreted as:
++ * daddr - destination of tunnel, may be zero for transport mode.
++ * spi - zero to acquire spi. Not zero if spi is static, then
++ * daddr must be fixed too.
++ * proto - AH/ESP/IPCOMP
++ */
++ struct xfrm_id id;
++
++/* Source address of tunnel. Ignored, if it is not a tunnel. */
++ xfrm_address_t saddr;
++
++ __u32 reqid;
++
++/* Mode: transport/tunnel */
++ __u8 mode;
++
++/* Sharing mode: unique, this session only, this user only etc. */
++ __u8 share;
++
++/* May skip this transfomration if no SA is found */
++ __u8 optional;
++
++/* Bit mask of algos allowed for acquisition */
++ __u32 aalgos;
++ __u32 ealgos;
++ __u32 calgos;
++};
++
++#define XFRM_MAX_DEPTH 4
++
++struct xfrm_policy
++{
++ struct xfrm_policy *next;
++ struct list_head list;
++
++ /* This lock only affects elements except for entry. */
++ rwlock_t lock;
++ atomic_t refcnt;
++ struct timer_list timer;
++
++ u32 priority;
++ u32 index;
++ struct xfrm_selector selector;
++ struct xfrm_lifetime_cfg lft;
++ struct xfrm_lifetime_cur curlft;
++ struct dst_entry *bundles;
++ __u16 family;
++ __u8 action;
++ __u8 flags;
++ __u8 dead;
++ __u8 xfrm_nr;
++ struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
++};
++
++#define XFRM_KM_TIMEOUT 30
++
++struct xfrm_mgr
++{
++ struct list_head list;
++ char *id;
++ int (*notify)(struct xfrm_state *x, int event);
++ int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp, int dir);
++ struct xfrm_policy *(*compile_policy)(u16 family, int opt, u8 *data, int len, int *dir);
++ int (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
++ int (*notify_policy)(struct xfrm_policy *x, int dir, int event);
++};
++
++extern int xfrm_register_km(struct xfrm_mgr *km);
++extern int xfrm_unregister_km(struct xfrm_mgr *km);
++
++
++extern struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
++
++static inline void xfrm_pol_hold(struct xfrm_policy *policy)
++{
++ if (likely(policy != NULL))
++ atomic_inc(&policy->refcnt);
++}
++
++extern void __xfrm_policy_destroy(struct xfrm_policy *policy);
++
++static inline void xfrm_pol_put(struct xfrm_policy *policy)
++{
++ if (atomic_dec_and_test(&policy->refcnt))
++ __xfrm_policy_destroy(policy);
++}
++
++#define XFRM_DST_HSIZE 1024
++
++static __inline__
++unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
++{
++ unsigned h;
++ h = ntohl(addr->a4);
++ h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
++ return h;
++}
++
++static __inline__
++unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
++{
++ unsigned h;
++ h = ntohl(addr->a6[2]^addr->a6[3]);
++ h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
++ return h;
++}
++
++static __inline__
++unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
++{
++ switch (family) {
++ case AF_INET:
++ return __xfrm4_dst_hash(addr);
++ case AF_INET6:
++ return __xfrm6_dst_hash(addr);
++ }
++ return 0;
++}
++
++static __inline__
++unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
++{
++ unsigned h;
++ h = ntohl(addr->a4^spi^proto);
++ h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
++ return h;
++}
++
++static __inline__
++unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
++{
++ unsigned h;
++ h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
++ h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
++ return h;
++}
++
++static __inline__
++unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
++{
++ switch (family) {
++ case AF_INET:
++ return __xfrm4_spi_hash(addr, spi, proto);
++ case AF_INET6:
++ return __xfrm6_spi_hash(addr, spi, proto);
++ }
++ return 0; /*XXX*/
++}
++
++extern void __xfrm_state_destroy(struct xfrm_state *);
++
++static inline void xfrm_state_put(struct xfrm_state *x)
++{
++ if (atomic_dec_and_test(&x->refcnt))
++ __xfrm_state_destroy(x);
++}
++
++static inline void xfrm_state_hold(struct xfrm_state *x)
++{
++ atomic_inc(&x->refcnt);
++}
++
++static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
++{
++ __u32 *a1 = token1;
++ __u32 *a2 = token2;
++ int pdw;
++ int pbi;
++
++ pdw = prefixlen >> 5; /* num of whole __u32 in prefix */
++ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */
++
++ if (pdw)
++ if (memcmp(a1, a2, pdw << 2))
++ return 0;
++
++ if (pbi) {
++ __u32 mask;
++
++ mask = htonl((0xffffffff) << (32 - pbi));
++
++ if ((a1[pdw] ^ a2[pdw]) & mask)
++ return 0;
++ }
++
++ return 1;
++}
++
++static __inline__
++u16 xfrm_flowi_sport(struct flowi *fl)
++{
++ u16 port;
++ switch(fl->proto) {
++ case IPPROTO_TCP:
++ case IPPROTO_UDP:
++ case IPPROTO_SCTP:
++ port = fl->fl_ip_sport;
++ break;
++ case IPPROTO_ICMP:
++ case IPPROTO_ICMPV6:
++ port = htons(fl->fl_icmp_type);
++ break;
++ default:
++ port = 0; /*XXX*/
++ }
++ return port;
++}
++
++static __inline__
++u16 xfrm_flowi_dport(struct flowi *fl)
++{
++ u16 port;
++ switch(fl->proto) {
++ case IPPROTO_TCP:
++ case IPPROTO_UDP:
++ case IPPROTO_SCTP:
++ port = fl->fl_ip_dport;
++ break;
++ case IPPROTO_ICMP:
++ case IPPROTO_ICMPV6:
++ port = htons(fl->fl_icmp_code);
++ break;
++ default:
++ port = 0; /*XXX*/
++ }
++ return port;
++}
++
++static inline int
++__xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl)
++{
++ return addr_match(&fl->fl4_dst, &sel->daddr, sel->prefixlen_d) &&
++ addr_match(&fl->fl4_src, &sel->saddr, sel->prefixlen_s) &&
++ !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
++ !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
++ (fl->proto == sel->proto || !sel->proto) &&
++ (fl->oif == sel->ifindex || !sel->ifindex);
++}
++
++static inline int
++__xfrm6_selector_match(struct xfrm_selector *sel, struct flowi *fl)
++{
++ return addr_match(&fl->fl6_dst, &sel->daddr, sel->prefixlen_d) &&
++ addr_match(&fl->fl6_src, &sel->saddr, sel->prefixlen_s) &&
++ !((xfrm_flowi_dport(fl) ^ sel->dport) & sel->dport_mask) &&
++ !((xfrm_flowi_sport(fl) ^ sel->sport) & sel->sport_mask) &&
++ (fl->proto == sel->proto || !sel->proto) &&
++ (fl->oif == sel->ifindex || !sel->ifindex);
++}
++
++static inline int
++xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
++ unsigned short family)
++{
++ switch (family) {
++ case AF_INET:
++ return __xfrm4_selector_match(sel, fl);
++ case AF_INET6:
++ return __xfrm6_selector_match(sel, fl);
++ }
++ return 0;
++}
++
++/* A struct encoding bundle of transformations to apply to some set of flow.
++ *
++ * dst->child points to the next element of bundle.
++ * dst->xfrm points to an instanse of transformer.
++ *
++ * Due to unfortunate limitations of current routing cache, which we
++ * have no time to fix, it mirrors struct rtable and bound to the same
++ * routing key, including saddr,daddr. However, we can have many of
++ * bundles differing by session id. All the bundles grow from a parent
++ * policy rule.
++ */
++struct xfrm_dst
++{
++ union {
++ struct xfrm_dst *next;
++ struct dst_entry dst;
++ struct rtable rt;
++ struct rt6_info rt6;
++ } u;
++};
++
++/* Decapsulation state, used by the input to store data during
++ * decapsulation procedure, to be used later (during the policy
++ * check
++ */
++struct xfrm_decap_state {
++ char decap_data[20];
++ __u16 decap_type;
++};
++
++struct sec_decap_state {
++ struct xfrm_state *xvec;
++ struct xfrm_decap_state decap;
++};
++
++struct sec_path
++{
++ atomic_t refcnt;
++ int len;
++ struct sec_decap_state x[XFRM_MAX_DEPTH];
++};
++
++static inline struct sec_path *
++secpath_get(struct sec_path *sp)
++{
++ if (sp)
++ atomic_inc(&sp->refcnt);
++ return sp;
++}
++
++extern void __secpath_destroy(struct sec_path *sp);
++
++static inline void
++secpath_put(struct sec_path *sp)
++{
++ if (sp && atomic_dec_and_test(&sp->refcnt))
++ __secpath_destroy(sp);
++}
++
++extern struct sec_path *secpath_dup(struct sec_path *src);
++
++static inline void
++secpath_reset(struct sk_buff *skb)
++{
++#ifdef CONFIG_XFRM
++ secpath_put(skb->sp);
++ skb->sp = NULL;
++#endif
++}
++
++static inline int
++__xfrm4_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x)
++{
++ return (tmpl->saddr.a4 &&
++ tmpl->saddr.a4 != x->props.saddr.a4);
++}
++
++static inline int
++__xfrm6_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x)
++{
++ return (!ipv6_addr_any((struct in6_addr*)&tmpl->saddr) &&
++ ipv6_addr_cmp((struct in6_addr *)&tmpl->saddr, (struct in6_addr*)&x->props.saddr));
++}
++
++static inline int
++xfrm_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x, unsigned short family)
++{
++ switch (family) {
++ case AF_INET:
++ return __xfrm4_state_addr_cmp(tmpl, x);
++ case AF_INET6:
++ return __xfrm6_state_addr_cmp(tmpl, x);
++ }
++ return !0;
++}
++
++#ifdef CONFIG_XFRM
++
++extern int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family);
++
++static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
++{
++ if (sk && sk->policy[XFRM_POLICY_IN])
++ return __xfrm_policy_check(sk, dir, skb, family);
++
++ return (!xfrm_policy_list[dir] && !skb->sp) ||
++ (skb->dst->flags & DST_NOPOLICY) ||
++ __xfrm_policy_check(sk, dir, skb, family);
++}
++
++static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
++{
++ return xfrm_policy_check(sk, dir, skb, AF_INET);
++}
++
++static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
++{
++ return xfrm_policy_check(sk, dir, skb, AF_INET6);
++}
++
++
++extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
++
++static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
++{
++ return !xfrm_policy_list[XFRM_POLICY_OUT] ||
++ (skb->dst->flags & DST_NOXFRM) ||
++ __xfrm_route_forward(skb, family);
++}
++
++static inline int xfrm4_route_forward(struct sk_buff *skb)
++{
++ return xfrm_route_forward(skb, AF_INET);
++}
++
++static inline int xfrm6_route_forward(struct sk_buff *skb)
++{
++ return xfrm_route_forward(skb, AF_INET6);
++}
++
++extern int __xfrm_sk_clone_policy(struct sock *sk);
++
++static inline int xfrm_sk_clone_policy(struct sock *sk)
++{
++ if (unlikely(sk->policy[0] || sk->policy[1]))
++ return __xfrm_sk_clone_policy(sk);
++ return 0;
++}
++
++extern void xfrm_policy_delete(struct xfrm_policy *pol, int dir);
++
++static inline void xfrm_sk_free_policy(struct sock *sk)
++{
++ if (unlikely(sk->policy[0] != NULL)) {
++ xfrm_policy_delete(sk->policy[0], XFRM_POLICY_MAX);
++ sk->policy[0] = NULL;
++ }
++ if (unlikely(sk->policy[1] != NULL)) {
++ xfrm_policy_delete(sk->policy[1], XFRM_POLICY_MAX+1);
++ sk->policy[1] = NULL;
++ }
++}
++
++#else
++
++static inline void xfrm_sk_free_policy(struct sock *sk) {}
++static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; }
++static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
++static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
++static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
++{
++ return 1;
++}
++static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
++{
++ return 1;
++}
++static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
++{
++ return 1;
++}
++#endif
++
++static __inline__
++xfrm_address_t *xfrm_flowi_daddr(struct flowi *fl, unsigned short family)
++{
++ switch (family){
++ case AF_INET:
++ return (xfrm_address_t *)&fl->fl4_dst;
++ case AF_INET6:
++ return (xfrm_address_t *)&fl->fl6_dst;
++ }
++ return NULL;
++}
++
++static __inline__
++xfrm_address_t *xfrm_flowi_saddr(struct flowi *fl, unsigned short family)
++{
++ switch (family){
++ case AF_INET:
++ return (xfrm_address_t *)&fl->fl4_src;
++ case AF_INET6:
++ return (xfrm_address_t *)&fl->fl6_src;
++ }
++ return NULL;
++}
++
++static __inline__ int
++__xfrm4_state_addr_check(struct xfrm_state *x,
++ xfrm_address_t *daddr, xfrm_address_t *saddr)
++{
++ if (daddr->a4 == x->id.daddr.a4 &&
++ (saddr->a4 == x->props.saddr.a4 || !saddr->a4 || !x->props.saddr.a4))
++ return 1;
++ return 0;
++}
++
++static __inline__ int
++__xfrm6_state_addr_check(struct xfrm_state *x,
++ xfrm_address_t *daddr, xfrm_address_t *saddr)
++{
++ if (!ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) &&
++ (!ipv6_addr_cmp((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr)||
++ ipv6_addr_any((struct in6_addr *)saddr) ||
++ ipv6_addr_any((struct in6_addr *)&x->props.saddr)))
++ return 1;
++ return 0;
++}
++
++static __inline__ int
++xfrm_state_addr_check(struct xfrm_state *x,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ unsigned short family)
++{
++ switch (family) {
++ case AF_INET:
++ return __xfrm4_state_addr_check(x, daddr, saddr);
++ case AF_INET6:
++ return __xfrm6_state_addr_check(x, daddr, saddr);
++ }
++ return 0;
++}
++
++static inline int xfrm_state_kern(struct xfrm_state *x)
++{
++ return atomic_read(&x->tunnel_users);
++}
++
++/*
++ * xfrm algorithm information
++ */
++struct xfrm_algo_auth_info {
++ u16 icv_truncbits;
++ u16 icv_fullbits;
++};
++
++struct xfrm_algo_encr_info {
++ u16 blockbits;
++ u16 defkeybits;
++};
++
++struct xfrm_algo_comp_info {
++ u16 threshold;
++};
++
++struct xfrm_algo_desc {
++ char *name;
++ u8 available:1;
++ union {
++ struct xfrm_algo_auth_info auth;
++ struct xfrm_algo_encr_info encr;
++ struct xfrm_algo_comp_info comp;
++ } uinfo;
++ struct sadb_alg desc;
++};
++
++/* XFRM tunnel handlers. */
++struct xfrm_tunnel {
++ int (*handler)(struct sk_buff *skb);
++ void (*err_handler)(struct sk_buff *skb, void *info);
++};
++
++struct xfrm6_tunnel {
++ int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp);
++ void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info);
++};
++
++extern void xfrm_init(void);
++extern void xfrm4_init(void);
++extern void xfrm4_fini(void);
++extern void xfrm6_init(void);
++extern void xfrm6_fini(void);
++extern void xfrm_state_init(void);
++extern void xfrm4_state_init(void);
++extern void xfrm4_state_fini(void);
++extern void xfrm6_state_init(void);
++extern void xfrm6_state_fini(void);
++
++extern int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), void *);
++extern struct xfrm_state *xfrm_state_alloc(void);
++extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
++ struct flowi *fl, struct xfrm_tmpl *tmpl,
++ struct xfrm_policy *pol, int *err,
++ unsigned short family);
++extern int xfrm_state_check_expire(struct xfrm_state *x);
++extern void xfrm_state_insert(struct xfrm_state *x);
++extern int xfrm_state_add(struct xfrm_state *x);
++extern int xfrm_state_update(struct xfrm_state *x);
++extern struct xfrm_state *xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family);
++extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
++extern void xfrm_state_delete(struct xfrm_state *x);
++extern void xfrm_state_flush(u8 proto);
++extern int xfrm_replay_check(struct xfrm_state *x, u32 seq);
++extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq);
++extern int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb);
++extern int xfrm4_rcv(struct sk_buff *skb);
++extern int xfrm4_output(struct sk_buff *skb);
++extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
++extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
++extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi);
++extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
++extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
++extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
++extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
++extern void xfrm6_tunnel_free_spi(xfrm_address_t *saddr);
++extern u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr);
++extern int xfrm6_output(struct sk_buff *skb);
++
++#ifdef CONFIG_XFRM
++extern int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type);
++extern int xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen);
++extern int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsigned short family);
++#else
++static inline int xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen)
++{
++ return -ENOPROTOOPT;
++}
++
++static inline int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
++{
++ /* should not happen */
++ kfree_skb(skb);
++ return 0;
++}
++static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsigned short family)
++{
++ return -EINVAL;
++}
++#endif
++
++void xfrm_policy_init(void);
++struct xfrm_policy *xfrm_policy_alloc(int gfp);
++extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
++int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
++struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
++ int delete);
++struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
++void xfrm_policy_flush(void);
++u32 xfrm_get_acqseq(void);
++void xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
++struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ int create, unsigned short family);
++extern void xfrm_policy_flush(void);
++extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
++extern struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl);
++extern int xfrm_flush_bundles(void);
++
++extern wait_queue_head_t km_waitq;
++extern void km_state_expired(struct xfrm_state *x, int hard);
++extern int km_query(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *pol);
++extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
++extern void km_policy_expired(struct xfrm_policy *pol, int dir, int hard);
++
++extern void xfrm_input_init(void);
++extern int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq);
++
++extern void xfrm_probe_algs(void);
++extern int xfrm_count_auth_supported(void);
++extern int xfrm_count_enc_supported(void);
++extern struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx);
++extern struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx);
++extern struct xfrm_algo_desc *xfrm_calg_get_byidx(unsigned int idx);
++extern struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id);
++extern struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id);
++extern struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id);
++extern struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name);
++extern struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name);
++extern struct xfrm_algo_desc *xfrm_calg_get_byname(char *name);
++
++struct crypto_tfm;
++typedef void (icv_update_fn_t)(struct crypto_tfm *, struct scatterlist *, unsigned int);
++
++extern void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm,
++ int offset, int len, icv_update_fn_t icv_update);
++
++static inline int xfrm_addr_cmp(xfrm_address_t *a, xfrm_address_t *b,
++ int family)
++{
++ switch (family) {
++ default:
++ case AF_INET:
++ return a->a4 - b->a4;
++ case AF_INET6:
++ return ipv6_addr_cmp((struct in6_addr *)a,
++ (struct in6_addr *)b);
++ }
++}
++
++#endif /* _NET_XFRM_H */
+diff -Nru a/net/Config.in b/net/Config.in
+--- a/net/Config.in 2005-02-13 21:25:09 +11:00
++++ b/net/Config.in 2005-02-13 21:25:09 +11:00
+@@ -16,6 +16,7 @@
+ fi
+ bool 'Socket Filtering' CONFIG_FILTER
+ tristate 'Unix domain sockets' CONFIG_UNIX
++tristate 'PF_KEY sockets' CONFIG_NET_KEY
+ bool 'TCP/IP networking' CONFIG_INET
+ if [ "$CONFIG_INET" = "y" ]; then
+ source net/ipv4/Config.in
+@@ -25,6 +26,28 @@
+ if [ "$CONFIG_IPV6" != "n" ]; then
+ source net/ipv6/Config.in
+ fi
++ fi
++ if [ "$CONFIG_NET_KEY" != "n" -o \
++ "$CONFIG_NET_IPGRE" != "n" -o \
++ "$CONFIG_INET_AH" != "n" -o \
++ "$CONFIG_INET_ESP" != "n" -o \
++ "$CONFIG_INET_TUNNEL" != "n" ]; then
++ define_bool CONFIG_XFRM y
++ else
++ if [ "$CONFIG_IPV6" != "n" ]; then
++ if [ "$CONFIG_INET6_AH" != "n" -o \
++ "$CONFIG_INET6_ESP" != "n" -o \
++ "$CONFIG_INET6_TUNNEL" != "n" ]; then
++ define_bool CONFIG_XFRM y
++ else
++ bool ' XFRM support' CONFIG_XFRM
++ fi
++ else
++ bool ' XFRM support' CONFIG_XFRM
++ fi
++ fi
++ if [ "$CONFIG_XFRM" = "y" ]; then
++ source net/xfrm/Config.in
+ fi
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ source net/khttpd/Config.in
+diff -Nru a/net/Makefile b/net/Makefile
+--- a/net/Makefile 2005-02-13 21:25:09 +11:00
++++ b/net/Makefile 2005-02-13 21:25:09 +11:00
+@@ -7,28 +7,24 @@
+
+ O_TARGET := network.o
+
+-mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp 802
++mod-subdirs := ipv4/netfilter ipv6 ipx irda bluetooth atm netlink sched core sctp 802 xfrm
+ export-objs := netsyms.o
+
+ subdir-y := core ethernet
+-subdir-m := ipv4 # hum?
++subdir-m := ipv4 xfrm # hum?
+
+
+ subdir-$(CONFIG_NET) += 802 sched netlink
+ subdir-$(CONFIG_IPV6) += ipv6
+ subdir-$(CONFIG_INET) += ipv4
++subdir-$(CONFIG_XFRM) += xfrm
+ subdir-$(CONFIG_NETFILTER) += ipv4/netfilter
+ subdir-$(CONFIG_UNIX) += unix
+ subdir-$(CONFIG_IP_SCTP) += sctp
+
+-ifneq ($(CONFIG_IPV6),n)
+-ifneq ($(CONFIG_IPV6),)
+-subdir-$(CONFIG_NETFILTER) += ipv6/netfilter
+-endif
+-endif
+-
+ subdir-$(CONFIG_KHTTPD) += khttpd
+ subdir-$(CONFIG_PACKET) += packet
++subdir-$(CONFIG_NET_KEY) += key
+ subdir-$(CONFIG_NET_SCHED) += sched
+ subdir-$(CONFIG_BRIDGE) += bridge
+ subdir-$(CONFIG_IPX) += ipx
+diff -Nru a/net/atm/clip.c b/net/atm/clip.c
+--- a/net/atm/clip.c 2005-02-13 21:25:09 +11:00
++++ b/net/atm/clip.c 2005-02-13 21:25:09 +11:00
+@@ -503,6 +503,7 @@
+ struct atmarp_entry *entry;
+ int error;
+ struct clip_vcc *clip_vcc;
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = 1 } } };
+ struct rtable *rt;
+
+ if (vcc->push != clip_push) {
+@@ -519,7 +520,7 @@
+ unlink_clip_vcc(clip_vcc);
+ return 0;
+ }
+- error = ip_route_output(&rt,ip,0,1,0);
++ error = ip_route_output_key(&rt,&fl);
+ if (error) return error;
+ neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1);
+ ip_rt_put(rt);
+diff -Nru a/net/core/Makefile b/net/core/Makefile
+--- a/net/core/Makefile 2005-02-13 21:25:10 +11:00
++++ b/net/core/Makefile 2005-02-13 21:25:10 +11:00
+@@ -21,8 +21,8 @@
+
+ obj-$(CONFIG_FILTER) += filter.o
+
+-obj-$(CONFIG_NET) += dev.o ethtool.o dev_mcast.o dst.o neighbour.o \
+- rtnetlink.o utils.o
++obj-$(CONFIG_NET) += flow.o dev.o ethtool.o dev_mcast.o dst.o \
++ neighbour.o rtnetlink.o utils.o
+
+ obj-$(CONFIG_NETFILTER) += netfilter.o
+ obj-$(CONFIG_NET_DIVERT) += dv.o
+diff -Nru a/net/core/dev.c b/net/core/dev.c
+--- a/net/core/dev.c 2005-02-13 21:25:09 +11:00
++++ b/net/core/dev.c 2005-02-13 21:25:09 +11:00
+@@ -912,6 +912,13 @@
+ return notifier_chain_register(&netdev_chain, nb);
+ }
+
++/* Synchronize with packet receive processing. */
++void synchronize_net(void)
++{
++ br_write_lock_bh(BR_NETPROTO_LOCK);
++ br_write_unlock_bh(BR_NETPROTO_LOCK);
++}
++
+ /**
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
+@@ -1479,6 +1486,7 @@
+ #endif
+
+ skb->h.raw = skb->nh.raw = skb->data;
++ skb->mac_len = skb->nh.raw - skb->mac.raw;
+
+ pt_prev = NULL;
+ for (ptype = ptype_all; ptype; ptype = ptype->next) {
+diff -Nru a/net/core/dst.c b/net/core/dst.c
+--- a/net/core/dst.c 2005-02-13 21:25:09 +11:00
++++ b/net/core/dst.c 2005-02-13 21:25:09 +11:00
+@@ -36,11 +36,11 @@
+ static unsigned long dst_gc_timer_expires;
+ static unsigned long dst_gc_timer_inc = DST_GC_MAX;
+ static void dst_run_gc(unsigned long);
++static void ___dst_free(struct dst_entry * dst);
+
+ static struct timer_list dst_gc_timer =
+ { data: DST_GC_MIN, function: dst_run_gc };
+
+-
+ static void dst_run_gc(unsigned long dummy)
+ {
+ int delayed = 0;
+@@ -61,7 +61,25 @@
+ continue;
+ }
+ *dstp = dst->next;
+- dst_destroy(dst);
++
++ dst = dst_destroy(dst);
++ if (dst) {
++ /* NOHASH and still referenced. Unless it is already
++ * on gc list, invalidate it and add to gc list.
++ *
++ * Note: this is temporary. Actually, NOHASH dst's
++ * must be obsoleted when parent is obsoleted.
++ * But we do not have state "obsoleted, but
++ * referenced by parent", so it is right.
++ */
++ if (dst->obsolete > 1)
++ continue;
++
++ ___dst_free(dst);
++ dst->next = *dstp;
++ *dstp = dst;
++ dstp = &dst->next;
++ }
+ }
+ if (!dst_garbage_list) {
+ dst_gc_timer_inc = DST_GC_MAX;
+@@ -108,6 +126,7 @@
+ atomic_set(&dst->__refcnt, 0);
+ dst->ops = ops;
+ dst->lastuse = jiffies;
++ dst->path = dst;
+ dst->input = dst_discard;
+ dst->output = dst_blackhole;
+ #if RT_CACHE_DEBUG >= 2
+@@ -117,10 +136,8 @@
+ return dst;
+ }
+
+-void __dst_free(struct dst_entry * dst)
++static void ___dst_free(struct dst_entry * dst)
+ {
+- spin_lock_bh(&dst_lock);
+-
+ /* The first case (dev==NULL) is required, when
+ protocol module is unloaded.
+ */
+@@ -129,6 +146,12 @@
+ dst->output = dst_blackhole;
+ }
+ dst->obsolete = 2;
++}
++
++void __dst_free(struct dst_entry * dst)
++{
++ spin_lock_bh(&dst_lock);
++ ___dst_free(dst);
+ dst->next = dst_garbage_list;
+ dst_garbage_list = dst;
+ if (dst_gc_timer_inc > DST_GC_INC) {
+@@ -136,14 +159,19 @@
+ dst_gc_timer_expires = DST_GC_MIN;
+ mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+ }
+-
+ spin_unlock_bh(&dst_lock);
+ }
+
+-void dst_destroy(struct dst_entry * dst)
++struct dst_entry *dst_destroy(struct dst_entry * dst)
+ {
+- struct neighbour *neigh = dst->neighbour;
+- struct hh_cache *hh = dst->hh;
++ struct dst_entry *child;
++ struct neighbour *neigh;
++ struct hh_cache *hh;
++
++again:
++ neigh = dst->neighbour;
++ hh = dst->hh;
++ child = dst->child;
+
+ dst->hh = NULL;
+ if (hh && atomic_dec_and_test(&hh->hh_refcnt))
+@@ -164,6 +192,21 @@
+ atomic_dec(&dst_total);
+ #endif
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
++
++ dst = child;
++ if (dst) {
++ if (atomic_dec_and_test(&dst->__refcnt)) {
++ /* We were real parent of this dst, so kill child. */
++ if (dst->flags&DST_NOHASH)
++ goto again;
++ } else {
++ /* Child is still referenced, return it for freeing. */
++ if (dst->flags&DST_NOHASH)
++ return dst;
++ /* Child is still in his hash table */
++ }
++ }
++ return NULL;
+ }
+
+ static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+diff -Nru a/net/core/flow.c b/net/core/flow.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/core/flow.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,322 @@
++/* flow.c: Generic flow cache.
++ *
++ * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet at ms2.inr.ac.ru)
++ * Copyright (C) 2003 David S. Miller (davem at redhat.com)
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++#include <linux/jhash.h>
++#include <linux/interrupt.h>
++#include <linux/mm.h>
++#include <linux/random.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/smp.h>
++#include <linux/completion.h>
++#include <net/flow.h>
++#include <asm/atomic.h>
++#include <asm/semaphore.h>
++
++struct flow_cache_entry {
++ struct flow_cache_entry *next;
++ u16 family;
++ u8 dir;
++ struct flowi key;
++ u32 genid;
++ void *object;
++ atomic_t *object_ref;
++};
++
++atomic_t flow_cache_genid = ATOMIC_INIT(0);
++
++static u32 flow_hash_shift;
++#define flow_hash_size (1 << flow_hash_shift)
++static struct flow_cache_entry **flow_table;
++static kmem_cache_t *flow_cachep;
++
++static int flow_lwm, flow_hwm;
++
++struct flow_percpu_info {
++ int hash_rnd_recalc;
++ u32 hash_rnd;
++ int count;
++} ____cacheline_aligned;
++static struct flow_percpu_info flow_hash_info[NR_CPUS];
++
++#define flow_hash_rnd_recalc(cpu) (flow_hash_info[cpu].hash_rnd_recalc)
++#define flow_hash_rnd(cpu) (flow_hash_info[cpu].hash_rnd)
++#define flow_count(cpu) (flow_hash_info[cpu].count)
++
++static struct timer_list flow_hash_rnd_timer;
++
++#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
++
++struct flow_flush_info {
++ atomic_t cpuleft;
++ struct completion completion;
++};
++static struct tasklet_struct flow_flush_tasklets[NR_CPUS];
++static DECLARE_MUTEX(flow_flush_sem);
++
++static void flow_cache_new_hashrnd(unsigned long arg)
++{
++ int i;
++
++ for (i = 0; i < NR_CPUS; i++)
++ flow_hash_rnd_recalc(i) = 1;
++
++ flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
++ add_timer(&flow_hash_rnd_timer);
++}
++
++static void __flow_cache_shrink(int cpu, int shrink_to)
++{
++ struct flow_cache_entry *fle, **flp;
++ int i;
++
++ for (i = 0; i < flow_hash_size; i++) {
++ int k = 0;
++
++ flp = &flow_table[cpu*flow_hash_size+i];
++ while ((fle = *flp) != NULL && k < shrink_to) {
++ k++;
++ flp = &fle->next;
++ }
++ while ((fle = *flp) != NULL) {
++ *flp = fle->next;
++ if (fle->object)
++ atomic_dec(fle->object_ref);
++ kmem_cache_free(flow_cachep, fle);
++ flow_count(cpu)--;
++ }
++ }
++}
++
++static void flow_cache_shrink(int cpu)
++{
++ int shrink_to = flow_lwm / flow_hash_size;
++
++ __flow_cache_shrink(cpu, shrink_to);
++}
++
++static void flow_new_hash_rnd(int cpu)
++{
++ get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
++ flow_hash_rnd_recalc(cpu) = 0;
++
++ __flow_cache_shrink(cpu, 0);
++}
++
++static u32 flow_hash_code(struct flowi *key, int cpu)
++{
++ u32 *k = (u32 *) key;
++
++ return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
++ (flow_hash_size - 1));
++}
++
++#if (BITS_PER_LONG == 64)
++typedef u64 flow_compare_t;
++#else
++typedef u32 flow_compare_t;
++#endif
++
++extern void flowi_is_missized(void);
++
++/* I hear what you're saying, use memcmp. But memcmp cannot make
++ * important assumptions that we can here, such as alignment and
++ * constant size.
++ */
++static int flow_key_compare(struct flowi *key1, struct flowi *key2)
++{
++ flow_compare_t *k1, *k1_lim, *k2;
++ const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
++
++ if (sizeof(struct flowi) % sizeof(flow_compare_t))
++ flowi_is_missized();
++
++ k1 = (flow_compare_t *) key1;
++ k1_lim = k1 + n_elem;
++
++ k2 = (flow_compare_t *) key2;
++
++ do {
++ if (*k1++ != *k2++)
++ return 1;
++ } while (k1 < k1_lim);
++
++ return 0;
++}
++
++void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
++ flow_resolve_t resolver)
++{
++ struct flow_cache_entry *fle, **head;
++ unsigned int hash;
++ int cpu;
++
++ local_bh_disable();
++ cpu = smp_processor_id();
++ if (flow_hash_rnd_recalc(cpu))
++ flow_new_hash_rnd(cpu);
++ hash = flow_hash_code(key, cpu);
++
++ head = &flow_table[(cpu << flow_hash_shift) + hash];
++ for (fle = *head; fle; fle = fle->next) {
++ if (fle->family == family &&
++ fle->dir == dir &&
++ flow_key_compare(key, &fle->key) == 0) {
++ if (fle->genid == atomic_read(&flow_cache_genid)) {
++ void *ret = fle->object;
++
++ if (ret)
++ atomic_inc(fle->object_ref);
++ local_bh_enable();
++
++ return ret;
++ }
++ break;
++ }
++ }
++
++ if (!fle) {
++ if (flow_count(cpu) > flow_hwm)
++ flow_cache_shrink(cpu);
++
++ fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC);
++ if (fle) {
++ fle->next = *head;
++ *head = fle;
++ fle->family = family;
++ fle->dir = dir;
++ memcpy(&fle->key, key, sizeof(*key));
++ fle->object = NULL;
++ flow_count(cpu)++;
++ }
++ }
++
++ {
++ void *obj;
++ atomic_t *obj_ref;
++
++ resolver(key, family, dir, &obj, &obj_ref);
++
++ if (fle) {
++ fle->genid = atomic_read(&flow_cache_genid);
++
++ if (fle->object)
++ atomic_dec(fle->object_ref);
++
++ fle->object = obj;
++ fle->object_ref = obj_ref;
++ if (obj)
++ atomic_inc(fle->object_ref);
++ }
++ local_bh_enable();
++
++ return obj;
++ }
++}
++
++static void flow_cache_flush_tasklet(unsigned long data)
++{
++ struct flow_flush_info *info = (void *)data;
++ int i;
++ int cpu;
++
++ cpu = smp_processor_id();
++ for (i = 0; i < flow_hash_size; i++) {
++ struct flow_cache_entry *fle;
++
++ fle = flow_table[(cpu << flow_hash_shift) + i];
++ for (; fle; fle = fle->next) {
++ unsigned genid = atomic_read(&flow_cache_genid);
++
++ if (!fle->object || fle->genid == genid)
++ continue;
++
++ fle->object = NULL;
++ atomic_dec(fle->object_ref);
++ }
++ }
++
++ if (atomic_dec_and_test(&info->cpuleft))
++ complete(&info->completion);
++}
++
++static void flow_cache_flush_per_cpu(void *data)
++{
++ struct flow_flush_info *info = data;
++ int cpu;
++ struct tasklet_struct *tasklet;
++
++ cpu = smp_processor_id();
++ tasklet = &flow_flush_tasklets[cpu];
++ tasklet_init(tasklet, flow_cache_flush_tasklet, (unsigned long)info);
++ tasklet_schedule(tasklet);
++}
++
++void flow_cache_flush(void)
++{
++ struct flow_flush_info info;
++
++ atomic_set(&info.cpuleft, smp_num_cpus);
++ init_completion(&info.completion);
++
++ down(&flow_flush_sem);
++
++ local_bh_disable();
++ smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0);
++ flow_cache_flush_per_cpu(&info);
++ local_bh_enable();
++
++ wait_for_completion(&info.completion);
++
++ up(&flow_flush_sem);
++}
++
++static int __init flow_cache_init(void)
++{
++ unsigned long order;
++ int i;
++
++ flow_cachep = kmem_cache_create("flow_cache",
++ sizeof(struct flow_cache_entry),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++
++ if (!flow_cachep)
++ panic("NET: failed to allocate flow cache slab\n");
++
++ flow_hash_shift = 10;
++ flow_lwm = 2 * flow_hash_size;
++ flow_hwm = 4 * flow_hash_size;
++
++ for (i = 0; i < NR_CPUS; i++)
++ flow_hash_rnd_recalc(i) = 1;
++
++ init_timer(&flow_hash_rnd_timer);
++ flow_hash_rnd_timer.function = flow_cache_new_hashrnd;
++ flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
++ add_timer(&flow_hash_rnd_timer);
++
++ for (order = 0;
++ (PAGE_SIZE << order) <
++ (NR_CPUS*sizeof(struct flow_entry *)*flow_hash_size);
++ order++)
++ /* NOTHING */;
++
++ flow_table = (struct flow_cache_entry **)
++ __get_free_pages(GFP_ATOMIC, order);
++
++ if (!flow_table)
++ panic("Failed to allocate flow cache hash table\n");
++
++ memset(flow_table, 0, PAGE_SIZE << order);
++
++ return 0;
++}
++
++module_init(flow_cache_init);
+diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
+--- a/net/core/neighbour.c 2005-02-13 21:25:09 +11:00
++++ b/net/core/neighbour.c 2005-02-13 21:25:09 +11:00
+@@ -737,7 +737,9 @@
+ static __inline__ int neigh_max_probes(struct neighbour *n)
+ {
+ struct neigh_parms *p = n->parms;
+- return p->ucast_probes + p->app_probes + p->mcast_probes;
++ return (n->nud_state & NUD_PROBE ?
++ p->ucast_probes :
++ p->ucast_probes + p->app_probes + p->mcast_probes);
+ }
+
+
+@@ -1227,9 +1229,6 @@
+ if (*p == parms) {
+ *p = parms->next;
+ write_unlock_bh(&tbl->lock);
+-#ifdef CONFIG_SYSCTL
+- neigh_sysctl_unregister(parms);
+-#endif
+ kfree(parms);
+ return;
+ }
+@@ -1326,9 +1325,6 @@
+ kfree(tbl->phash_buckets);
+ tbl->phash_buckets = NULL;
+
+-#ifdef CONFIG_SYSCTL
+- neigh_sysctl_unregister(&tbl->parms);
+-#endif
+ return 0;
+ }
+
+diff -Nru a/net/core/netfilter.c b/net/core/netfilter.c
+--- a/net/core/netfilter.c 2005-02-13 21:25:09 +11:00
++++ b/net/core/netfilter.c 2005-02-13 21:25:09 +11:00
+@@ -563,7 +563,7 @@
+ {
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct rtable *rt;
+- struct rt_key key = {};
++ struct flowi fl = {};
+ struct dst_entry *odst;
+ unsigned int hh_len;
+
+@@ -571,14 +571,15 @@
+ * packets with foreign saddr to be appear on the NF_IP_LOCAL_OUT hook.
+ */
+ if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+- key.dst = iph->daddr;
+- key.src = iph->saddr;
+- key.oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0;
+- key.tos = RT_TOS(iph->tos);
++ fl.nl_u.ip4_u.daddr = iph->daddr;
++ fl.nl_u.ip4_u.saddr = iph->saddr;
++ fl.oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0;
++ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- key.fwmark = (*pskb)->nfmark;
++ fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+ #endif
+- if (ip_route_output_key(&rt, &key) != 0)
++ fl.proto = iph->protocol;
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ /* Drop old route. */
+@@ -587,8 +588,8 @@
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+- key.dst = iph->saddr;
+- if (ip_route_output_key(&rt, &key) != 0)
++ fl.nl_u.ip4_u.daddr = iph->saddr;
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ odst = (*pskb)->dst;
+diff -Nru a/net/core/rtnetlink.c b/net/core/rtnetlink.c
+--- a/net/core/rtnetlink.c 2005-02-13 21:25:10 +11:00
++++ b/net/core/rtnetlink.c 2005-02-13 21:25:10 +11:00
+@@ -128,7 +128,7 @@
+ return err;
+ }
+
+-int rtnetlink_put_metrics(struct sk_buff *skb, unsigned *metrics)
++int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+ {
+ struct rtattr *mx = (struct rtattr*)skb->tail;
+ int i;
+@@ -136,7 +136,7 @@
+ RTA_PUT(skb, RTA_METRICS, 0, NULL);
+ for (i=0; i<RTAX_MAX; i++) {
+ if (metrics[i])
+- RTA_PUT(skb, i+1, sizeof(unsigned), metrics+i);
++ RTA_PUT(skb, i+1, sizeof(u32), metrics+i);
+ }
+ mx->rta_len = skb->tail - (u8*)mx;
+ if (mx->rta_len == RTA_LENGTH(0))
+diff -Nru a/net/core/skbuff.c b/net/core/skbuff.c
+--- a/net/core/skbuff.c 2005-02-13 21:25:09 +11:00
++++ b/net/core/skbuff.c 2005-02-13 21:25:09 +11:00
+@@ -57,6 +57,7 @@
+ #include <net/dst.h>
+ #include <net/sock.h>
+ #include <net/checksum.h>
++#include <net/xfrm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -201,6 +202,7 @@
+
+ /* Set up other state */
+ skb->len = 0;
++ skb->local_df = 0;
+ skb->cloned = 0;
+ skb->data_len = 0;
+
+@@ -233,6 +235,7 @@
+ skb->dev = NULL;
+ skb->real_dev = NULL;
+ skb->dst = NULL;
++ skb->sp = NULL;
+ memset(skb->cb, 0, sizeof(skb->cb));
+ skb->pkt_type = PACKET_HOST; /* Default type */
+ skb->ip_summed = 0;
+@@ -317,6 +320,9 @@
+ }
+
+ dst_release(skb->dst);
++#ifdef CONFIG_XFRM
++ secpath_put(skb->sp);
++#endif
+ if(skb->destructor) {
+ if (in_irq()) {
+ printk(KERN_WARNING "Warning: kfree_skb on hard IRQ %p\n",
+@@ -369,10 +375,15 @@
+ C(mac);
+ C(dst);
+ dst_clone(n->dst);
++ C(sp);
++#ifdef CONFIG_INET
++ secpath_get(n->sp);
++#endif
+ memcpy(n->cb, skb->cb, sizeof(skb->cb));
+ C(len);
+ C(data_len);
+ C(csum);
++ C(local_df);
+ n->cloned = 1;
+ C(pkt_type);
+ C(ip_summed);
+@@ -423,11 +434,15 @@
+ new->priority=old->priority;
+ new->protocol=old->protocol;
+ new->dst=dst_clone(old->dst);
++#ifdef CONFIG_INET
++ new->sp=secpath_get(old->sp);
++#endif
+ new->h.raw=old->h.raw+offset;
+ new->nh.raw=old->nh.raw+offset;
+ new->mac.raw=old->mac.raw+offset;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ atomic_set(&new->users, 1);
++ new->local_df=old->local_df;
+ new->pkt_type=old->pkt_type;
+ new->stamp=old->stamp;
+ new->destructor = NULL;
+diff -Nru a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
+--- a/net/decnet/dn_nsp_out.c 2005-02-13 21:25:10 +11:00
++++ b/net/decnet/dn_nsp_out.c 2005-02-13 21:25:10 +11:00
+@@ -593,7 +593,7 @@
+ * associations.
+ */
+ skb->dst = dst_clone(dst);
+- skb->dst->output(skb);
++ dst_output(skb);
+ }
+
+
+diff -Nru a/net/decnet/dn_route.c b/net/decnet/dn_route.c
+--- a/net/decnet/dn_route.c 2005-02-13 21:25:10 +11:00
++++ b/net/decnet/dn_route.c 2005-02-13 21:25:10 +11:00
+@@ -100,7 +100,6 @@
+
+ static int dn_dst_gc(void);
+ static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
+-static struct dst_entry *dn_dst_reroute(struct dst_entry *, struct sk_buff *skb);
+ static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
+ static void dn_dst_link_failure(struct sk_buff *);
+ static int dn_route_input(struct sk_buff *);
+@@ -119,7 +118,6 @@
+ gc_thresh: 128,
+ gc: dn_dst_gc,
+ check: dn_dst_check,
+- reroute: dn_dst_reroute,
+ negative_advice: dn_dst_negative_advice,
+ link_failure: dn_dst_link_failure,
+ entry_size: sizeof(struct dn_route),
+@@ -202,12 +200,6 @@
+ return NULL;
+ }
+
+-static struct dst_entry *dn_dst_reroute(struct dst_entry *dst,
+- struct sk_buff *skb)
+-{
+- return NULL;
+-}
+-
+ /*
+ * This is called through sendmsg() when you specify MSG_TRYHARD
+ * and there is already a route in cache.
+@@ -396,7 +388,7 @@
+ int err;
+
+ if ((err = dn_route_input(skb)) == 0)
+- return skb->dst->input(skb);
++ return dst_input(skb);
+
+ if (decnet_debug_level & 4) {
+ char *devname = skb->dev ? skb->dev->name : "???";
+@@ -1049,10 +1041,12 @@
+ RTA_PUT(skb, RTA_SRC, 2, &rt->rt_saddr);
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+- if (rt->u.dst.window)
+- RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+- if (rt->u.dst.rtt)
+- RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
++ if (dst_metric(&rt->u.dst, RTAX_WINDOW))
++ RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned),
++ &rt->u.dst.metrics[RTAX_WINDOW - 1]);
++ if (dst_metric(&rt->u.dst, RTAX_RTT))
++ RTA_PUT(skb, RTAX_RTT, sizeof(unsigned),
++ &rt->u.dst.metrics[RTAX_RTT - 1]);
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+@@ -1208,7 +1202,7 @@
+ dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2),
+ atomic_read(&rt->u.dst.__refcnt),
+ rt->u.dst.__use,
+- (int)rt->u.dst.rtt
++ (int) dst_metric(&rt->u.dst, RTAX_RTT)
+ );
+
+
+diff -Nru a/net/ipv4/Config.in b/net/ipv4/Config.in
+--- a/net/ipv4/Config.in 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/Config.in 2005-02-13 21:25:09 +11:00
+@@ -40,6 +40,18 @@
+ fi
+ bool ' IP: TCP Explicit Congestion Notification support' CONFIG_INET_ECN
+ bool ' IP: TCP syncookie support (disabled per default)' CONFIG_SYN_COOKIES
++tristate ' IP: AH transformation' CONFIG_INET_AH
++tristate ' IP: ESP transformation' CONFIG_INET_ESP
++tristate ' IP: IPComp transformation' CONFIG_INET_IPCOMP
++if [ "$CONFIG_INET_IPIP" = "y" -o "$CONFIG_INET_IPCOMP" = "y" ]; then
++ define_tristate CONFIG_INET_TUNNEL y
++else
++ if [ "$CONFIG_INET_IPIP" = "m" -o "$CONFIG_INET_IPCOMP" = "m" ]; then
++ define_tristate CONFIG_INET_TUNNEL m
++ else
++ tristate ' IP: tunnel transformation' CONFIG_INET_TUNNEL
++ fi
++fi
+ if [ "$CONFIG_NETFILTER" != "n" ]; then
+ source net/ipv4/netfilter/Config.in
+ fi
+diff -Nru a/net/ipv4/Makefile b/net/ipv4/Makefile
+--- a/net/ipv4/Makefile 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/Makefile 2005-02-13 21:25:09 +11:00
+@@ -9,7 +9,7 @@
+
+ O_TARGET := ipv4.o
+
+-export-objs = ipip.o ip_gre.o
++export-objs = ipip.o ip_gre.o xfrm4_input.o xfrm4_tunnel.o
+
+ obj-y := utils.o route.o inetpeer.o proc.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+@@ -24,6 +24,13 @@
+ obj-$(CONFIG_NET_IPIP) += ipip.o
+ obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+ obj-$(CONFIG_SYN_COOKIES) += syncookies.o
++obj-$(CONFIG_INET_AH) += ah4.o
++obj-$(CONFIG_INET_ESP) += esp4.o
++obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
++obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o
+ obj-$(CONFIG_IP_PNP) += ipconfig.o
++
++obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
++ xfrm4_output.o
+
+ include $(TOPDIR)/Rules.make
+diff -Nru a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+--- a/net/ipv4/af_inet.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/af_inet.c 2005-02-13 21:25:09 +11:00
+@@ -89,6 +89,7 @@
+
+ #include <linux/smp_lock.h>
+ #include <linux/inet.h>
++#include <linux/igmp.h>
+ #include <linux/netdevice.h>
+ #include <linux/brlock.h>
+ #include <net/ip.h>
+@@ -103,6 +104,7 @@
+ #include <net/icmp.h>
+ #include <net/ipip.h>
+ #include <net/inet_common.h>
++#include <net/xfrm.h>
+ #ifdef CONFIG_IP_MROUTE
+ #include <linux/mroute.h>
+ #endif
+@@ -213,6 +215,8 @@
+
+ sock_orphan(sk);
+
++ xfrm_sk_free_policy(sk);
++
+ #ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&sk->refcnt) != 1) {
+ printk(KERN_DEBUG "Destruction inet %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
+@@ -386,7 +390,7 @@
+
+ sk->backlog_rcv = sk->prot->backlog_rcv;
+
+- sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
++ sk->protinfo.af_inet.uc_ttl = -1;
+
+ sk->protinfo.af_inet.mc_loop = 1;
+ sk->protinfo.af_inet.mc_ttl = 1;
+@@ -698,6 +702,27 @@
+ return err;
+ }
+
++#ifdef CONFIG_IP_MULTICAST
++static struct inet_protocol igmp_protocol = {
++ .handler = igmp_rcv,
++};
++#endif
++
++static struct inet_protocol tcp_protocol = {
++ .handler = tcp_v4_rcv,
++ .err_handler = tcp_v4_err,
++ .no_policy = 1,
++};
++
++static struct inet_protocol udp_protocol = {
++ .handler = udp_rcv,
++ .err_handler = udp_err,
++ .no_policy = 1,
++};
++
++static struct inet_protocol icmp_protocol = {
++ .handler = icmp_rcv,
++};
+
+ /*
+ * This does both peername and sockname.
+@@ -724,6 +749,7 @@
+ sin->sin_port = sk->sport;
+ sin->sin_addr.s_addr = addr;
+ }
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return(0);
+ }
+@@ -757,6 +783,21 @@
+ return sk->prot->sendmsg(sk, msg, size);
+ }
+
++
++ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
++{
++ struct sock *sk = sock->sk;
++
++ /* We may need to bind the socket. */
++ if (!sk->num && inet_autobind(sk))
++ return -EAGAIN;
++
++ if (sk->prot->sendpage)
++ return sk->prot->sendpage(sk, page, offset, size, flags);
++ return sock_no_sendpage(sock, page, offset, size, flags);
++}
++
++
+ int inet_shutdown(struct socket *sock, int how)
+ {
+ struct sock *sk = sock->sk;
+@@ -1002,7 +1043,7 @@
+ sendmsg: inet_sendmsg,
+ recvmsg: inet_recvmsg,
+ mmap: sock_no_mmap,
+- sendpage: sock_no_sendpage,
++ sendpage: inet_sendpage,
+ };
+
+ struct net_proto_family inet_family_ops = {
+@@ -1130,7 +1171,6 @@
+ static int __init inet_init(void)
+ {
+ struct sk_buff *dummy_skb;
+- struct inet_protocol *p;
+ struct inet_protosw *q;
+ struct list_head *r;
+
+@@ -1148,16 +1188,19 @@
+ (void) sock_register(&inet_family_ops);
+
+ /*
+- * Add all the protocols.
++ * Add all the base protocols.
+ */
+
+- printk(KERN_INFO "IP Protocols: ");
+- for (p = inet_protocol_base; p != NULL;) {
+- struct inet_protocol *tmp = (struct inet_protocol *) p->next;
+- inet_add_protocol(p);
+- printk("%s%s",p->name,tmp?", ":"\n");
+- p = tmp;
+- }
++ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
++ printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
++ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
++ printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
++ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
++ printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
++#ifdef CONFIG_IP_MULTICAST
++ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
++ printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
++#endif
+
+ /* Register the socket-side information for inet_create. */
+ for(r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+diff -Nru a/net/ipv4/ah4.c b/net/ipv4/ah4.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/ah4.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,337 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/ah.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <net/icmp.h>
++#include <asm/scatterlist.h>
++
++
++/* Clear mutable options and find final destination to substitute
++ * into IP header for icv calculation. Options are already checked
++ * for validity, so paranoia is not required. */
++
++static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
++{
++ unsigned char * optptr = (unsigned char*)(iph+1);
++ int l = iph->ihl*4 - sizeof(struct iphdr);
++ int optlen;
++
++ while (l > 0) {
++ switch (*optptr) {
++ case IPOPT_END:
++ return 0;
++ case IPOPT_NOOP:
++ l--;
++ optptr++;
++ continue;
++ }
++ optlen = optptr[1];
++ if (optlen<2 || optlen>l)
++ return -EINVAL;
++ switch (*optptr) {
++ case IPOPT_SEC:
++ case 0x85: /* Some "Extended Security" crap. */
++ case 0x86: /* Another "Commercial Security" crap. */
++ case IPOPT_RA:
++ case 0x80|21: /* RFC1770 */
++ break;
++ case IPOPT_LSRR:
++ case IPOPT_SSRR:
++ if (optlen < 6)
++ return -EINVAL;
++ memcpy(daddr, optptr+optlen-4, 4);
++ /* Fall through */
++ default:
++ memset(optptr+2, 0, optlen-2);
++ }
++ l -= optlen;
++ optptr += optlen;
++ }
++ return 0;
++}
++
++static int ah_output(struct sk_buff *skb)
++{
++ int err;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct iphdr *iph, *top_iph;
++ struct ip_auth_hdr *ah;
++ struct ah_data *ahp;
++ union {
++ struct iphdr iph;
++ char buf[60];
++ } tmp_iph;
++
++ top_iph = skb->nh.iph;
++ iph = &tmp_iph.iph;
++
++ iph->tos = top_iph->tos;
++ iph->ttl = top_iph->ttl;
++ iph->frag_off = top_iph->frag_off;
++
++ if (top_iph->ihl != 5) {
++ iph->daddr = top_iph->daddr;
++ memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
++ err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
++ if (err)
++ goto error;
++ }
++
++ ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
++ ah->nexthdr = top_iph->protocol;
++
++ top_iph->tos = 0;
++ top_iph->tot_len = htons(skb->len);
++ top_iph->frag_off = 0;
++ top_iph->ttl = 0;
++ top_iph->protocol = IPPROTO_AH;
++ top_iph->check = 0;
++
++ ahp = x->data;
++ ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
++ ahp->icv_trunc_len) >> 2) - 2;
++
++ ah->reserved = 0;
++ ah->spi = x->id.spi;
++ ah->seq_no = htonl(++x->replay.oseq);
++ ahp->icv(ahp, skb, ah->auth_data);
++
++ top_iph->tos = iph->tos;
++ top_iph->ttl = iph->ttl;
++ top_iph->frag_off = iph->frag_off;
++ if (top_iph->ihl != 5) {
++ top_iph->daddr = iph->daddr;
++ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
++ }
++
++ ip_send_check(top_iph);
++
++ err = 0;
++
++error:
++ return err;
++}
++
++static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ int ah_hlen;
++ struct iphdr *iph;
++ struct ip_auth_hdr *ah;
++ struct ah_data *ahp;
++ char work_buf[60];
++
++ if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
++ goto out;
++
++ ah = (struct ip_auth_hdr*)skb->data;
++ ahp = x->data;
++ ah_hlen = (ah->hdrlen + 2) << 2;
++
++ if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
++ ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
++ goto out;
++
++ if (!pskb_may_pull(skb, ah_hlen))
++ goto out;
++
++ /* We are going to _remove_ AH header to keep sockets happy,
++ * so... Later this can change. */
++ if (skb_cloned(skb) &&
++ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ goto out;
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ ah = (struct ip_auth_hdr*)skb->data;
++ iph = skb->nh.iph;
++
++ memcpy(work_buf, iph, iph->ihl*4);
++
++ iph->ttl = 0;
++ iph->tos = 0;
++ iph->frag_off = 0;
++ iph->check = 0;
++ if (iph->ihl != 5) {
++ u32 dummy;
++ if (ip_clear_mutable_options(iph, &dummy))
++ goto out;
++ }
++ {
++ u8 auth_data[MAX_AH_AUTH_LEN];
++
++ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
++ skb_push(skb, skb->data - skb->nh.raw);
++ ahp->icv(ahp, skb, ah->auth_data);
++ if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
++ x->stats.integrity_failed++;
++ goto out;
++ }
++ }
++ ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
++ skb->nh.raw = skb_pull(skb, ah_hlen);
++ memcpy(skb->nh.raw, work_buf, iph->ihl*4);
++ skb->nh.iph->tot_len = htons(skb->len);
++ skb_pull(skb, skb->nh.iph->ihl*4);
++ skb->h.raw = skb->data;
++
++ return 0;
++
++out:
++ return -EINVAL;
++}
++
++static void ah4_err(struct sk_buff *skb, u32 info)
++{
++ struct iphdr *iph = (struct iphdr*)skb->data;
++ struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
++ struct xfrm_state *x;
++
++ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
++ skb->h.icmph->code != ICMP_FRAG_NEEDED)
++ return;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
++ if (!x)
++ return;
++ printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
++ ntohl(ah->spi), ntohl(iph->daddr));
++ xfrm_state_put(x);
++}
++
++static int ah_init_state(struct xfrm_state *x, void *args)
++{
++ struct ah_data *ahp = NULL;
++ struct xfrm_algo_desc *aalg_desc;
++
++ if (!x->aalg)
++ goto error;
++
++ /* null auth can use a zero length key */
++ if (x->aalg->alg_key_len > 512)
++ goto error;
++
++ if (x->encap)
++ goto error;
++
++ ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
++ if (ahp == NULL)
++ return -ENOMEM;
++
++ memset(ahp, 0, sizeof(*ahp));
++
++ ahp->key = x->aalg->alg_key;
++ ahp->key_len = (x->aalg->alg_key_len+7)/8;
++ ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
++ if (!ahp->tfm)
++ goto error;
++ ahp->icv = ah_hmac_digest;
++
++ /*
++ * Lookup the algorithm description maintained by xfrm_algo,
++ * verify crypto transform properties, and store information
++ * we need for AH processing. This lookup cannot fail here
++ * after a successful crypto_alloc_tfm().
++ */
++ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name);
++ BUG_ON(!aalg_desc);
++
++ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
++ crypto_tfm_alg_digestsize(ahp->tfm)) {
++ printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
++ x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
++ aalg_desc->uinfo.auth.icv_fullbits/8);
++ goto error;
++ }
++
++ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
++ ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
++
++ BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
++
++ ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
++ if (!ahp->work_icv)
++ goto error;
++
++ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct iphdr);
++ x->data = ahp;
++
++ return 0;
++
++error:
++ if (ahp) {
++ if (ahp->work_icv)
++ kfree(ahp->work_icv);
++ if (ahp->tfm)
++ crypto_free_tfm(ahp->tfm);
++ kfree(ahp);
++ }
++ return -EINVAL;
++}
++
++static void ah_destroy(struct xfrm_state *x)
++{
++ struct ah_data *ahp = x->data;
++
++ if (!ahp)
++ return;
++
++ if (ahp->work_icv) {
++ kfree(ahp->work_icv);
++ ahp->work_icv = NULL;
++ }
++ if (ahp->tfm) {
++ crypto_free_tfm(ahp->tfm);
++ ahp->tfm = NULL;
++ }
++ kfree(ahp);
++}
++
++
++static struct xfrm_type ah_type =
++{
++ .description = "AH4",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_AH,
++ .init_state = ah_init_state,
++ .destructor = ah_destroy,
++ .input = ah_input,
++ .output = ah_output
++};
++
++static struct inet_protocol ah4_protocol = {
++ .handler = xfrm4_rcv,
++ .err_handler = ah4_err,
++ .no_policy = 1,
++};
++
++static int __init ah4_init(void)
++{
++ if (xfrm_register_type(&ah_type, AF_INET) < 0) {
++ printk(KERN_INFO "ip ah init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
++ printk(KERN_INFO "ip ah init: can't add protocol\n");
++ xfrm_unregister_type(&ah_type, AF_INET);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++static void __exit ah4_fini(void)
++{
++ if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
++ printk(KERN_INFO "ip ah close: can't remove protocol\n");
++ if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
++ printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
++}
++
++module_init(ah4_init);
++module_exit(ah4_fini);
++MODULE_LICENSE("GPL");
+diff -Nru a/net/ipv4/arp.c b/net/ipv4/arp.c
+--- a/net/ipv4/arp.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/arp.c 2005-02-13 21:25:09 +11:00
+@@ -409,11 +409,13 @@
+
+ static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
+ {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
++ .saddr = tip } } };
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+
+- if (ip_route_output(&rt, sip, tip, 0, 0) < 0)
++ if (ip_route_output_key(&rt, &fl) < 0)
+ return 1;
+ if (rt->u.dst.dev != dev) {
+ NET_INC_STATS_BH(ArpFilter);
+@@ -559,11 +561,11 @@
+ */
+
+ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
+- + dev->hard_header_len + 15, GFP_ATOMIC);
++ + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+- skb_reserve(skb, (dev->hard_header_len+15)&~15);
++ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb->nh.raw = skb->data;
+ arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+ skb->dev = dev;
+@@ -1012,8 +1014,10 @@
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (dev == NULL) {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
++ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+- if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
++ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+@@ -1113,8 +1117,10 @@
+ }
+
+ if (dev == NULL) {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
++ .tos = RTO_ONLINK } } };
+ struct rtable * rt;
+- if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
++ if ((err = ip_route_output_key(&rt, &fl)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+diff -Nru a/net/ipv4/devinet.c b/net/ipv4/devinet.c
+--- a/net/ipv4/devinet.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/devinet.c 2005-02-13 21:25:09 +11:00
+@@ -180,7 +180,9 @@
+ /* in_dev_put following below will kill the in_device */
+ write_unlock_bh(&inetdev_lock);
+
+-
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_unregister(in_dev->arp_parms);
++#endif
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ in_dev_put(in_dev);
+ }
+@@ -942,6 +944,8 @@
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ inet_insert_ifa(ifa);
+ }
++ in_dev->cnf.no_xfrm = 1;
++ in_dev->cnf.no_policy = 1;
+ }
+ ip_mc_up(in_dev);
+ break;
+@@ -1148,6 +1152,62 @@
+ return ret;
+ }
+
++int ipv4_doint_and_flush(ctl_table *ctl, int write,
++ struct file* filp, void *buffer,
++ size_t *lenp)
++{
++ int *valp = ctl->data;
++ int val = *valp;
++ int ret = proc_dointvec(ctl, write, filp, buffer, lenp);
++
++ if (write && *valp != val)
++ rt_cache_flush(0);
++
++ return ret;
++}
++
++int ipv4_doint_and_flush_strategy(ctl_table *table, int *name, int nlen,
++ void *oldval, size_t *oldlenp,
++ void *newval, size_t newlen,
++ void **context)
++{
++ int *valp = table->data;
++ int new;
++
++ if (!newval || !newlen)
++ return 0;
++
++ if (newlen != sizeof(int))
++ return -EINVAL;
++
++ if (get_user(new, (int *)newval))
++ return -EFAULT;
++
++ if (new == *valp)
++ return 0;
++
++ if (oldval && oldlenp) {
++ size_t len;
++
++ if (get_user(len, oldlenp))
++ return -EFAULT;
++
++ if (len) {
++ if (len > table->maxlen)
++ len = table->maxlen;
++ if (copy_to_user(oldval, valp, len))
++ return -EFAULT;
++ if (put_user(len, oldlenp))
++ return -EFAULT;
++ }
++ }
++
++ *valp = new;
++ rt_cache_flush(0);
++ return 1;
++}
++
++
+ static struct devinet_sysctl_table
+ {
+ struct ctl_table_header *sysctl_header;
+@@ -1206,6 +1266,12 @@
+ {NET_IPV4_CONF_ARP_IGNORE, "arp_ignore",
+ &ipv4_devconf.arp_ignore, sizeof(int), 0644, NULL,
+ &proc_dointvec},
++ {NET_IPV4_CONF_NOXFRM, "disable_xfrm",
++ &ipv4_devconf.no_xfrm, sizeof(int), 0644, NULL,
++ &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy,},
++ {NET_IPV4_CONF_NOPOLICY, "disable_policy",
++ &ipv4_devconf.no_policy, sizeof(int), 0644, NULL,
++ &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy},
+ {NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version",
+ &ipv4_devconf.force_igmp_version, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+diff -Nru a/net/ipv4/esp4.c b/net/ipv4/esp4.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/esp4.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,511 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/esp.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <linux/random.h>
++#include <net/icmp.h>
++#include <net/udp.h>
++
++/* decapsulation data for use when post-processing */
++struct esp_decap_data {
++ xfrm_address_t saddr;
++ __u16 sport;
++ __u8 proto;
++};
++
++static int esp_output(struct sk_buff *skb)
++{
++ int err;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct iphdr *top_iph;
++ struct ip_esp_hdr *esph;
++ struct crypto_tfm *tfm;
++ struct esp_data *esp;
++ struct sk_buff *trailer;
++ int blksize;
++ int clen;
++ int alen;
++ int nfrags;
++
++ /* Strip IP+ESP header. */
++ __skb_pull(skb, skb->h.raw - skb->data);
++ /* Now skb is pure payload to encrypt */
++
++ err = -ENOMEM;
++
++ /* Round to block size */
++ clen = skb->len;
++
++ esp = x->data;
++ alen = esp->auth.icv_trunc_len;
++ tfm = esp->conf.tfm;
++ blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
++ clen = (clen + 2 + blksize-1)&~(blksize-1);
++ if (esp->conf.padlen)
++ clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
++
++ if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
++ goto error;
++
++ /* Fill padding... */
++ do {
++ int i;
++ for (i=0; i<clen-skb->len - 2; i++)
++ *(u8*)(trailer->tail + i) = i+1;
++ } while (0);
++ *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
++ pskb_put(skb, trailer, clen - skb->len);
++
++ __skb_push(skb, skb->data - skb->nh.raw);
++ top_iph = skb->nh.iph;
++ esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
++ top_iph->tot_len = htons(skb->len + alen);
++ *(u8*)(trailer->tail - 1) = top_iph->protocol;
++
++ /* this is non-NULL only with UDP Encapsulation */
++ if (x->encap) {
++ struct xfrm_encap_tmpl *encap = x->encap;
++ struct udphdr *uh;
++ u32 *udpdata32;
++
++ uh = (struct udphdr *)esph;
++ uh->source = encap->encap_sport;
++ uh->dest = encap->encap_dport;
++ uh->len = htons(skb->len + alen - top_iph->ihl*4);
++ uh->check = 0;
++
++ switch (encap->encap_type) {
++ default:
++ case UDP_ENCAP_ESPINUDP:
++ esph = (struct ip_esp_hdr *)(uh + 1);
++ break;
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ udpdata32 = (u32 *)(uh + 1);
++ udpdata32[0] = udpdata32[1] = 0;
++ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
++ break;
++ }
++
++ top_iph->protocol = IPPROTO_UDP;
++ } else
++ top_iph->protocol = IPPROTO_ESP;
++
++ esph->spi = x->id.spi;
++ esph->seq_no = htonl(++x->replay.oseq);
++
++ if (esp->conf.ivlen)
++ crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++
++ do {
++ struct scatterlist *sg = &esp->sgbuf[0];
++
++ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
++ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
++ if (!sg)
++ goto error;
++ }
++ skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
++ crypto_cipher_encrypt(tfm, sg, sg, clen);
++ if (unlikely(sg != &esp->sgbuf[0]))
++ kfree(sg);
++ } while (0);
++
++ if (esp->conf.ivlen) {
++ memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++ crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++ }
++
++ if (esp->auth.icv_full_len) {
++ esp->auth.icv(esp, skb, (u8*)esph-skb->data,
++ sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
++ pskb_put(skb, trailer, alen);
++ }
++
++ ip_send_check(top_iph);
++
++ err = 0;
++
++error:
++ return err;
++}
++
++/*
++ * Note: detecting truncated vs. non-truncated authentication data is very
++ * expensive, so we only support truncated data, which is the recommended
++ * and common case.
++ */
++static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ struct iphdr *iph;
++ struct ip_esp_hdr *esph;
++ struct esp_data *esp = x->data;
++ struct sk_buff *trailer;
++ int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
++ int alen = esp->auth.icv_trunc_len;
++ int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
++ int nfrags;
++ int encap_len = 0;
++
++ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
++ goto out;
++
++ if (elen <= 0 || (elen & (blksize-1)))
++ goto out;
++
++ /* If integrity check is required, do this. */
++ if (esp->auth.icv_full_len) {
++ u8 sum[esp->auth.icv_full_len];
++ u8 sum1[alen];
++
++ esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
++
++ if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
++ BUG();
++
++ if (unlikely(memcmp(sum, sum1, alen))) {
++ x->stats.integrity_failed++;
++ goto out;
++ }
++ }
++
++ if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
++ goto out;
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ esph = (struct ip_esp_hdr*)skb->data;
++ iph = skb->nh.iph;
++
++ /* Get ivec. This can be wrong, check against another impls. */
++ if (esp->conf.ivlen)
++ crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
++
++ {
++ u8 nexthdr[2];
++ struct scatterlist *sg = &esp->sgbuf[0];
++ u8 workbuf[60];
++ int padlen;
++
++ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
++ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
++ if (!sg)
++ goto out;
++ }
++ skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
++ crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
++ if (unlikely(sg != &esp->sgbuf[0]))
++ kfree(sg);
++
++ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
++ BUG();
++
++ padlen = nexthdr[0];
++ if (padlen+2 >= elen)
++ goto out;
++
++ /* ... check padding bits here. Silly. :-) */
++
++ if (x->encap && decap && decap->decap_type) {
++ struct esp_decap_data *encap_data;
++ struct udphdr *uh = (struct udphdr *) (iph+1);
++
++ encap_data = (struct esp_decap_data *) (decap->decap_data);
++ encap_data->proto = 0;
++
++ switch (decap->decap_type) {
++ case UDP_ENCAP_ESPINUDP:
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ encap_data->proto = AF_INET;
++ encap_data->saddr.a4 = iph->saddr;
++ encap_data->sport = uh->source;
++ encap_len = (void*)esph - (void*)uh;
++ break;
++
++ default:
++ goto out;
++ }
++ }
++
++ iph->protocol = nexthdr[1];
++ pskb_trim(skb, skb->len - alen - padlen - 2);
++ memcpy(workbuf, skb->nh.raw, iph->ihl*4);
++ skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
++ skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
++ memcpy(skb->nh.raw, workbuf, iph->ihl*4);
++ skb->nh.iph->tot_len = htons(skb->len);
++ }
++
++ return 0;
++
++out:
++ return -EINVAL;
++}
++
++static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++
++ if (x->encap) {
++ struct xfrm_encap_tmpl *encap;
++ struct esp_decap_data *decap_data;
++
++ encap = x->encap;
++ decap_data = (struct esp_decap_data *)(decap->decap_data);
++
++ /* first, make sure that the decap type == the encap type */
++ if (encap->encap_type != decap->decap_type)
++ return -EINVAL;
++
++ switch (encap->encap_type) {
++ default:
++ case UDP_ENCAP_ESPINUDP:
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ /*
++ * 1) if the NAT-T peer's IP or port changed then
++ * advertize the change to the keying daemon.
++ * This is an inbound SA, so just compare
++ * SRC ports.
++ */
++ if (decap_data->proto == AF_INET &&
++ (decap_data->saddr.a4 != x->props.saddr.a4 ||
++ decap_data->sport != encap->encap_sport)) {
++ xfrm_address_t ipaddr;
++
++ ipaddr.a4 = decap_data->saddr.a4;
++ km_new_mapping(x, &ipaddr, decap_data->sport);
++
++ /* XXX: perhaps add an extra
++ * policy check here, to see
++ * if we should allow or
++ * reject a packet from a
++ * different source
++ * address/port.
++ */
++ }
++
++ /*
++ * 2) ignore UDP/TCP checksums in case
++ * of NAT-T in Transport Mode, or
++ * perform other post-processing fixes
++ * as per * draft-ietf-ipsec-udp-encaps-06,
++ * section 3.1.2
++ */
++ if (!x->props.mode)
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++
++ break;
++ }
++ }
++ return 0;
++}
++
++static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
++{
++ struct esp_data *esp = x->data;
++ u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
++
++ if (x->props.mode) {
++ mtu = (mtu + 2 + blksize-1)&~(blksize-1);
++ } else {
++ /* The worst case. */
++ mtu += 2 + blksize;
++ }
++ if (esp->conf.padlen)
++ mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
++
++ return mtu + x->props.header_len + esp->auth.icv_trunc_len;
++}
++
++static void esp4_err(struct sk_buff *skb, u32 info)
++{
++ struct iphdr *iph = (struct iphdr*)skb->data;
++ struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
++ struct xfrm_state *x;
++
++ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
++ skb->h.icmph->code != ICMP_FRAG_NEEDED)
++ return;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
++ if (!x)
++ return;
++ printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
++ ntohl(esph->spi), ntohl(iph->daddr));
++ xfrm_state_put(x);
++}
++
++static void esp_destroy(struct xfrm_state *x)
++{
++ struct esp_data *esp = x->data;
++
++ if (!esp)
++ return;
++
++ if (esp->conf.tfm) {
++ crypto_free_tfm(esp->conf.tfm);
++ esp->conf.tfm = NULL;
++ }
++ if (esp->conf.ivec) {
++ kfree(esp->conf.ivec);
++ esp->conf.ivec = NULL;
++ }
++ if (esp->auth.tfm) {
++ crypto_free_tfm(esp->auth.tfm);
++ esp->auth.tfm = NULL;
++ }
++ if (esp->auth.work_icv) {
++ kfree(esp->auth.work_icv);
++ esp->auth.work_icv = NULL;
++ }
++ kfree(esp);
++}
++
++static int esp_init_state(struct xfrm_state *x, void *args)
++{
++ struct esp_data *esp = NULL;
++
++ /* null auth and encryption can have zero length keys */
++ if (x->aalg) {
++ if (x->aalg->alg_key_len > 512)
++ goto error;
++ }
++ if (x->ealg == NULL)
++ goto error;
++
++ esp = kmalloc(sizeof(*esp), GFP_KERNEL);
++ if (esp == NULL)
++ return -ENOMEM;
++
++ memset(esp, 0, sizeof(*esp));
++
++ if (x->aalg) {
++ struct xfrm_algo_desc *aalg_desc;
++
++ esp->auth.key = x->aalg->alg_key;
++ esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
++ esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
++ if (esp->auth.tfm == NULL)
++ goto error;
++ esp->auth.icv = esp_hmac_digest;
++
++ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name);
++ BUG_ON(!aalg_desc);
++
++ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
++ crypto_tfm_alg_digestsize(esp->auth.tfm)) {
++ printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
++ x->aalg->alg_name,
++ crypto_tfm_alg_digestsize(esp->auth.tfm),
++ aalg_desc->uinfo.auth.icv_fullbits/8);
++ goto error;
++ }
++
++ esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
++ esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
++
++ esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
++ if (!esp->auth.work_icv)
++ goto error;
++ }
++ esp->conf.key = x->ealg->alg_key;
++ esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
++ if (x->props.ealgo == SADB_EALG_NULL)
++ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
++ else
++ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
++ if (esp->conf.tfm == NULL)
++ goto error;
++ esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
++ esp->conf.padlen = 0;
++ if (esp->conf.ivlen) {
++ esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
++ if (unlikely(esp->conf.ivec == NULL))
++ goto error;
++ get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
++ }
++ crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len);
++ x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct iphdr);
++ if (x->encap) {
++ struct xfrm_encap_tmpl *encap = x->encap;
++
++ switch (encap->encap_type) {
++ default:
++ goto error;
++ case UDP_ENCAP_ESPINUDP:
++ x->props.header_len += sizeof(struct udphdr);
++ break;
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
++ break;
++ }
++ }
++ x->data = esp;
++ x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
++ return 0;
++
++error:
++ x->data = esp;
++ esp_destroy(x);
++ x->data = NULL;
++ return -EINVAL;
++}
++
++static struct xfrm_type esp_type =
++{
++ .description = "ESP4",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_ESP,
++ .init_state = esp_init_state,
++ .destructor = esp_destroy,
++ .get_max_size = esp4_get_max_size,
++ .input = esp_input,
++ .post_input = esp_post_input,
++ .output = esp_output
++};
++
++static struct inet_protocol esp4_protocol = {
++ .handler = xfrm4_rcv,
++ .err_handler = esp4_err,
++ .no_policy = 1,
++};
++
++static int __init esp4_init(void)
++{
++ struct xfrm_decap_state decap;
++
++ if (sizeof(struct esp_decap_data) <
++ sizeof(decap.decap_data)) {
++ extern void decap_data_too_small(void);
++
++ decap_data_too_small();
++ }
++
++ if (xfrm_register_type(&esp_type, AF_INET) < 0) {
++ printk(KERN_INFO "ip esp init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
++ printk(KERN_INFO "ip esp init: can't add protocol\n");
++ xfrm_unregister_type(&esp_type, AF_INET);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++static void __exit esp4_fini(void)
++{
++ if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
++ printk(KERN_INFO "ip esp close: can't remove protocol\n");
++ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
++ printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
++}
++
++module_init(esp4_init);
++module_exit(esp4_fini);
++MODULE_LICENSE("GPL");
+diff -Nru a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
+--- a/net/ipv4/fib_frontend.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/fib_frontend.c 2005-02-13 21:25:10 +11:00
+@@ -144,17 +144,15 @@
+
+ struct net_device * ip_dev_find(u32 addr)
+ {
+- struct rt_key key;
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ struct net_device *dev = NULL;
+
+- memset(&key, 0, sizeof(key));
+- key.dst = addr;
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+ #endif
+
+- if (!local_table || local_table->tb_lookup(local_table, &key, &res)) {
++ if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) {
+ return NULL;
+ }
+ if (res.type != RTN_LOCAL)
+@@ -170,7 +168,7 @@
+
+ unsigned inet_addr_type(u32 addr)
+ {
+- struct rt_key key;
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+ struct fib_result res;
+ unsigned ret = RTN_BROADCAST;
+
+@@ -179,15 +177,13 @@
+ if (MULTICAST(addr))
+ return RTN_MULTICAST;
+
+- memset(&key, 0, sizeof(key));
+- key.dst = addr;
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+ #endif
+
+ if (local_table) {
+ ret = RTN_UNICAST;
+- if (local_table->tb_lookup(local_table, &key, &res) == 0) {
++ if (local_table->tb_lookup(local_table, &fl, &res) == 0) {
+ ret = res.type;
+ fib_res_put(&res);
+ }
+@@ -207,18 +203,15 @@
+ struct net_device *dev, u32 *spec_dst, u32 *itag)
+ {
+ struct in_device *in_dev;
+- struct rt_key key;
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = src,
++ .saddr = dst,
++ .tos = tos } },
++ .iif = oif };
+ struct fib_result res;
+ int no_addr, rpf;
+ int ret;
+
+- key.dst = src;
+- key.src = dst;
+- key.tos = tos;
+- key.oif = 0;
+- key.iif = oif;
+- key.scope = RT_SCOPE_UNIVERSE;
+-
+ no_addr = rpf = 0;
+ read_lock(&inetdev_lock);
+ in_dev = __in_dev_get(dev);
+@@ -231,7 +224,7 @@
+ if (in_dev == NULL)
+ goto e_inval;
+
+- if (fib_lookup(&key, &res))
++ if (fib_lookup(&fl, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST)
+ goto e_inval_res;
+@@ -252,10 +245,10 @@
+ goto last_resort;
+ if (rpf)
+ goto e_inval;
+- key.oif = dev->ifindex;
++ fl.oif = dev->ifindex;
+
+ ret = 0;
+- if (fib_lookup(&key, &res) == 0) {
++ if (fib_lookup(&fl, &res) == 0) {
+ if (res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(res);
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+diff -Nru a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
+--- a/net/ipv4/fib_hash.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/fib_hash.c 2005-02-13 21:25:09 +11:00
+@@ -290,7 +290,7 @@
+ }
+
+ static int
+-fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
++fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+ {
+ int err;
+ struct fn_zone *fz;
+@@ -299,7 +299,7 @@
+ read_lock(&fib_hash_lock);
+ for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+ struct fib_node *f;
+- fn_key_t k = fz_key(key->dst, fz);
++ fn_key_t k = fz_key(flp->fl4_dst, fz);
+
+ for (f = fz_chain(k, fz); f; f = f->fn_next) {
+ if (!fn_key_eq(k, f->fn_key)) {
+@@ -309,17 +309,17 @@
+ continue;
+ }
+ #ifdef CONFIG_IP_ROUTE_TOS
+- if (f->fn_tos && f->fn_tos != key->tos)
++ if (f->fn_tos && f->fn_tos != flp->fl4_tos)
+ continue;
+ #endif
+ f->fn_state |= FN_S_ACCESSED;
+
+ if (f->fn_state&FN_S_ZOMBIE)
+ continue;
+- if (f->fn_scope < key->scope)
++ if (f->fn_scope < flp->fl4_scope)
+ continue;
+
+- err = fib_semantic_match(f->fn_type, FIB_INFO(f), key, res);
++ err = fib_semantic_match(f->fn_type, FIB_INFO(f), flp, res);
+ if (err == 0) {
+ res->type = f->fn_type;
+ res->scope = f->fn_scope;
+@@ -362,7 +362,7 @@
+ }
+
+ static void
+-fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res)
++fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+ {
+ int order, last_idx;
+ struct fib_node *f;
+diff -Nru a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
+--- a/net/ipv4/fib_rules.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/fib_rules.c 2005-02-13 21:25:09 +11:00
+@@ -307,28 +307,28 @@
+ }
+ }
+
+-int fib_lookup(const struct rt_key *key, struct fib_result *res)
++int fib_lookup(const struct flowi *flp, struct fib_result *res)
+ {
+ int err;
+ struct fib_rule *r, *policy;
+ struct fib_table *tb;
+
+- u32 daddr = key->dst;
+- u32 saddr = key->src;
++ u32 daddr = flp->fl4_dst;
++ u32 saddr = flp->fl4_src;
+
+ FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
+- NIPQUAD(key->dst), NIPQUAD(key->src));
++ NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
+ read_lock(&fib_rules_lock);
+ for (r = fib_rules; r; r=r->r_next) {
+ if (((saddr^r->r_src) & r->r_srcmask) ||
+ ((daddr^r->r_dst) & r->r_dstmask) ||
+ #ifdef CONFIG_IP_ROUTE_TOS
+- (r->r_tos && r->r_tos != key->tos) ||
++ (r->r_tos && r->r_tos != flp->fl4_tos) ||
+ #endif
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- (r->r_fwmark && r->r_fwmark != key->fwmark) ||
++ (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
+ #endif
+- (r->r_ifindex && r->r_ifindex != key->iif))
++ (r->r_ifindex && r->r_ifindex != flp->iif))
+ continue;
+
+ FRprintk("tb %d r %d ", r->r_table, r->r_action);
+@@ -351,7 +351,7 @@
+
+ if ((tb = fib_get_table(r->r_table)) == NULL)
+ continue;
+- err = tb->tb_lookup(tb, key, res);
++ err = tb->tb_lookup(tb, flp, res);
+ if (err == 0) {
+ res->r = policy;
+ if (policy)
+@@ -369,13 +369,13 @@
+ return -ENETUNREACH;
+ }
+
+-void fib_select_default(const struct rt_key *key, struct fib_result *res)
++void fib_select_default(const struct flowi *flp, struct fib_result *res)
+ {
+ if (res->r && res->r->r_action == RTN_UNICAST &&
+ FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+ struct fib_table *tb;
+ if ((tb = fib_get_table(res->r->r_table)) != NULL)
+- tb->tb_select_default(tb, key, res);
++ tb->tb_select_default(tb, flp, res);
+ }
+ }
+
+diff -Nru a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
+--- a/net/ipv4/fib_semantics.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/fib_semantics.c 2005-02-13 21:25:09 +11:00
+@@ -349,7 +349,6 @@
+ int err;
+
+ if (nh->nh_gw) {
+- struct rt_key key;
+ struct fib_result res;
+
+ #ifdef CONFIG_IP_ROUTE_PERVASIVE
+@@ -372,16 +371,18 @@
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+- memset(&key, 0, sizeof(key));
+- key.dst = nh->nh_gw;
+- key.oif = nh->nh_oif;
+- key.scope = r->rtm_scope + 1;
+-
+- /* It is not necessary, but requires a bit of thinking */
+- if (key.scope < RT_SCOPE_LINK)
+- key.scope = RT_SCOPE_LINK;
+- if ((err = fib_lookup(&key, &res)) != 0)
+- return err;
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = nh->nh_gw,
++ .scope = r->rtm_scope + 1 } },
++ .oif = nh->nh_oif };
++
++ /* It is not necessary, but requires a bit of thinking */
++ if (fl.fl4_scope < RT_SCOPE_LINK)
++ fl.fl4_scope = RT_SCOPE_LINK;
++ if ((err = fib_lookup(&fl, &res)) != 0)
++ return err;
++ }
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+@@ -578,7 +579,7 @@
+ }
+
+ int
+-fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, struct fib_result *res)
++fib_semantic_match(int type, struct fib_info *fi, const struct flowi *flp, struct fib_result *res)
+ {
+ int err = fib_props[type].error;
+
+@@ -603,7 +604,7 @@
+ for_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ continue;
+- if (!key->oif || key->oif == nh->nh_oif)
++ if (!flp->oif || flp->oif == nh->nh_oif)
+ break;
+ }
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH
+@@ -949,7 +950,7 @@
+ fair weighted route distribution.
+ */
+
+-void fib_select_multipath(const struct rt_key *key, struct fib_result *res)
++void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+ {
+ struct fib_info *fi = res->fi;
+ int w;
+diff -Nru a/net/ipv4/icmp.c b/net/ipv4/icmp.c
+--- a/net/ipv4/icmp.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/icmp.c 2005-02-13 21:25:10 +11:00
+@@ -101,7 +101,6 @@
+ int offset;
+ int data_len;
+
+- unsigned int csum;
+ struct {
+ struct icmphdr icmph;
+ __u32 times[3];
+@@ -139,8 +138,6 @@
+ { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */
+ };
+
+-extern int sysctl_ip_default_ttl;
+-
+ /* Control parameters for ECHO replies. */
+ int sysctl_icmp_echo_ignore_all;
+ int sysctl_icmp_echo_ignore_broadcasts;
+@@ -281,39 +278,47 @@
+ * Checksum each fragment, and on the first include the headers and final checksum.
+ */
+
+-static int icmp_glue_bits(const void *p, char *to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
++int
++icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+ {
+- struct icmp_bxm *icmp_param = (struct icmp_bxm *)p;
+- struct icmphdr *icmph;
++ struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ unsigned int csum;
+
++ csum = skb_copy_and_csum_bits(icmp_param->skb,
++ icmp_param->offset + offset,
++ to, len, 0);
++
++ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (icmp_pointers[icmp_param->data.icmph.type].error)
+ nf_ct_attach(skb, icmp_param->skb);
++ return 0;
++}
+
+- if (offset) {
+- icmp_param->csum=skb_copy_and_csum_bits(icmp_param->skb,
+- icmp_param->offset+(offset-icmp_param->head_len),
+- to, fraglen,icmp_param->csum);
+- return 0;
+- }
++static void
++icmp_push_reply(struct icmp_bxm *icmp_param, struct ipcm_cookie *ipc, struct rtable *rt)
++{
++ struct sk_buff *skb;
+
+- /*
+- * First fragment includes header. Note that we've done
+- * the other fragments first, so that we get the checksum
+- * for the whole packet here.
+- */
+- csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+- to, icmp_param->head_len,
+- icmp_param->csum);
+- csum=skb_copy_and_csum_bits(icmp_param->skb,
+- icmp_param->offset,
+- to+icmp_param->head_len,
+- fraglen-icmp_param->head_len,
+- csum);
+- icmph=(struct icmphdr *)to;
+- icmph->checksum = csum_fold(csum);
+- return 0;
++ ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
++ icmp_param->data_len+icmp_param->head_len,
++ icmp_param->head_len,
++ ipc, rt, MSG_DONTWAIT);
++
++ if ((skb = skb_peek(&icmp_socket->sk->write_queue)) != NULL) {
++ struct icmphdr *icmph = skb->h.icmph;
++ unsigned int csum = 0;
++ struct sk_buff *skb1;
++
++ skb_queue_walk(&icmp_socket->sk->write_queue, skb1) {
++ csum = csum_add(csum, skb1->csum);
++ }
++ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
++ (char*)icmph, icmp_param->head_len,
++ csum);
++ icmph->checksum = csum_fold(csum);
++ skb->ip_summed = CHECKSUM_NONE;
++ ip_push_pending_frames(icmp_socket->sk);
++ }
+ }
+
+ /*
+@@ -334,11 +339,9 @@
+ return;
+
+ icmp_param->data.icmph.checksum=0;
+- icmp_param->csum=0;
+ icmp_out_count(icmp_param->data.icmph.type);
+
+ sk->protinfo.af_inet.tos = skb->nh.iph->tos;
+- sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+ if (icmp_param->replyopts.optlen) {
+@@ -346,14 +349,18 @@
+ if (ipc.opt->srr)
+ daddr = icmp_param->replyopts.faddr;
+ }
+- if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
+- goto out;
+- if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
+- icmp_param->data.icmph.code)) {
+- ip_build_xmit(sk, icmp_glue_bits, icmp_param,
+- icmp_param->data_len+icmp_param->head_len,
+- &ipc, rt, MSG_DONTWAIT);
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = rt->rt_spec_dst,
++ .tos = RT_TOS(skb->nh.iph->tos) } },
++ .proto = IPPROTO_ICMP };
++ if (ip_route_output_key(&rt, &fl))
++ goto out;
+ }
++ if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
++ icmp_param->data.icmph.code))
++ icmp_push_reply(icmp_param, &ipc, rt);
+ ip_rt_put(rt);
+ out:
+ icmp_xmit_unlock();
+@@ -450,8 +457,8 @@
+ * Restore original addresses if packet has been translated.
+ */
+ if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) {
+- iph->daddr = rt->key.dst;
+- iph->saddr = rt->key.src;
++ iph->daddr = rt->fl.fl4_dst;
++ iph->saddr = rt->fl.fl4_src;
+ }
+ #endif
+
+@@ -463,9 +470,14 @@
+ ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+
+- if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0))
+- goto out;
+-
++ {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->saddr,
++ .saddr = saddr,
++ .tos = RT_TOS(tos) } },
++ .proto = IPPROTO_ICMP };
++ if (ip_route_output_key(&rt, &fl))
++ goto out;
++ }
+ if (ip_options_echo(&icmp_param.replyopts, skb_in))
+ goto ende;
+
+@@ -478,17 +490,20 @@
+ icmp_param.data.icmph.code=code;
+ icmp_param.data.icmph.un.gateway = info;
+ icmp_param.data.icmph.checksum=0;
+- icmp_param.csum=0;
+ icmp_param.skb=skb_in;
+ icmp_param.offset=skb_in->nh.raw - skb_in->data;
+ icmp_out_count(icmp_param.data.icmph.type);
+ icmp_socket->sk->protinfo.af_inet.tos = tos;
+- icmp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param.replyopts;
+ if (icmp_param.replyopts.srr) {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = icmp_param.replyopts.faddr,
++ .saddr = saddr,
++ .tos = RT_TOS(tos) } },
++ .proto = IPPROTO_ICMP };
+ ip_rt_put(rt);
+- if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0))
++ if (ip_route_output_key(&rt, &fl))
+ goto out;
+ }
+
+@@ -497,7 +512,7 @@
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+- room = rt->u.dst.pmtu;
++ room = dst_pmtu(&rt->u.dst);
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+@@ -508,9 +523,7 @@
+ icmp_param.data_len = room;
+ icmp_param.head_len = sizeof(struct icmphdr);
+
+- ip_build_xmit(icmp_socket->sk, icmp_glue_bits, &icmp_param,
+- icmp_param.data_len+sizeof(struct icmphdr),
+- &ipc, rt, MSG_DONTWAIT);
++ icmp_push_reply(&icmp_param, &ipc, rt);
+
+ ende:
+ ip_rt_put(rt);
+@@ -649,24 +662,10 @@
+ * we are OK.
+ */
+
+- ipprot = (struct inet_protocol *) inet_protos[hash];
+- while (ipprot) {
+- struct inet_protocol *nextip;
+-
+- nextip = (struct inet_protocol *) ipprot->next;
+-
+- /*
+- * Pass it off to everyone who wants it.
+- */
++ ipprot = inet_protos[hash];
++ if (ipprot && ipprot->err_handler)
++ ipprot->err_handler(skb, info);
+
+- /* RFC1122: OK. Passes appropriate ICMP errors to the */
+- /* appropriate protocol layer (MUST), as per 3.2.2. */
+-
+- if (protocol == ipprot->protocol && ipprot->err_handler)
+- ipprot->err_handler(skb, info);
+-
+- ipprot = nextip;
+- }
+ out:;
+ }
+
+@@ -995,7 +994,7 @@
+ icmp_socket_cpu(i)->sk->sndbuf =
+ (2 * ((64 * 1024) + sizeof(struct sk_buff)));
+
+- icmp_socket_cpu(i)->sk->protinfo.af_inet.ttl = MAXTTL;
++ icmp_socket_cpu(i)->sk->protinfo.af_inet.uc_ttl = -1;
+ icmp_socket_cpu(i)->sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
+
+ /* Unhash it so that IP input processing does not even
+diff -Nru a/net/ipv4/igmp.c b/net/ipv4/igmp.c
+--- a/net/ipv4/igmp.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/igmp.c 2005-02-13 21:25:10 +11:00
+@@ -218,15 +218,6 @@
+
+ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+-/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+- changes route */
+-static inline int
+-output_maybe_reroute(struct sk_buff *skb)
+-{
+- return skb->dst->output(skb);
+-}
+-
+-
+ static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+ {
+@@ -283,13 +274,18 @@
+ u32 dst;
+
+ dst = IGMPV3_ALL_MCR;
+- if (ip_route_output(&rt, dst, 0, 0, dev->ifindex))
+- return 0;
++ {
++ struct flowi fl = { .oif = dev->ifindex,
++ .nl_u = { .ip4_u = { .daddr = dst } },
++ .proto = IPPROTO_IGMP };
++ if (ip_route_output_key(&rt, &fl))
++ return 0;
++ }
+ if (rt->rt_src == 0) {
+ ip_rt_put(rt);
+ return 0;
+ }
+- skb = alloc_skb(size + dev->hard_header_len + 15, GFP_ATOMIC);
++ skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL) {
+ ip_rt_put(rt);
+ return 0;
+@@ -298,7 +294,7 @@
+ skb->dst = &rt->u.dst;
+ skb->dev = dev;
+
+- skb_reserve(skb, (dev->hard_header_len+15)&~15);
++ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+@@ -341,7 +337,7 @@
+ pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
+- output_maybe_reroute);
++ dst_output);
+ }
+
+ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+@@ -623,14 +619,19 @@
+ else
+ dst = group;
+
+- if (ip_route_output(&rt, dst, 0, 0, dev->ifindex))
+- return -1;
++ {
++ struct flowi fl = { .oif = dev->ifindex,
++ .nl_u = { .ip4_u = { .daddr = dst } },
++ .proto = IPPROTO_IGMP };
++ if (ip_route_output_key(&rt, &fl))
++ return -1;
++ }
+ if (rt->rt_src == 0) {
+ ip_rt_put(rt);
+ return -1;
+ }
+
+- skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC);
++ skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+ if (skb == NULL) {
+ ip_rt_put(rt);
+ return -1;
+@@ -638,7 +639,7 @@
+
+ skb->dst = &rt->u.dst;
+
+- skb_reserve(skb, (dev->hard_header_len+15)&~15);
++ skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+@@ -666,7 +667,7 @@
+ ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+- output_maybe_reroute);
++ dst_output);
+ }
+
+ static void igmp_gq_timer_expire(unsigned long data)
+@@ -874,7 +875,7 @@
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+- if (((struct rtable*)skb->dst)->key.iif == 0)
++ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ break;
+ igmp_heard_report(in_dev, ih->group);
+ break;
+@@ -1283,6 +1284,8 @@
+
+ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = imr->imr_multiaddr.s_addr } } };
+ struct rtable *rt;
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
+@@ -1300,7 +1303,7 @@
+ __dev_put(dev);
+ }
+
+- if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) {
++ if (!dev && !ip_route_output_key(&rt, &fl)) {
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+diff -Nru a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
+--- a/net/ipv4/ip_forward.c 2005-02-13 21:25:08 +11:00
++++ b/net/ipv4/ip_forward.c 2005-02-13 21:25:08 +11:00
+@@ -40,6 +40,7 @@
+ #include <net/checksum.h>
+ #include <linux/route.h>
+ #include <net/route.h>
++#include <net/xfrm.h>
+
+ static inline int ip_forward_finish(struct sk_buff *skb)
+ {
+@@ -47,36 +48,20 @@
+
+ IP_INC_STATS_BH(IpForwDatagrams);
+
+- if (opt->optlen == 0) {
+-#ifdef CONFIG_NET_FASTROUTE
+- struct rtable *rt = (struct rtable*)skb->dst;
+-
+- if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
+- struct dst_entry *old_dst;
+- unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
+-
+- write_lock_irq(&skb->dev->fastpath_lock);
+- old_dst = skb->dev->fastpath[h];
+- skb->dev->fastpath[h] = dst_clone(&rt->u.dst);
+- write_unlock_irq(&skb->dev->fastpath_lock);
+-
+- dst_release(old_dst);
+- }
+-#endif
+- return (ip_send(skb));
+- }
++ if (unlikely(opt->optlen))
++ ip_forward_options(skb);
+
+- ip_forward_options(skb);
+- return (ip_send(skb));
++ return dst_output(skb);
+ }
+
+ int ip_forward(struct sk_buff *skb)
+ {
+- struct net_device *dev2; /* Output device */
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options * opt = &(IPCB(skb)->opt);
+- unsigned short mtu;
++
++ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
++ goto drop;
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return NET_RX_SUCCESS;
+@@ -93,32 +78,21 @@
+ */
+
+ iph = skb->nh.iph;
+- rt = (struct rtable*)skb->dst;
+
+ if (iph->ttl <= 1)
+ goto too_many_hops;
+
+- if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+- goto sr_failed;
+-
+- /*
+- * Having picked a route we can now send the frame out
+- * after asking the firewall permission to do so.
+- */
++ if (!xfrm4_route_forward(skb))
++ goto drop;
+
+- skb->priority = rt_tos2priority(iph->tos);
+- dev2 = rt->u.dst.dev;
+- mtu = rt->u.dst.pmtu;
++ iph = skb->nh.iph;
++ rt = (struct rtable*)skb->dst;
+
+- /*
+- * We now generate an ICMP HOST REDIRECT giving the route
+- * we calculated.
+- */
+- if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+- ip_rt_send_redirect(skb);
++ if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
++ goto sr_failed;
+
+ /* We are about to mangle packet. Copy it! */
+- if (skb_cow(skb, dev2->hard_header_len))
++ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ goto drop;
+ iph = skb->nh.iph;
+
+@@ -126,29 +100,16 @@
+ ip_decrease_ttl(iph);
+
+ /*
+- * We now may allocate a new buffer, and copy the datagram into it.
+- * If the indicated interface is up and running, kick it.
++ * We now generate an ICMP HOST REDIRECT giving the route
++ * we calculated.
+ */
++ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
++ ip_rt_send_redirect(skb);
+
+- if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF))
+- goto frag_needed;
+-
+-#ifdef CONFIG_IP_ROUTE_NAT
+- if (rt->rt_flags & RTCF_NAT) {
+- if (ip_do_nat(skb)) {
+- kfree_skb(skb);
+- return NET_RX_BAD;
+- }
+- }
+-#endif
++ skb->priority = rt_tos2priority(iph->tos);
+
+- return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2,
++ return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+ ip_forward_finish);
+-
+-frag_needed:
+- IP_INC_STATS_BH(IpFragFails);
+- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+- goto drop;
+
+ sr_failed:
+ /*
+diff -Nru a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
+--- a/net/ipv4/ip_gre.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ip_gre.c 2005-02-13 21:25:09 +11:00
+@@ -37,6 +37,7 @@
+ #include <net/arp.h>
+ #include <net/checksum.h>
+ #include <net/inet_ecn.h>
++#include <net/xfrm.h>
+
+ #ifdef CONFIG_IPV6
+ #include <net/ipv6.h>
+@@ -410,6 +411,7 @@
+ u16 flags;
+ int grehlen = (iph->ihl<<2) + 4;
+ struct sk_buff *skb2;
++ struct flowi fl;
+ struct rtable *rt;
+
+ if (p[1] != htons(ETH_P_IP))
+@@ -486,7 +488,11 @@
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+- if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
++ memset(&fl, 0, sizeof(fl));
++ fl.fl4_dst = eiph->saddr;
++ fl.fl4_tos = RT_TOS(eiph->tos);
++ fl.proto = IPPROTO_GRE;
++ if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb2);
+ return;
+ }
+@@ -496,7 +502,10 @@
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+- if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
++ fl.fl4_dst = eiph->daddr;
++ fl.fl4_src = eiph->saddr;
++ fl.fl4_tos = eiph->tos;
++ if (ip_route_output_key(&rt, &fl) ||
+ rt->u.dst.dev->type != ARPHRD_IPGRE) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+@@ -513,11 +522,11 @@
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+- if (rel_info > skb2->dst->pmtu) {
++ if (rel_info > dst_pmtu(skb2->dst)) {
+ kfree_skb(skb2);
+ return;
+ }
+- skb2->dst->pmtu = rel_info;
++ skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+@@ -606,6 +615,8 @@
+
+ read_lock(&ipgre_lock);
+ if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
++ secpath_reset(skb);
++
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = __pskb_pull(skb, offset);
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+@@ -617,7 +628,7 @@
+ #ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (MULTICAST(iph->daddr)) {
+ /* Looped back packet, drop it! */
+- if (((struct rtable*)skb->dst)->key.iif == 0)
++ if (((struct rtable*)skb->dst)->fl.iif == 0)
+ goto drop;
+ tunnel->stat.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+@@ -659,12 +670,6 @@
+ return(0);
+ }
+
+-/* Need this wrapper because NF_HOOK takes the function address */
+-static inline int do_ip_send(struct sk_buff *skb)
+-{
+- return ip_send(skb);
+-}
+-
+ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+@@ -741,9 +746,17 @@
+ tos &= ~1;
+ }
+
+- if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
+- tunnel->stat.tx_carrier_errors++;
+- goto tx_error;
++ {
++ struct flowi fl = { .oif = tunnel->parms.link,
++ .nl_u = { .ip4_u =
++ { .daddr = dst,
++ .saddr = tiph->saddr,
++ .tos = RT_TOS(tos) } },
++ .proto = IPPROTO_GRE };
++ if (ip_route_output_key(&rt, &fl)) {
++ tunnel->stat.tx_carrier_errors++;
++ goto tx_error;
++ }
+ }
+ tdev = rt->u.dst.dev;
+
+@@ -755,14 +768,14 @@
+
+ df = tiph->frag_off;
+ if (df)
+- mtu = rt->u.dst.pmtu - tunnel->hlen;
++ mtu = dst_pmtu(&rt->u.dst) - tunnel->hlen;
+ else
+- mtu = skb->dst ? skb->dst->pmtu : dev->mtu;
++ mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+
+- if (skb->protocol == htons(ETH_P_IP)) {
+- if (skb->dst && mtu < skb->dst->pmtu && mtu >= 68)
+- skb->dst->pmtu = mtu;
++ if (skb->dst)
++ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
++ if (skb->protocol == htons(ETH_P_IP)) {
+ df |= (old_iph->frag_off&htons(IP_DF));
+
+ if ((old_iph->frag_off&htons(IP_DF)) &&
+@@ -776,11 +789,11 @@
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+
+- if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
++ if (rt6 && mtu < dst_pmtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+- skb->dst->pmtu = mtu;
++ skb->dst->metrics[RTAX_MTU-1] = mtu;
+ }
+ }
+
+@@ -801,7 +814,7 @@
+ tunnel->err_count = 0;
+ }
+
+- max_headroom = ((tdev->hard_header_len+15)&~15)+ gre_hlen;
++ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+@@ -846,7 +859,7 @@
+ iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
+ #endif
+ else
+- iph->ttl = sysctl_ip_default_ttl;
++ iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ }
+
+ ((u16*)(iph+1))[0] = tunnel->parms.o_flags;
+@@ -1090,10 +1103,14 @@
+
+ MOD_INC_USE_COUNT;
+ if (MULTICAST(t->parms.iph.daddr)) {
++ struct flowi fl = { .oif = t->parms.link,
++ .nl_u = { .ip4_u =
++ { .daddr = t->parms.iph.daddr,
++ .saddr = t->parms.iph.saddr,
++ .tos = RT_TOS(t->parms.iph.tos) } },
++ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+- if (ip_route_output(&rt, t->parms.iph.daddr,
+- t->parms.iph.saddr, RT_TOS(t->parms.iph.tos),
+- t->parms.link)) {
++ if (ip_route_output_key(&rt, &fl)) {
+ MOD_DEC_USE_COUNT;
+ return -EADDRNOTAVAIL;
+ }
+@@ -1163,8 +1180,14 @@
+ /* Guess output device to choose reasonable mtu and hard_header_len */
+
+ if (iph->daddr) {
++ struct flowi fl = { .oif = tunnel->parms.link,
++ .nl_u = { .ip4_u =
++ { .daddr = iph->daddr,
++ .saddr = iph->saddr,
++ .tos = RT_TOS(iph->tos) } },
++ .proto = IPPROTO_GRE };
+ struct rtable *rt;
+- if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
++ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+@@ -1245,13 +1268,8 @@
+
+
+ static struct inet_protocol ipgre_protocol = {
+- ipgre_rcv, /* GRE handler */
+- ipgre_err, /* TUNNEL error control */
+- 0, /* next */
+- IPPROTO_GRE, /* protocol ID */
+- 0, /* copy */
+- NULL, /* data */
+- "GRE" /* name */
++ .handler = ipgre_rcv,
++ .err_handler = ipgre_err,
+ };
+
+
+@@ -1267,9 +1285,13 @@
+ {
+ printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+
++ if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
++ printk(KERN_INFO "ipgre init: can't add protocol\n");
++ return -EAGAIN;
++ }
++
+ ipgre_fb_tunnel_dev.priv = (void*)&ipgre_fb_tunnel;
+ register_netdev(&ipgre_fb_tunnel_dev);
+- inet_add_protocol(&ipgre_protocol);
+ return 0;
+ }
+
+@@ -1277,7 +1299,7 @@
+
+ void cleanup_module(void)
+ {
+- if ( inet_del_protocol(&ipgre_protocol) < 0 )
++ if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+ printk(KERN_INFO "ipgre close: can't remove protocol\n");
+
+ unregister_netdev(&ipgre_fb_tunnel_dev);
+diff -Nru a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
+--- a/net/ipv4/ip_input.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ip_input.c 2005-02-13 21:25:09 +11:00
+@@ -141,6 +141,7 @@
+ #include <net/raw.h>
+ #include <net/checksum.h>
+ #include <linux/netfilter_ipv4.h>
++#include <net/xfrm.h>
+ #include <linux/mroute.h>
+ #include <linux/netlink.h>
+
+@@ -194,34 +195,13 @@
+ return 0;
+ }
+
+-/* Handle this out of line, it is rare. */
+-static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph,
+- struct inet_protocol *ipprot, int force_copy)
+-{
+- int ret = 0;
+-
+- do {
+- if (ipprot->protocol == iph->protocol) {
+- struct sk_buff *skb2 = skb;
+- if (ipprot->copy || force_copy)
+- skb2 = skb_clone(skb, GFP_ATOMIC);
+- if(skb2 != NULL) {
+- ret = 1;
+- ipprot->handler(skb2);
+- }
+- }
+- ipprot = (struct inet_protocol *) ipprot->next;
+- } while(ipprot != NULL);
+-
+- return ret;
+-}
+-
+ static inline int ip_local_deliver_finish(struct sk_buff *skb)
+ {
+ int ihl = skb->nh.iph->ihl*4;
+
+ #ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_local_deliver(skb);
++ skb->nf_debug = 0;
+ #endif /*CONFIG_NETFILTER_DEBUG*/
+
+ __skb_pull(skb, ihl);
+@@ -236,44 +216,40 @@
+ {
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ int protocol = skb->nh.iph->protocol;
+- int hash = protocol & (MAX_INET_PROTOS - 1);
+- struct sock *raw_sk = raw_v4_htable[hash];
++ int hash;
++ struct sock *raw_sk;
+ struct inet_protocol *ipprot;
+- int flag;
++
++ resubmit:
++ hash = protocol & (MAX_INET_PROTOS - 1);
++ raw_sk = raw_v4_htable[hash];
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
+ */
+- if(raw_sk != NULL)
+- raw_sk = raw_v4_input(skb, skb->nh.iph, hash);
++ if (raw_sk)
++ raw_v4_input(skb, skb->nh.iph, hash);
+
+- ipprot = (struct inet_protocol *) inet_protos[hash];
+- flag = 0;
+- if(ipprot != NULL) {
+- if(raw_sk == NULL &&
+- ipprot->next == NULL &&
+- ipprot->protocol == protocol) {
+- int ret;
+-
+- /* Fast path... */
+- ret = ipprot->handler(skb);
+-
+- return ret;
+- } else {
+- flag = ip_run_ipprot(skb, skb->nh.iph, ipprot, (raw_sk != NULL));
+- }
+- }
++ if ((ipprot = inet_protos[hash]) != NULL) {
++ int ret;
+
+- /* All protocols checked.
+- * If this packet was a broadcast, we may *not* reply to it, since that
+- * causes (proven, grin) ARP storms and a leakage of memory (i.e. all
+- * ICMP reply messages get queued up for transmission...)
+- */
+- if(raw_sk != NULL) { /* Shift to last raw user */
+- raw_rcv(raw_sk, skb);
+- sock_put(raw_sk);
+- } else if (!flag) { /* Free and report errors */
+- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
++ if (!ipprot->no_policy &&
++ !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return 0;
++ }
++ ret = ipprot->handler(skb);
++ if (ret < 0) {
++ protocol = -ret;
++ goto resubmit;
++ }
++ } else {
++ if (!raw_sk) {
++ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ icmp_send(skb, ICMP_DEST_UNREACH,
++ ICMP_PROT_UNREACH, 0);
++ }
++ }
+ kfree_skb(skb);
+ }
+ }
+@@ -361,7 +337,7 @@
+ }
+ }
+
+- return skb->dst->input(skb);
++ return dst_input(skb);
+
+ inhdr_error:
+ IP_INC_STATS_BH(IpInHdrErrors);
+diff -Nru a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c
+--- a/net/ipv4/ip_nat_dumb.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ip_nat_dumb.c 2005-02-13 21:25:09 +11:00
+@@ -117,23 +117,23 @@
+ if (rt->rt_flags&RTCF_SNAT) {
+ if (ciph->daddr != osaddr) {
+ struct fib_result res;
+- struct rt_key key;
+ unsigned flags = 0;
+-
+- key.src = ciph->daddr;
+- key.dst = ciph->saddr;
+- key.iif = skb->dev->ifindex;
+- key.oif = 0;
++ struct flowi fl = {
++ .iif = skb->dev->ifindex,
++ .nl_u =
++ { .ip4_u =
++ { .daddr = ciph->saddr,
++ .saddr = ciph->daddr,
+ #ifdef CONFIG_IP_ROUTE_TOS
+- key.tos = RT_TOS(ciph->tos);
+-#endif
+-#ifdef CONFIG_IP_ROUTE_FWMARK
+- key.fwmark = 0;
++ .tos = RT_TOS(ciph->tos)
+ #endif
++ } },
++ .proto = ciph->protocol };
++
+ /* Use fib_lookup() until we get our own
+ * hash table of NATed hosts -- Rani
+ */
+- if (fib_lookup(&key, &res) == 0) {
++ if (fib_lookup(&fl, &res) == 0) {
+ if (res.r) {
+ ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
+ if (ciph->daddr != idaddr)
+diff -Nru a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
+--- a/net/ipv4/ip_output.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/ip_output.c 2005-02-13 21:25:10 +11:00
+@@ -15,6 +15,7 @@
+ * Stefan Becker, <stefanb at yello.ping.de>
+ * Jorge Cwik, <jorge at laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra at nvg.unit.no>
++ * Hirokazu Takahashi, <taka at valinux.co.jp>
+ *
+ * See ip_input.c for original log
+ *
+@@ -38,6 +39,9 @@
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ * Detlev Wengorz : Copy protocol for fragments.
++ * Hirokazu Takahashi: HW checksumming for outgoing UDP
++ * datagrams.
++ * Hirokazu Takahashi: sendfile() on UDP works now.
+ */
+
+ #include <asm/uaccess.h>
+@@ -108,16 +112,18 @@
+ return 0;
+ }
+
+-/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+- changes route */
+-static inline int
+-output_maybe_reroute(struct sk_buff *skb)
++static inline int ip_select_ttl(struct inet_opt *inet, struct dst_entry *dst)
+ {
+- return skb->dst->output(skb);
++ int ttl = inet->uc_ttl;
++
++ if (ttl < 0)
++ ttl = dst_metric(dst, RTAX_HOPLIMIT);
++ return ttl;
+ }
+
+ /*
+ * Add an ip header to a skbuff and send it out.
++ *
+ */
+ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ u32 saddr, u32 daddr, struct ip_options *opt)
+@@ -138,7 +144,7 @@
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+- iph->ttl = sk->protinfo.af_inet.ttl;
++ iph->ttl = ip_select_ttl(&sk->protinfo.af_inet, &rt->u.dst);
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+@@ -152,15 +158,34 @@
+ }
+ ip_send_check(iph);
+
++ skb->priority = sk->priority;
++
+ /* Send it out. */
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+- output_maybe_reroute);
++ dst_output);
+ }
+
+ static inline int ip_finish_output2(struct sk_buff *skb)
+ {
+ struct dst_entry *dst = skb->dst;
+ struct hh_cache *hh = dst->hh;
++ struct net_device *dev = dst->dev;
++ int hh_len = LL_RESERVED_SPACE(dev);
++
++ /* Be paranoid, rather than too clever. */
++ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
++ struct sk_buff *skb2;
++
++ skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
++ if (skb2 == NULL) {
++ kfree_skb(skb);
++ return -ENOMEM;
++ }
++ if (skb->sk)
++ skb_set_owner_w(skb2, skb->sk);
++ kfree_skb(skb);
++ skb = skb2;
++ }
+
+ #ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_finish_output2(skb);
+@@ -184,7 +209,7 @@
+ return -EINVAL;
+ }
+
+-static __inline__ int __ip_finish_output(struct sk_buff *skb)
++int ip_finish_output(struct sk_buff *skb)
+ {
+ struct net_device *dev = skb->dst->dev;
+
+@@ -195,11 +220,6 @@
+ ip_finish_output2);
+ }
+
+-int ip_finish_output(struct sk_buff *skb)
+-{
+- return __ip_finish_output(skb);
+-}
+-
+ int ip_mc_output(struct sk_buff *skb)
+ {
+ struct sock *sk = skb->sk;
+@@ -210,10 +230,6 @@
+ * If the indicated interface is up and running, send the packet.
+ */
+ IP_INC_STATS(IpOutRequests);
+-#ifdef CONFIG_IP_ROUTE_NAT
+- if (rt->rt_flags & RTCF_NAT)
+- ip_do_nat(skb);
+-#endif
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+@@ -258,90 +274,26 @@
+ newskb->dev, ip_dev_loopback_xmit);
+ }
+
+- return __ip_finish_output(skb);
++ if (skb->len > dst_pmtu(&rt->u.dst) || skb_shinfo(skb)->frag_list)
++ return ip_fragment(skb, ip_finish_output);
++ else
++ return ip_finish_output(skb);
+ }
+
+ int ip_output(struct sk_buff *skb)
+ {
+-#ifdef CONFIG_IP_ROUTE_NAT
+- struct rtable *rt = (struct rtable*)skb->dst;
+-#endif
+-
+ IP_INC_STATS(IpOutRequests);
+
+-#ifdef CONFIG_IP_ROUTE_NAT
+- if (rt->rt_flags&RTCF_NAT)
+- ip_do_nat(skb);
++ if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list) &&
++#ifdef NETIF_F_TSO
++ !skb_shinfo(skb)->tso_size
++#else
++ 1
+ #endif
+-
+- return __ip_finish_output(skb);
+-}
+-
+-/* Queues a packet to be sent, and starts the transmitter if necessary.
+- * This routine also needs to put in the total length and compute the
+- * checksum. We use to do this in two stages, ip_build_header() then
+- * this, but that scheme created a mess when routes disappeared etc.
+- * So we do it all here, and the TCP send engine has been changed to
+- * match. (No more unroutable FIN disasters, etc. wheee...) This will
+- * most likely make other reliable transport layers above IP easier
+- * to implement under Linux.
+- */
+-static inline int ip_queue_xmit2(struct sk_buff *skb)
+-{
+- struct sock *sk = skb->sk;
+- struct rtable *rt = (struct rtable *)skb->dst;
+- struct net_device *dev;
+- struct iphdr *iph = skb->nh.iph;
+-
+- dev = rt->u.dst.dev;
+-
+- /* This can happen when the transport layer has segments queued
+- * with a cached route, and by the time we get here things are
+- * re-routed to a device with a different MTU than the original
+- * device. Sick, but we must cover it.
+- */
+- if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
+- struct sk_buff *skb2;
+-
+- skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
+- kfree_skb(skb);
+- if (skb2 == NULL)
+- return -ENOMEM;
+- if (sk)
+- skb_set_owner_w(skb2, sk);
+- skb = skb2;
+- iph = skb->nh.iph;
+- }
+-
+- if (skb->len > rt->u.dst.pmtu)
+- goto fragment;
+-
+- ip_select_ident(iph, &rt->u.dst, sk);
+-
+- /* Add an IP checksum. */
+- ip_send_check(iph);
+-
+- skb->priority = sk->priority;
+- return skb->dst->output(skb);
+-
+-fragment:
+- if (ip_dont_fragment(sk, &rt->u.dst)) {
+- /* Reject packet ONLY if TCP might fragment
+- * it itself, if were careful enough.
+- */
+- NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
+- skb->len, rt->u.dst.pmtu));
+-
+- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+- htonl(rt->u.dst.pmtu));
+- kfree_skb(skb);
+- return -EMSGSIZE;
+- }
+- ip_select_ident(iph, &rt->u.dst, sk);
+- if (skb->ip_summed == CHECKSUM_HW &&
+- (skb = skb_checksum_help(skb)) == NULL)
+- return -ENOMEM;
+- return ip_fragment(skb, skb->dst->output);
++ )
++ return ip_fragment(skb, ip_finish_output);
++ else
++ return ip_finish_output(skb);
+ }
+
+ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
+@@ -350,6 +302,9 @@
+ struct ip_options *opt = sk->protinfo.af_inet.opt;
+ struct rtable *rt;
+ struct iphdr *iph;
++#ifdef NETIF_F_TSO
++ u32 mtu;
++#endif
+
+ /* Skip all of this if the packet is already routed,
+ * f.e. by something like SCTP.
+@@ -368,14 +323,24 @@
+ if(opt && opt->srr)
+ daddr = opt->faddr;
+
+- /* If this fails, retransmit mechanism of transport layer will
+- * keep trying until route appears or the connection times itself
+- * out.
+- */
+- if (ip_route_output(&rt, daddr, sk->saddr,
+- RT_CONN_FLAGS(sk),
+- sk->bound_dev_if))
+- goto no_route;
++ {
++ struct flowi fl = { .oif = sk->bound_dev_if,
++ .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = sk->saddr,
++ .tos = RT_CONN_FLAGS(sk) } },
++ .proto = sk->protocol,
++ .uli_u = { .ports =
++ { .sport = sk->sport,
++ .dport = sk->dport } } };
++
++ /* If this fails, retransmit mechanism of transport layer will
++ * keep trying until route appears or the connection times
++ * itself out.
++ */
++ if (ip_route_output_flow(&rt, &fl, sk, 0))
++ goto no_route;
++ }
+ __sk_dst_set(sk, &rt->u.dst);
+ sk->route_caps = rt->u.dst.dev->features;
+ }
+@@ -393,7 +358,7 @@
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+- iph->ttl = sk->protinfo.af_inet.ttl;
++ iph->ttl = ip_select_ttl(&sk->protinfo.af_inet, &rt->u.dst);
+ iph->protocol = sk->protocol;
+ iph->saddr = rt->rt_src;
+ iph->daddr = rt->rt_dst;
+@@ -405,8 +370,30 @@
+ ip_options_build(skb, opt, sk->daddr, rt, 0);
+ }
+
++#ifdef NETIF_F_TSO
++ mtu = dst_pmtu(&rt->u.dst);
++ if (skb->len > mtu && (sk->route_caps&NETIF_F_TSO)) {
++ unsigned int hlen;
++
++ /* Hack zone: all this must be done by TCP. */
++ hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
++ skb_shinfo(skb)->tso_size = mtu - hlen;
++ skb_shinfo(skb)->tso_segs =
++ (skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
++ skb_shinfo(skb)->tso_size - 1;
++ }
++ ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
++#else
++ ip_select_ident(iph, &rt->u.dst, sk);
++#endif
++
++ /* Add an IP checksum. */
++ ip_send_check(iph);
++
++ skb->priority = sk->priority;
++
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+- ip_queue_xmit2);
++ dst_output);
+
+ no_route:
+ IP_INC_STATS(IpOutNoRoutes);
+@@ -414,338 +401,32 @@
+ return -EHOSTUNREACH;
+ }
+
+-/*
+- * Build and send a packet, with as little as one copy
+- *
+- * Doesn't care much about ip options... option length can be
+- * different for fragment at 0 and other fragments.
+- *
+- * Note that the fragment at the highest offset is sent first,
+- * so the getfrag routine can fill in the TCP/UDP checksum header
+- * field in the last fragment it sends... actually it also helps
+- * the reassemblers, they can put most packets in at the head of
+- * the fragment queue, and they know the total size in advance. This
+- * last feature will measurably improve the Linux fragment handler one
+- * day.
+- *
+- * The callback has five args, an arbitrary pointer (copy of frag),
+- * the source IP address (may depend on the routing table), the
+- * destination address (char *), the offset to copy from, and the
+- * length to be copied.
+- */
+-
+-static int ip_build_xmit_slow(struct sock *sk,
+- int getfrag (const void *,
+- char *,
+- unsigned int,
+- unsigned int,
+- struct sk_buff *),
+- const void *frag,
+- unsigned length,
+- struct ipcm_cookie *ipc,
+- struct rtable *rt,
+- int flags)
+-{
+- unsigned int fraglen, maxfraglen, fragheaderlen;
+- int err;
+- int offset, mf;
+- int mtu;
+- u16 id;
+-
+- int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
+- int nfrags=0;
+- struct ip_options *opt = ipc->opt;
+- int df = 0;
+-
+- mtu = rt->u.dst.pmtu;
+- if (ip_dont_fragment(sk, &rt->u.dst))
+- df = htons(IP_DF);
+-
+- length -= sizeof(struct iphdr);
+-
+- if (opt) {
+- fragheaderlen = sizeof(struct iphdr) + opt->optlen;
+- maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
+- } else {
+- fragheaderlen = sizeof(struct iphdr);
+-
+- /*
+- * Fragheaderlen is the size of 'overhead' on each buffer. Now work
+- * out the size of the frames to send.
+- */
+-
+- maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
+- }
+-
+- if (length + fragheaderlen > 0xFFFF) {
+- ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
+- return -EMSGSIZE;
+- }
+-
+- /*
+- * Start at the end of the frame by handling the remainder.
+- */
+-
+- offset = length - (length % (maxfraglen - fragheaderlen));
+-
+- /*
+- * Amount of memory to allocate for final fragment.
+- */
+-
+- fraglen = length - offset + fragheaderlen;
+-
+- if (length-offset==0) {
+- fraglen = maxfraglen;
+- offset -= maxfraglen-fragheaderlen;
+- }
+-
+- /*
+- * The last fragment will not have MF (more fragments) set.
+- */
+-
+- mf = 0;
+-
+- /*
+- * Don't fragment packets for path mtu discovery.
+- */
+-
+- if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
+- ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
+- return -EMSGSIZE;
+- }
+- if (flags&MSG_PROBE)
+- goto out;
+-
+- /*
+- * Begin outputting the bytes.
+- */
+-
+- id = sk->protinfo.af_inet.id++;
+-
+- do {
+- char *data;
+- struct sk_buff * skb;
+-
+- /*
+- * Get the memory we require with some space left for alignment.
+- */
+- if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
+- skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
+- (flags & MSG_DONTWAIT), &err);
+- } else {
+- /* On a non-blocking write, we check for send buffer
+- * usage on the first fragment only.
+- */
+- skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
+- sk->allocation);
+- if (!skb)
+- err = -ENOBUFS;
+- }
+- if (skb == NULL)
+- goto error;
+-
+- /*
+- * Fill in the control structures
+- */
+-
+- skb->priority = sk->priority;
+- skb->dst = dst_clone(&rt->u.dst);
+- skb_reserve(skb, hh_len);
+-
+- /*
+- * Find where to start putting bytes.
+- */
+-
+- data = skb_put(skb, fraglen);
+- skb->nh.iph = (struct iphdr *)data;
+-
+- /*
+- * Only write IP header onto non-raw packets
+- */
+-
+- {
+- struct iphdr *iph = (struct iphdr *)data;
+-
+- iph->version = 4;
+- iph->ihl = 5;
+- if (opt) {
+- iph->ihl += opt->optlen>>2;
+- ip_options_build(skb, opt,
+- ipc->addr, rt, offset);
+- }
+- iph->tos = sk->protinfo.af_inet.tos;
+- iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
+- iph->frag_off = htons(offset>>3)|mf|df;
+- iph->id = id;
+- if (!mf) {
+- if (offset || !df) {
+- /* Select an unpredictable ident only
+- * for packets without DF or having
+- * been fragmented.
+- */
+- __ip_select_ident(iph, &rt->u.dst);
+- id = iph->id;
+- }
+-
+- /*
+- * Any further fragments will have MF set.
+- */
+- mf = htons(IP_MF);
+- }
+- if (rt->rt_type == RTN_MULTICAST)
+- iph->ttl = sk->protinfo.af_inet.mc_ttl;
+- else
+- iph->ttl = sk->protinfo.af_inet.ttl;
+- iph->protocol = sk->protocol;
+- iph->check = 0;
+- iph->saddr = rt->rt_src;
+- iph->daddr = rt->rt_dst;
+- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+- data += iph->ihl*4;
+- }
+-
+- /*
+- * User data callback
+- */
+-
+- if (getfrag(frag, data, offset, fraglen-fragheaderlen, skb)) {
+- err = -EFAULT;
+- kfree_skb(skb);
+- goto error;
+- }
+-
+- offset -= (maxfraglen-fragheaderlen);
+- fraglen = maxfraglen;
+-
+- nfrags++;
+-
+- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+- skb->dst->dev, output_maybe_reroute);
+- if (err) {
+- if (err > 0)
+- err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
+- if (err)
+- goto error;
+- }
+- } while (offset >= 0);
+-
+- if (nfrags>1)
+- ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
+-out:
+- return 0;
+-
+-error:
+- IP_INC_STATS(IpOutDiscards);
+- if (nfrags>1)
+- ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
+- return err;
+-}
+-
+-/*
+- * Fast path for unfragmented packets.
+- */
+-int ip_build_xmit(struct sock *sk,
+- int getfrag (const void *,
+- char *,
+- unsigned int,
+- unsigned int,
+- struct sk_buff *),
+- const void *frag,
+- unsigned length,
+- struct ipcm_cookie *ipc,
+- struct rtable *rt,
+- int flags)
++static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+ {
+- int err;
+- struct sk_buff *skb;
+- int df;
+- struct iphdr *iph;
+-
+- /*
+- * Try the simple case first. This leaves fragmented frames, and by
+- * choice RAW frames within 20 bytes of maximum size(rare) to the long path
+- */
+-
+- if (!sk->protinfo.af_inet.hdrincl) {
+- length += sizeof(struct iphdr);
+-
+- /*
+- * Check for slow path.
+- */
+- if (length > rt->u.dst.pmtu || ipc->opt != NULL)
+- return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
+- } else {
+- if (length > rt->u.dst.dev->mtu) {
+- ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
+- return -EMSGSIZE;
+- }
+- }
+- if (flags&MSG_PROBE)
+- goto out;
+-
+- /*
+- * Do path mtu discovery if needed.
+- */
+- df = 0;
+- if (ip_dont_fragment(sk, &rt->u.dst))
+- df = htons(IP_DF);
+-
+- /*
+- * Fast path for unfragmented frames without options.
+- */
+- {
+- int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
+-
+- skb = sock_alloc_send_skb(sk, length+hh_len+15,
+- flags&MSG_DONTWAIT, &err);
+- if(skb==NULL)
+- goto error;
+- skb_reserve(skb, hh_len);
+- }
+-
+- skb->priority = sk->priority;
+- skb->dst = dst_clone(&rt->u.dst);
+-
+- skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+-
+- if(!sk->protinfo.af_inet.hdrincl) {
+- iph->version=4;
+- iph->ihl=5;
+- iph->tos=sk->protinfo.af_inet.tos;
+- iph->tot_len = htons(length);
+- iph->frag_off = df;
+- iph->ttl=sk->protinfo.af_inet.mc_ttl;
+- ip_select_ident(iph, &rt->u.dst, sk);
+- if (rt->rt_type != RTN_MULTICAST)
+- iph->ttl=sk->protinfo.af_inet.ttl;
+- iph->protocol=sk->protocol;
+- iph->saddr=rt->rt_src;
+- iph->daddr=rt->rt_dst;
+- iph->check=0;
+- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+- err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4, skb);
+- }
+- else
+- err = getfrag(frag, (void *)iph, 0, length, skb);
+-
+- if (err)
+- goto error_fault;
++ to->pkt_type = from->pkt_type;
++ to->priority = from->priority;
++ to->protocol = from->protocol;
++ to->security = from->security;
++ to->dst = dst_clone(from->dst);
++ to->dev = from->dev;
+
+- err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+- output_maybe_reroute);
+- if (err > 0)
+- err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
+- if (err)
+- goto error;
+-out:
+- return 0;
++ /* Copy the flags to each fragment. */
++ IPCB(to)->flags = IPCB(from)->flags;
+
+-error_fault:
+- err = -EFAULT;
+- kfree_skb(skb);
+-error:
+- IP_INC_STATS(IpOutDiscards);
+- return err;
++#ifdef CONFIG_NET_SCHED
++ to->tc_index = from->tc_index;
++#endif
++#ifdef CONFIG_NETFILTER
++ to->nfmark = from->nfmark;
++ to->nfcache = from->nfcache;
++ /* Connection association is same as pre-frag packet */
++ nf_conntrack_put(to->nfct);
++ to->nfct = from->nfct;
++ nf_conntrack_get(to->nfct);
++#ifdef CONFIG_NETFILTER_DEBUG
++ to->nf_debug = from->nf_debug;
++#endif
++#endif
+ }
+
+ /*
+@@ -753,8 +434,6 @@
+ * smaller pieces (each of size equal to IP header plus
+ * a block of the data of the original IP data part) that will yet fit in a
+ * single device frame, and queue such a frame for sending.
+- *
+- * Yes this is inefficient, feel free to submit a quicker one.
+ */
+
+ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+@@ -778,13 +457,111 @@
+
+ iph = skb->nh.iph;
+
++ if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
++ htonl(dst_pmtu(&rt->u.dst)));
++ kfree_skb(skb);
++ return -EMSGSIZE;
++ }
++
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
++ mtu = dst_pmtu(&rt->u.dst) - hlen; /* Size of data space */
++
++ /* When frag_list is given, use it. First, check its validity:
++ * some transformers could create wrong frag_list or break existing
++ * one, it is not prohibited. In this case fall back to copying.
++ *
++ * LATER: this step can be merged to real generation of fragments,
++ * we can switch to copy when see the first bad fragment.
++ */
++ if (skb_shinfo(skb)->frag_list) {
++ struct sk_buff *frag;
++ int first_len = skb_pagelen(skb);
++
++ if (first_len - hlen > mtu ||
++ ((first_len - hlen) & 7) ||
++ (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
++ skb_cloned(skb))
++ goto slow_path;
++
++ for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
++ /* Correct geometry. */
++ if (frag->len > mtu ||
++ ((frag->len & 7) && frag->next) ||
++ skb_headroom(frag) < hlen)
++ goto slow_path;
++
++ /* Correct socket ownership. */
++ if (frag->sk == NULL)
++ goto slow_path;
++
++ /* Partially cloned skb? */
++ if (skb_shared(frag))
++ goto slow_path;
++ }
++
++ /* Everything is OK. Generate! */
++
++ err = 0;
++ offset = 0;
++ frag = skb_shinfo(skb)->frag_list;
++ skb_shinfo(skb)->frag_list = 0;
++ skb->data_len = first_len - skb_headlen(skb);
++ skb->len = first_len;
++ iph->tot_len = htons(first_len);
++ iph->frag_off |= htons(IP_MF);
++ ip_send_check(iph);
++
++ for (;;) {
++ /* Prepare header of the next frame,
++ * before previous one went down. */
++ if (frag) {
++ frag->h.raw = frag->data;
++ frag->nh.raw = __skb_push(frag, hlen);
++ memcpy(frag->nh.raw, iph, hlen);
++ iph = frag->nh.iph;
++ iph->tot_len = htons(frag->len);
++ ip_copy_metadata(frag, skb);
++ if (offset == 0)
++ ip_options_fragment(frag);
++ offset += skb->len - hlen;
++ iph->frag_off = htons(offset>>3);
++ if (frag->next != NULL)
++ iph->frag_off |= htons(IP_MF);
++ /* Ready, complete checksum */
++ ip_send_check(iph);
++ }
++
++ err = output(skb);
++
++ if (err || !frag)
++ break;
++
++ skb = frag;
++ frag = skb->next;
++ skb->next = NULL;
++ }
++
++ if (err == 0) {
++ IP_INC_STATS(IpFragOKs);
++ return 0;
++ }
++
++ while (frag) {
++ skb = frag->next;
++ kfree_skb(frag);
++ frag = skb;
++ }
++ IP_INC_STATS(IpFragFails);
++ return err;
++ }
++
++slow_path:
+ left = skb->len - hlen; /* Space per frame */
+- mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
+ ptr = raw + hlen; /* Where to start from */
+
+ /*
+@@ -812,7 +589,7 @@
+ * Allocate buffer.
+ */
+
+- if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
++ if ((skb2 = alloc_skb(len+hlen+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
+ NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ err = -ENOMEM;
+ goto fail;
+@@ -822,14 +599,11 @@
+ * Set up data on packet
+ */
+
+- skb2->pkt_type = skb->pkt_type;
+- skb2->priority = skb->priority;
+- skb_reserve(skb2, (dev->hard_header_len+15)&~15);
++ ip_copy_metadata(skb2, skb);
++ skb_reserve(skb2, LL_RESERVED_SPACE(rt->u.dst.dev));
+ skb_put(skb2, len + hlen);
+ skb2->nh.raw = skb2->data;
+ skb2->h.raw = skb2->data + hlen;
+- skb2->protocol = skb->protocol;
+- skb2->security = skb->security;
+
+ /*
+ * Charge the memory for the fragment to any owner
+@@ -838,8 +612,6 @@
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+- skb2->dst = dst_clone(skb->dst);
+- skb2->dev = skb->dev;
+
+ /*
+ * Copy the packet header into the new buffer.
+@@ -869,9 +641,6 @@
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+- /* Copy the flags to each fragment. */
+- IPCB(skb2)->flags = IPCB(skb)->flags;
+-
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+@@ -881,20 +650,6 @@
+ ptr += len;
+ offset += len;
+
+-#ifdef CONFIG_NET_SCHED
+- skb2->tc_index = skb->tc_index;
+-#endif
+-#ifdef CONFIG_NETFILTER
+- skb2->nfmark = skb->nfmark;
+- skb2->nfcache = skb->nfcache;
+- /* Connection association is same as pre-frag packet */
+- skb2->nfct = skb->nfct;
+- nf_conntrack_get(skb2->nfct);
+-#ifdef CONFIG_NETFILTER_DEBUG
+- skb2->nf_debug = skb->nf_debug;
+-#endif
+-#endif
+-
+ /*
+ * Put this fragment into the sending queue.
+ */
+@@ -919,40 +674,555 @@
+ return err;
+ }
+
++int
++ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
++{
++ struct iovec *iov = from;
++
++ if (skb->ip_summed == CHECKSUM_HW) {
++ if (memcpy_fromiovecend(to, iov, offset, len) < 0)
++ return -EFAULT;
++ } else {
++ unsigned int csum = 0;
++ if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
++ return -EFAULT;
++ skb->csum = csum_block_add(skb->csum, csum, odd);
++ }
++ return 0;
++}
++
++static inline int
++skb_can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
++{
++ if (i) {
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
++ return page == frag->page &&
++ off == frag->page_offset+frag->size;
++ }
++ return 0;
++}
++
++static inline unsigned int
++csum_page(struct page *page, int offset, int copy)
++{
++ char *kaddr;
++ unsigned int csum;
++ kaddr = kmap(page);
++ csum = csum_partial(kaddr + offset, copy, 0);
++ kunmap(page);
++ return csum;
++}
++
+ /*
+- * Fetch data from kernel space and fill in checksum if needed.
++ * ip_append_data() and ip_append_page() can make one large IP datagram
++ * from many pieces of data. Each piece will be held on the socket
++ * until ip_push_pending_frames() is called. Each piece can be a page
++ * or non-page data.
++ *
++ * Not only UDP, other transport protocols - e.g. raw sockets - can use
++ * this interface potentially.
++ *
++ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+-static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
++int ip_append_data(struct sock *sk,
++ int getfrag(void *from, char *to, int offset, int len,
++ int odd, struct sk_buff *skb),
++ void *from, int length, int transhdrlen,
++ struct ipcm_cookie *ipc, struct rtable *rt,
++ unsigned int flags)
+ {
+- struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
+- u16 *pktp = (u16 *)to;
+- struct iovec *iov;
+- int len;
+- int hdrflag = 1;
+-
+- iov = &dp->iov[0];
+- if (offset >= iov->iov_len) {
+- offset -= iov->iov_len;
+- iov++;
+- hdrflag = 0;
+- }
+- len = iov->iov_len - offset;
+- if (fraglen > len) { /* overlapping. */
+- dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
+- dp->csum);
+- offset = 0;
+- fraglen -= len;
+- to += len;
+- iov++;
++ struct inet_opt *inet = inet_sk(sk);
++ struct sk_buff *skb;
++
++ struct ip_options *opt = NULL;
++ int hh_len;
++ int exthdrlen;
++ int mtu;
++ int copy;
++ int err;
++ int offset = 0;
++ unsigned int maxfraglen, fragheaderlen;
++ int csummode = CHECKSUM_NONE;
++
++ if (flags&MSG_PROBE)
++ return 0;
++
++ if (skb_queue_empty(&sk->write_queue)) {
++ /*
++ * setup for corking.
++ */
++ opt = ipc->opt;
++ if (opt) {
++ if (inet->cork.opt == NULL) {
++ inet->cork.opt = kmalloc(sizeof(struct ip_options)+40, sk->allocation);
++ if (unlikely(inet->cork.opt == NULL))
++ return -ENOBUFS;
++ }
++ memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
++ inet->cork.flags |= IPCORK_OPT;
++ inet->cork.addr = ipc->addr;
++ }
++ dst_hold(&rt->u.dst);
++ inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
++ inet->cork.rt = rt;
++ inet->cork.length = 0;
++ inet->sndmsg_page = NULL;
++ inet->sndmsg_off = 0;
++ if ((exthdrlen = rt->u.dst.header_len) != 0) {
++ length += exthdrlen;
++ transhdrlen += exthdrlen;
++ }
++ } else {
++ rt = inet->cork.rt;
++ if (inet->cork.flags & IPCORK_OPT)
++ opt = inet->cork.opt;
++
++ transhdrlen = 0;
++ exthdrlen = 0;
++ mtu = inet->cork.fragsize;
++ }
++ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
++
++ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
++ maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
++
++ if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
++ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu-exthdrlen);
++ return -EMSGSIZE;
++ }
++
++ /*
++ * transhdrlen > 0 means that this is the first fragment and we wish
++ * it won't be fragmented in the future.
++ */
++ if (transhdrlen &&
++ length + fragheaderlen <= maxfraglen &&
++ rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
++ !exthdrlen)
++ csummode = CHECKSUM_HW;
++
++ inet->cork.length += length;
++
++ /* So, what's going on in the loop below?
++ *
++ * We use calculated fragment length to generate chained skb,
++ * each of segments is IP fragment ready for sending to network after
++ * adding appropriate IP header.
++ *
++ * Mistake is:
++ *
++ * If mtu-fragheaderlen is not 0 modulo 8, we generate additional
++ * small fragment of length (mtu-fragheaderlen)%8, even though
++ * it is not necessary. Not a big bug, but needs a fix.
++ */
++
++ if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
++ goto alloc_new_skb;
++
++ while (length > 0) {
++ if ((copy = maxfraglen - skb->len) <= 0) {
++ char *data;
++ unsigned int datalen;
++ unsigned int fraglen;
++ unsigned int alloclen;
++ BUG_TRAP(copy == 0);
++
++alloc_new_skb:
++ datalen = maxfraglen - fragheaderlen;
++ if (datalen > length)
++ datalen = length;
++
++ fraglen = datalen + fragheaderlen;
++ if ((flags & MSG_MORE) &&
++ !(rt->u.dst.dev->features&NETIF_F_SG))
++ alloclen = maxfraglen;
++ else
++ alloclen = datalen + fragheaderlen;
++
++ /* The last fragment gets additional space at tail.
++ * Note, with MSG_MORE we overallocate on fragments,
++ * because we have no idea what fragment will be
++ * the last.
++ */
++ if (datalen == length)
++ alloclen += rt->u.dst.trailer_len;
++
++ if (transhdrlen) {
++ skb = sock_alloc_send_skb(sk,
++ alloclen + hh_len + 15,
++ (flags & MSG_DONTWAIT), &err);
++ } else {
++ skb = NULL;
++ if (atomic_read(&sk->wmem_alloc) <= 2*sk->sndbuf)
++ skb = sock_wmalloc(sk,
++ alloclen + hh_len + 15, 1,
++ sk->allocation);
++ if (unlikely(skb == NULL))
++ err = -ENOBUFS;
++ }
++ if (skb == NULL)
++ goto error;
++
++ /*
++ * Fill in the control structures
++ */
++ skb->ip_summed = csummode;
++ skb->csum = 0;
++ skb_reserve(skb, hh_len);
++
++ /*
++ * Find where to start putting bytes.
++ */
++ data = skb_put(skb, fraglen);
++ skb->nh.raw = data + exthdrlen;
++ data += fragheaderlen;
++ skb->h.raw = data + exthdrlen;
++
++ copy = datalen - transhdrlen;
++ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
++ err = -EFAULT;
++ kfree_skb(skb);
++ goto error;
++ }
++
++ offset += copy;
++ length -= datalen;
++ transhdrlen = 0;
++ exthdrlen = 0;
++ csummode = CHECKSUM_NONE;
++
++ /*
++ * Put the packet on the pending queue.
++ */
++ __skb_queue_tail(&sk->write_queue, skb);
++ continue;
++ }
++
++ if (copy > length)
++ copy = length;
++
++ if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
++ unsigned int off;
++
++ off = skb->len;
++ if (getfrag(from, skb_put(skb, copy),
++ offset, copy, off, skb) < 0) {
++ __skb_trim(skb, off);
++ err = -EFAULT;
++ goto error;
++ }
++ } else {
++ int i = skb_shinfo(skb)->nr_frags;
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
++ struct page *page = inet->sndmsg_page;
++ int off = inet->sndmsg_off;
++ unsigned int left;
++
++ if (page && (left = PAGE_SIZE - off) > 0) {
++ if (copy >= left)
++ copy = left;
++ if (page != frag->page) {
++ if (i == MAX_SKB_FRAGS) {
++ err = -EMSGSIZE;
++ goto error;
++ }
++ get_page(page);
++ skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
++ frag = &skb_shinfo(skb)->frags[i];
++ }
++ } else if (i < MAX_SKB_FRAGS) {
++ if (copy > PAGE_SIZE)
++ copy = PAGE_SIZE;
++ page = alloc_pages(sk->allocation, 0);
++ if (page == NULL) {
++ err = -ENOMEM;
++ goto error;
++ }
++ inet->sndmsg_page = page;
++ inet->sndmsg_off = 0;
++
++ skb_fill_page_desc(skb, i, page, 0, 0);
++ frag = &skb_shinfo(skb)->frags[i];
++ skb->truesize += PAGE_SIZE;
++ atomic_add(PAGE_SIZE, &sk->wmem_alloc);
++ } else {
++ err = -EMSGSIZE;
++ goto error;
++ }
++ if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
++ err = -EFAULT;
++ goto error;
++ }
++ inet->sndmsg_off += copy;
++ frag->size += copy;
++ skb->len += copy;
++ skb->data_len += copy;
++ }
++ offset += copy;
++ length -= copy;
+ }
+
+- dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
+- dp->csum);
++ return 0;
+
+- if (hdrflag && dp->csumoffset)
+- *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
+- return 0;
++error:
++ inet->cork.length -= length;
++ IP_INC_STATS(IpOutDiscards);
++ return err;
++}
++
++ssize_t ip_append_page(struct sock *sk, struct page *page,
++ int offset, size_t size, int flags)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ struct sk_buff *skb;
++ struct rtable *rt;
++ struct ip_options *opt = NULL;
++ int hh_len;
++ int mtu;
++ int len;
++ int err;
++ unsigned int maxfraglen, fragheaderlen;
++
++ if (inet->hdrincl)
++ return -EPERM;
++
++ if (flags&MSG_PROBE)
++ return 0;
++
++ if (skb_queue_empty(&sk->write_queue))
++ return -EINVAL;
++
++ rt = inet->cork.rt;
++ if (inet->cork.flags & IPCORK_OPT)
++ opt = inet->cork.opt;
++
++ if (!(rt->u.dst.dev->features&NETIF_F_SG))
++ return -EOPNOTSUPP;
++
++ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
++ mtu = inet->cork.fragsize;
++
++ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
++ maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
++
++ if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
++ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
++ return -EMSGSIZE;
++ }
++
++ if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
++ return -EINVAL;
++
++ inet->cork.length += size;
++
++ while (size > 0) {
++ int i;
++ if ((len = maxfraglen - skb->len) <= 0) {
++ char *data;
++ struct iphdr *iph;
++ BUG_TRAP(len == 0);
++
++ skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1,
++ sk->allocation);
++ if (unlikely(!skb)) {
++ err = -ENOBUFS;
++ goto error;
++ }
++
++ /*
++ * Fill in the control structures
++ */
++ skb->ip_summed = CHECKSUM_NONE;
++ skb->csum = 0;
++ skb_reserve(skb, hh_len);
++
++ /*
++ * Find where to start putting bytes.
++ */
++ data = skb_put(skb, fragheaderlen);
++ skb->nh.iph = iph = (struct iphdr *)data;
++ data += fragheaderlen;
++ skb->h.raw = data;
++
++ /*
++ * Put the packet on the pending queue.
++ */
++ __skb_queue_tail(&sk->write_queue, skb);
++ continue;
++ }
++
++ i = skb_shinfo(skb)->nr_frags;
++ if (len > size)
++ len = size;
++ if (skb_can_coalesce(skb, i, page, offset)) {
++ skb_shinfo(skb)->frags[i-1].size += len;
++ } else if (i < MAX_SKB_FRAGS) {
++ get_page(page);
++ skb_fill_page_desc(skb, i, page, offset, len);
++ } else {
++ err = -EMSGSIZE;
++ goto error;
++ }
++
++ if (skb->ip_summed == CHECKSUM_NONE) {
++ unsigned int csum;
++ csum = csum_page(page, offset, len);
++ skb->csum = csum_block_add(skb->csum, csum, skb->len);
++ }
++
++ skb->len += len;
++ skb->data_len += len;
++ offset += len;
++ size -= len;
++ }
++ return 0;
++
++error:
++ inet->cork.length -= size;
++ IP_INC_STATS(IpOutDiscards);
++ return err;
++}
++
++/*
++ * Combined all pending IP fragments on the socket as one IP datagram
++ * and push them out.
++ */
++int ip_push_pending_frames(struct sock *sk)
++{
++ struct sk_buff *skb, *tmp_skb;
++ struct sk_buff **tail_skb;
++ struct inet_opt *inet = inet_sk(sk);
++ struct ip_options *opt = NULL;
++ struct rtable *rt = inet->cork.rt;
++ struct iphdr *iph;
++ int df = 0;
++ __u8 ttl;
++ int err = 0;
++
++ if ((skb = __skb_dequeue(&sk->write_queue)) == NULL)
++ goto out;
++ tail_skb = &(skb_shinfo(skb)->frag_list);
++
++ /* move skb->data to ip header from ext header */
++ if (skb->data < skb->nh.raw)
++ __skb_pull(skb, skb->nh.raw - skb->data);
++ while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) {
++ __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
++ *tail_skb = tmp_skb;
++ tail_skb = &(tmp_skb->next);
++ skb->len += tmp_skb->len;
++ skb->data_len += tmp_skb->len;
++#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
++ skb->truesize += tmp_skb->truesize;
++ __sock_put(tmp_skb->sk);
++ tmp_skb->destructor = NULL;
++ tmp_skb->sk = NULL;
++#endif
++ }
++
++ /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
++ * to fragment the frame generated here. No matter how transforms
++ * change the size of the packet, it will come out.
++ */
++ if (inet->pmtudisc != IP_PMTUDISC_DO)
++ skb->local_df = 1;
++
++ /* DF bit is set when we want to see DF on outgoing frames.
++ * If local_df is set too, we still allow to fragment this frame
++ * locally. */
++ if (inet->pmtudisc == IP_PMTUDISC_DO ||
++ (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
++ df = htons(IP_DF);
++
++ if (inet->cork.flags & IPCORK_OPT)
++ opt = inet->cork.opt;
++
++ if (rt->rt_type == RTN_MULTICAST)
++ ttl = inet->mc_ttl;
++ else
++ ttl = ip_select_ttl(inet, &rt->u.dst);
++
++ iph = (struct iphdr *)skb->data;
++ iph->version = 4;
++ iph->ihl = 5;
++ if (opt) {
++ iph->ihl += opt->optlen>>2;
++ ip_options_build(skb, opt, inet->cork.addr, rt, 0);
++ }
++ iph->tos = inet->tos;
++ iph->tot_len = htons(skb->len);
++ iph->frag_off = df;
++ if (!df) {
++ __ip_select_ident(iph, &rt->u.dst);
++ } else {
++ iph->id = htons(inet->id++);
++ }
++ iph->ttl = ttl;
++ iph->protocol = sk->protocol;
++ iph->saddr = rt->rt_src;
++ iph->daddr = rt->rt_dst;
++ ip_send_check(iph);
++
++ skb->priority = sk->priority;
++ skb->dst = dst_clone(&rt->u.dst);
++
++	/* Netfilter gets the whole, non-fragmented skb. */
++ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
++ skb->dst->dev, dst_output);
++ if (err) {
++ if (err > 0)
++ err = inet->recverr ? net_xmit_errno(err) : 0;
++ if (err)
++ goto error;
++ }
++
++out:
++ inet->cork.flags &= ~IPCORK_OPT;
++ if (inet->cork.rt) {
++ ip_rt_put(inet->cork.rt);
++ inet->cork.rt = NULL;
++ }
++ return err;
++
++error:
++ IP_INC_STATS(IpOutDiscards);
++ goto out;
++}
++
++/*
++ * Throw away all pending data on the socket.
++ */
++void ip_flush_pending_frames(struct sock *sk)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ struct sk_buff *skb;
++
++ while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL)
++ kfree_skb(skb);
++
++ inet->cork.flags &= ~IPCORK_OPT;
++ if (inet->cork.opt) {
++ kfree(inet->cork.opt);
++ inet->cork.opt = NULL;
++ }
++ if (inet->cork.rt) {
++ ip_rt_put(inet->cork.rt);
++ inet->cork.rt = NULL;
++ }
++}
++
++
++/*
++ * Fetch data from kernel space and fill in checksum if needed.
++ */
++static int ip_reply_glue_bits(void *dptr, char *to, int offset,
++ int len, int odd, struct sk_buff *skb)
++{
++ unsigned int csum;
++
++ csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
++ skb->csum = csum_block_add(skb->csum, csum, odd);
++ return 0;
+ }
+
+ /*
+@@ -961,6 +1231,8 @@
+ *
+ * Should run single threaded per socket because it uses the sock
+ * structure to pass arguments.
++ *
++ * LATER: switch from ip_build_xmit to ip_append_*
+ */
+ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
+ unsigned int len)
+@@ -986,8 +1258,19 @@
+ daddr = replyopts.opt.faddr;
+ }
+
+- if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
+- return;
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = rt->rt_spec_dst,
++ .tos = RT_TOS(skb->nh.iph->tos) } },
++ /* Not quite clean, but right. */
++ .uli_u = { .ports =
++ { .sport = skb->h.th->dest,
++ .dport = skb->h.th->source } },
++ .proto = sk->protocol };
++ if (ip_route_output_key(&rt, &fl))
++ return;
++ }
+
+ /* And let IP do all the hard work.
+
+@@ -999,7 +1282,15 @@
+ sk->protinfo.af_inet.tos = skb->nh.iph->tos;
+ sk->priority = skb->priority;
+ sk->protocol = skb->nh.iph->protocol;
+- ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
++ ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
++ &ipc, rt, MSG_DONTWAIT);
++ if ((skb = skb_peek(&sk->write_queue)) != NULL) {
++ if (arg->csumoffset >= 0)
++ *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
++ skb->ip_summed = CHECKSUM_NONE;
++ ip_push_pending_frames(sk);
++ }
++
+ bh_unlock_sock(sk);
+
+ ip_rt_put(rt);
+diff -Nru a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
+--- a/net/ipv4/ip_sockglue.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ip_sockglue.c 2005-02-13 21:25:09 +11:00
+@@ -36,6 +36,7 @@
+ #include <linux/route.h>
+ #include <linux/mroute.h>
+ #include <net/route.h>
++#include <net/xfrm.h>
+ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ #include <net/transp_v6.h>
+ #endif
+@@ -377,6 +378,7 @@
+
+ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
+ {
++ struct inet_opt *inet = inet_sk(sk);
+ int val=0,err;
+
+ if (level != SOL_IP)
+@@ -428,8 +430,10 @@
+ (!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE))
+ && sk->daddr != LOOPBACK4_IPV6)) {
+ #endif
++ if (inet->opt)
++ tp->ext_header_len -= inet->opt->optlen;
+ if (opt)
+- tp->ext_header_len = opt->optlen;
++ tp->ext_header_len += opt->optlen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ }
+@@ -489,11 +493,9 @@
+ case IP_TTL:
+ if (optlen<1)
+ goto e_inval;
+- if(val==-1)
+- val = sysctl_ip_default_ttl;
+- if(val<1||val>255)
++ if (val != -1 && (val < 1 || val>255))
+ goto e_inval;
+- sk->protinfo.af_inet.ttl=val;
++ sk->protinfo.af_inet.uc_ttl = val;
+ break;
+ case IP_HDRINCL:
+ if(sk->type!=SOCK_RAW) {
+@@ -837,6 +839,11 @@
+ sk->protinfo.af_inet.freebind = !!val;
+ break;
+
++ case IP_IPSEC_POLICY:
++ case IP_XFRM_POLICY:
++ err = xfrm_user_policy(sk, optname, optval, optlen);
++ break;
++
+ default:
+ #ifdef CONFIG_NETFILTER
+ err = nf_setsockopt(sk, PF_INET, optname, optval,
+@@ -924,7 +931,9 @@
+ val=sk->protinfo.af_inet.tos;
+ break;
+ case IP_TTL:
+- val=sk->protinfo.af_inet.ttl;
++ val = (sk->protinfo.af_inet.uc_ttl == -1 ?
++ sysctl_ip_default_ttl :
++ sk->protinfo.af_inet.uc_ttl);
+ break;
+ case IP_HDRINCL:
+ val=sk->protinfo.af_inet.hdrincl;
+@@ -938,7 +947,7 @@
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+- val = dst->pmtu;
++ val = dst_pmtu(dst) - dst->header_len;
+ dst_release(dst);
+ }
+ if (!val) {
+diff -Nru a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/ipcomp.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,378 @@
++/*
++ * IP Payload Compression Protocol (IPComp) - RFC3173.
++ *
++ * Copyright (c) 2003 James Morris <jmorris at intercode.com.au>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2 of the License, or (at your option)
++ * any later version.
++ *
++ * Todo:
++ * - Tunable compression parameters.
++ * - Compression stats.
++ * - Adaptive compression.
++ */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/icmp.h>
++#include <net/ipcomp.h>
++
++static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
++{
++ int err, plen, dlen;
++ struct iphdr *iph;
++ struct ipcomp_data *ipcd = x->data;
++ u8 *start, *scratch = ipcd->scratch;
++
++ plen = skb->len;
++ dlen = IPCOMP_SCRATCH_SIZE;
++ start = skb->data;
++
++ err = crypto_comp_decompress(ipcd->tfm, start, plen, scratch, &dlen);
++ if (err)
++ goto out;
++
++ if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
++ if (err)
++ goto out;
++
++ skb_put(skb, dlen - plen);
++ memcpy(skb->data, scratch, dlen);
++ iph = skb->nh.iph;
++ iph->tot_len = htons(dlen + iph->ihl * 4);
++out:
++ return err;
++}
++
++static int ipcomp_input(struct xfrm_state *x,
++ struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ u8 nexthdr;
++ int err = 0;
++ struct iphdr *iph;
++ union {
++ struct iphdr iph;
++ char buf[60];
++ } tmp_iph;
++
++
++ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
++ skb_linearize(skb, GFP_ATOMIC) != 0) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ /* Remove ipcomp header and decompress original payload */
++ iph = skb->nh.iph;
++ memcpy(&tmp_iph, iph, iph->ihl * 4);
++ nexthdr = *(u8 *)skb->data;
++ skb_pull(skb, sizeof(struct ip_comp_hdr));
++ skb->nh.raw += sizeof(struct ip_comp_hdr);
++ memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
++ iph = skb->nh.iph;
++ iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
++ iph->protocol = nexthdr;
++ skb->h.raw = skb->data;
++ err = ipcomp_decompress(x, skb);
++
++out:
++ return err;
++}
++
++static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
++{
++ int err, plen, dlen, ihlen;
++ struct iphdr *iph = skb->nh.iph;
++ struct ipcomp_data *ipcd = x->data;
++ u8 *start, *scratch = ipcd->scratch;
++
++ ihlen = iph->ihl * 4;
++ plen = skb->len - ihlen;
++ dlen = IPCOMP_SCRATCH_SIZE;
++ start = skb->data + ihlen;
++
++ err = crypto_comp_compress(ipcd->tfm, start, plen, scratch, &dlen);
++ if (err)
++ goto out;
++
++ if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
++ err = -EMSGSIZE;
++ goto out;
++ }
++
++ memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
++ pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
++
++out:
++ return err;
++}
++
++static int ipcomp_output(struct sk_buff *skb)
++{
++ int err;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct iphdr *iph;
++ struct ip_comp_hdr *ipch;
++ struct ipcomp_data *ipcd = x->data;
++ int hdr_len = 0;
++
++ iph = skb->nh.iph;
++ iph->tot_len = htons(skb->len);
++ hdr_len = iph->ihl * 4;
++ if ((skb->len - hdr_len) < ipcd->threshold) {
++ /* Don't bother compressing */
++ if (x->props.mode) {
++ ip_send_check(iph);
++ }
++ goto out_ok;
++ }
++
++ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
++ skb_linearize(skb, GFP_ATOMIC) != 0) {
++ err = -ENOMEM;
++ goto error;
++ }
++
++ err = ipcomp_compress(x, skb);
++ if (err) {
++ if (err == -EMSGSIZE) {
++ if (x->props.mode) {
++ iph = skb->nh.iph;
++ ip_send_check(iph);
++ }
++ goto out_ok;
++ }
++ goto error;
++ }
++
++ /* Install ipcomp header, convert into ipcomp datagram. */
++ iph = skb->nh.iph;
++ iph->tot_len = htons(skb->len);
++ ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
++ ipch->nexthdr = iph->protocol;
++ ipch->flags = 0;
++ ipch->cpi = htons((u16 )ntohl(x->id.spi));
++ iph->protocol = IPPROTO_COMP;
++ ip_send_check(iph);
++
++out_ok:
++ err = 0;
++
++error:
++ return err;
++}
++
++static void ipcomp4_err(struct sk_buff *skb, u32 info)
++{
++ u32 spi;
++ struct iphdr *iph = (struct iphdr *)skb->data;
++ struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
++ struct xfrm_state *x;
++
++ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
++ skb->h.icmph->code != ICMP_FRAG_NEEDED)
++ return;
++
++ spi = ntohl(ntohs(ipch->cpi));
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
++ spi, IPPROTO_COMP, AF_INET);
++ if (!x)
++ return;
++ printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
++ spi, NIPQUAD(iph->daddr));
++ xfrm_state_put(x);
++}
++
++/* We always hold one tunnel user reference to indicate a tunnel */
++static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
++{
++ struct xfrm_state *t;
++
++ t = xfrm_state_alloc();
++ if (t == NULL)
++ goto out;
++
++ t->id.proto = IPPROTO_IPIP;
++ t->id.spi = x->props.saddr.a4;
++ t->id.daddr.a4 = x->id.daddr.a4;
++ memcpy(&t->sel, &x->sel, sizeof(t->sel));
++ t->props.family = AF_INET;
++ t->props.mode = 1;
++ t->props.saddr.a4 = x->props.saddr.a4;
++ t->props.flags = x->props.flags;
++
++ t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
++ if (t->type == NULL)
++ goto error;
++
++ if (t->type->init_state(t, NULL))
++ goto error;
++
++ t->km.state = XFRM_STATE_VALID;
++ atomic_set(&t->tunnel_users, 1);
++out:
++ return t;
++
++error:
++ t->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(t);
++ t = NULL;
++ goto out;
++}
++
++/*
++ * Must be protected by xfrm_cfg_sem. State and tunnel user references are
++ * always incremented on success.
++ */
++static int ipcomp_tunnel_attach(struct xfrm_state *x)
++{
++ int err = 0;
++ struct xfrm_state *t;
++
++ t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
++ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
++ if (!t) {
++ t = ipcomp_tunnel_create(x);
++ if (!t) {
++ err = -EINVAL;
++ goto out;
++ }
++ xfrm_state_insert(t);
++ xfrm_state_hold(t);
++ }
++ x->tunnel = t;
++ atomic_inc(&t->tunnel_users);
++out:
++ return err;
++}
++
++static void ipcomp_free_data(struct ipcomp_data *ipcd)
++{
++ if (ipcd->tfm)
++ crypto_free_tfm(ipcd->tfm);
++ if (ipcd->scratch)
++ kfree(ipcd->scratch);
++}
++
++static void ipcomp_destroy(struct xfrm_state *x)
++{
++ struct ipcomp_data *ipcd = x->data;
++ if (!ipcd)
++ return;
++ xfrm_state_delete_tunnel(x);
++ ipcomp_free_data(ipcd);
++ kfree(ipcd);
++}
++
++static int ipcomp_init_state(struct xfrm_state *x, void *args)
++{
++ int err;
++ struct ipcomp_data *ipcd;
++ struct xfrm_algo_desc *calg_desc;
++
++ err = -EINVAL;
++ if (!x->calg)
++ goto out;
++
++ if (x->encap)
++ goto out;
++
++ err = -ENOMEM;
++ ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
++ if (!ipcd)
++ goto error;
++
++ memset(ipcd, 0, sizeof(*ipcd));
++ x->props.header_len = 0;
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct iphdr);
++
++ ipcd->scratch = kmalloc(IPCOMP_SCRATCH_SIZE, GFP_KERNEL);
++ if (!ipcd->scratch)
++ goto error;
++
++ ipcd->tfm = crypto_alloc_tfm(x->calg->alg_name, 0);
++ if (!ipcd->tfm)
++ goto error;
++
++ if (x->props.mode) {
++ err = ipcomp_tunnel_attach(x);
++ if (err)
++ goto error;
++ }
++
++ calg_desc = xfrm_calg_get_byname(x->calg->alg_name);
++ BUG_ON(!calg_desc);
++ ipcd->threshold = calg_desc->uinfo.comp.threshold;
++ x->data = ipcd;
++ err = 0;
++out:
++ return err;
++
++error:
++ if (ipcd) {
++ ipcomp_free_data(ipcd);
++ kfree(ipcd);
++ }
++ goto out;
++}
++
++static struct xfrm_type ipcomp_type = {
++ .description = "IPCOMP4",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_COMP,
++ .init_state = ipcomp_init_state,
++ .destructor = ipcomp_destroy,
++ .input = ipcomp_input,
++ .output = ipcomp_output
++};
++
++static struct inet_protocol ipcomp4_protocol = {
++ .handler = xfrm4_rcv,
++ .err_handler = ipcomp4_err,
++ .no_policy = 1,
++};
++
++static int __init ipcomp4_init(void)
++{
++ if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
++ printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
++ printk(KERN_INFO "ipcomp init: can't add protocol\n");
++ xfrm_unregister_type(&ipcomp_type, AF_INET);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++static void __exit ipcomp4_fini(void)
++{
++ if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
++ printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
++ if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
++ printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
++}
++
++module_init(ipcomp4_init);
++module_exit(ipcomp4_fini);
++
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
++MODULE_AUTHOR("James Morris <jmorris at intercode.com.au>");
++
+diff -Nru a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
+--- a/net/ipv4/ipconfig.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ipconfig.c 2005-02-13 21:25:09 +11:00
+@@ -655,7 +655,7 @@
+ struct net_device *dev = d->dev;
+ struct sk_buff *skb;
+ struct bootp_pkt *b;
+- int hh_len = (dev->hard_header_len + 15) & ~15;
++ int hh_len = LL_RESERVED_SPACE(dev);
+ struct iphdr *h;
+
+ /* Allocate packet */
+diff -Nru a/net/ipv4/ipip.c b/net/ipv4/ipip.c
+--- a/net/ipv4/ipip.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/ipip.c 2005-02-13 21:25:10 +11:00
+@@ -115,6 +115,7 @@
+ #include <net/protocol.h>
+ #include <net/ipip.h>
+ #include <net/inet_ecn.h>
++#include <net/xfrm.h>
+
+ #define HASH_SIZE 16
+ #define HASH(addr) ((addr^(addr>>4))&0xF)
+@@ -207,7 +208,7 @@
+ write_unlock_bh(&ipip_lock);
+ }
+
+-struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
++static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+ {
+ u32 remote = parms->iph.daddr;
+ u32 local = parms->iph.saddr;
+@@ -289,7 +290,7 @@
+ dev_put(dev);
+ }
+
+-void ipip_err(struct sk_buff *skb, u32 info)
++static void ipip_err(struct sk_buff *skb, void *__unused)
+ {
+ #ifndef I_WISH_WORLD_WERE_PERFECT
+
+@@ -355,6 +356,7 @@
+ int rel_code = 0;
+ int rel_info = 0;
+ struct sk_buff *skb2;
++ struct flowi fl;
+ struct rtable *rt;
+
+ if (len < hlen + sizeof(struct iphdr))
+@@ -417,7 +419,11 @@
+ skb2->nh.raw = skb2->data;
+
+ /* Try to guess incoming interface */
+- if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) {
++ memset(&fl, 0, sizeof(fl));
++ fl.fl4_daddr = eiph->saddr;
++ fl.fl4_tos = RT_TOS(eiph->tos);
++ fl.proto = IPPROTO_IPIP;
++	if (ip_route_output_key(&rt, &fl)) {
+ kfree_skb(skb2);
+ return;
+ }
+@@ -427,8 +433,11 @@
+ if (rt->rt_flags&RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+- if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) ||
+- rt->u.dst.dev->type != ARPHRD_IPGRE) {
++ fl.fl4_daddr = eiph->daddr;
++ fl.fl4_src = eiph->saddr;
++ fl.fl4_tos = eiph->tos;
++ if (ip_route_output_key(&rt, &fl) ||
++ rt->u.dst.dev->type != ARPHRD_TUNNEL) {
+ ip_rt_put(rt);
+ kfree_skb(skb2);
+ return;
+@@ -436,7 +445,7 @@
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+- skb2->dst->dev->type != ARPHRD_IPGRE) {
++ skb2->dst->dev->type != ARPHRD_TUNNEL) {
+ kfree_skb(skb2);
+ return;
+ }
+@@ -444,11 +453,11 @@
+
+ /* change mtu on this route */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+- if (rel_info > skb2->dst->pmtu) {
++ if (rel_info > dst_pmtu(skb2->dst)) {
+ kfree_skb(skb2);
+ return;
+ }
+- skb2->dst->pmtu = rel_info;
++ skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+ rel_info = htonl(rel_info);
+ } else if (type == ICMP_TIME_EXCEEDED) {
+ struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+@@ -473,7 +482,7 @@
+ IP_ECN_set_ce(inner_iph);
+ }
+
+-int ipip_rcv(struct sk_buff *skb)
++static int ipip_rcv(struct sk_buff *skb)
+ {
+ struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+@@ -482,14 +491,23 @@
+ goto out;
+
+ iph = skb->nh.iph;
+- skb->mac.raw = skb->nh.raw;
+- skb->nh.raw = skb->data;
+- memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+- skb->protocol = htons(ETH_P_IP);
+- skb->pkt_type = PACKET_HOST;
+
+ read_lock(&ipip_lock);
+ if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
++ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ read_unlock(&ipip_lock);
++ kfree_skb(skb);
++ return 0;
++ }
++
++ secpath_reset(skb);
++
++ skb->mac.raw = skb->nh.raw;
++ skb->nh.raw = skb->data;
++ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
++ skb->protocol = htons(ETH_P_IP);
++ skb->pkt_type = PACKET_HOST;
++
+ tunnel->stat.rx_packets++;
+ tunnel->stat.rx_bytes += skb->len;
+ skb->dev = tunnel->dev;
+@@ -503,16 +521,8 @@
+ }
+ read_unlock(&ipip_lock);
+
+- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+ out:
+- kfree_skb(skb);
+- return 0;
+-}
+-
+-/* Need this wrapper because NF_HOOK takes the function address */
+-static inline int do_ip_send(struct sk_buff *skb)
+-{
+- return ip_send(skb);
++ return -1;
+ }
+
+ /*
+@@ -556,9 +566,17 @@
+ goto tx_error_icmp;
+ }
+
+- if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
+- tunnel->stat.tx_carrier_errors++;
+- goto tx_error_icmp;
++ {
++ struct flowi fl = { .oif = tunnel->parms.link,
++ .nl_u = { .ip4_u =
++ { .daddr = dst,
++ .saddr = tiph->saddr,
++ .tos = RT_TOS(tos) } },
++ .proto = IPPROTO_IPIP };
++ if (ip_route_output_key(&rt, &fl)) {
++ tunnel->stat.tx_carrier_errors++;
++ goto tx_error_icmp;
++ }
+ }
+ tdev = rt->u.dst.dev;
+
+@@ -569,17 +587,17 @@
+ }
+
+ if (tiph->frag_off)
+- mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
++ mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+ else
+- mtu = skb->dst ? skb->dst->pmtu : dev->mtu;
++ mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+
+ if (mtu < 68) {
+ tunnel->stat.collisions++;
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+- if (skb->dst && mtu < skb->dst->pmtu)
+- skb->dst->pmtu = mtu;
++ if (skb->dst)
++ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ df |= (old_iph->frag_off&htons(IP_DF));
+
+@@ -600,7 +618,7 @@
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+- max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
++ max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+@@ -812,8 +830,14 @@
+ ipip_tunnel_init_gen(dev);
+
+ if (iph->daddr) {
++ struct flowi fl = { .oif = tunnel->parms.link,
++ .nl_u = { .ip4_u =
++ { .daddr = iph->daddr,
++ .saddr = iph->saddr,
++ .tos = RT_TOS(iph->tos) } },
++ .proto = IPPROTO_IPIP };
+ struct rtable *rt;
+- if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
++ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+@@ -846,7 +870,7 @@
+ }
+ #endif
+
+-int __init ipip_fb_tunnel_init(struct net_device *dev)
++static int __init ipip_fb_tunnel_init(struct net_device *dev)
+ {
+ struct iphdr *iph;
+
+@@ -866,11 +890,9 @@
+ return 0;
+ }
+
+-static struct inet_protocol ipip_protocol = {
+- handler: ipip_rcv,
+- err_handler: ipip_err,
+- protocol: IPPROTO_IPIP,
+- name: "IPIP"
++static struct xfrm_tunnel ipip_handler = {
++ .handler = ipip_rcv,
++ .err_handler = ipip_err,
+ };
+
+ static char banner[] __initdata =
+@@ -880,16 +902,20 @@
+ {
+ printk(banner);
+
++ if (xfrm4_tunnel_register(&ipip_handler) < 0) {
++ printk(KERN_INFO "ipip init: can't register tunnel\n");
++ return -EAGAIN;
++ }
++
+ ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel;
+ register_netdev(&ipip_fb_tunnel_dev);
+- inet_add_protocol(&ipip_protocol);
+ return 0;
+ }
+
+ static void __exit ipip_fini(void)
+ {
+- if ( inet_del_protocol(&ipip_protocol) < 0 )
+- printk(KERN_INFO "ipip close: can't remove protocol\n");
++ if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
++ printk(KERN_INFO "ipip close: can't deregister tunnel\n");
+
+ unregister_netdev(&ipip_fb_tunnel_dev);
+ }
+diff -Nru a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
+--- a/net/ipv4/ipmr.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ipmr.c 2005-02-13 21:25:09 +11:00
+@@ -108,7 +108,7 @@
+ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
+ static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+
+-extern struct inet_protocol pim_protocol;
++static struct inet_protocol pim_protocol;
+
+ static struct timer_list ipmr_expire_timer;
+
+@@ -928,23 +928,28 @@
+ #ifdef CONFIG_IP_PIMSM
+ case MRT_PIM:
+ {
+- int v;
++ int v, ret;
+ if(get_user(v,(int *)optval))
+ return -EFAULT;
+ v = (v)?1:0;
+ rtnl_lock();
++ ret = 0;
+ if (v != mroute_do_pim) {
+ mroute_do_pim = v;
+ mroute_do_assert = v;
+ #ifdef CONFIG_IP_PIMSM_V2
+ if (mroute_do_pim)
+- inet_add_protocol(&pim_protocol);
++ ret = inet_add_protocol(&pim_protocol,
++ IPPROTO_PIM);
+ else
+- inet_del_protocol(&pim_protocol);
++ ret = inet_del_protocol(&pim_protocol,
++ IPPROTO_PIM);
++ if (ret < 0)
++ ret = -EAGAIN;
+ #endif
+ }
+ rtnl_unlock();
+- return 0;
++ return ret;
+ }
+ #endif
+ /*
+@@ -1102,16 +1107,14 @@
+
+ static inline int ipmr_forward_finish(struct sk_buff *skb)
+ {
+- struct ip_options *opt = &(IPCB(skb)->opt);
+- struct dst_entry *dst = skb->dst;
++ struct ip_options * opt = &(IPCB(skb)->opt);
++
++ IP_INC_STATS_BH(IpForwDatagrams);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+- if (skb->len <= dst->pmtu)
+- return dst->output(skb);
+- else
+- return ip_fragment(skb, dst->output);
++ return dst_output(skb);
+ }
+
+ /*
+@@ -1143,17 +1146,28 @@
+ #endif
+
+ if (vif->flags&VIFF_TUNNEL) {
+- if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link))
++ struct flowi fl = { .oif = vif->link,
++ .nl_u = { .ip4_u =
++ { .daddr = vif->remote,
++ .saddr = vif->local,
++ .tos = RT_TOS(iph->tos) } },
++ .proto = IPPROTO_IPIP };
++ if (ip_route_output_key(&rt, &fl))
+ return;
+ encap = sizeof(struct iphdr);
+ } else {
+- if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link))
++ struct flowi fl = { .oif = vif->link,
++ .nl_u = { .ip4_u =
++ { .daddr = iph->daddr,
++ .tos = RT_TOS(iph->tos) } },
++ .proto = IPPROTO_IPIP };
++ if (ip_route_output_key(&rt, &fl))
+ return;
+ }
+
+ dev = rt->u.dst.dev;
+
+- if (skb->len+encap > rt->u.dst.pmtu && (ntohs(iph->frag_off) & IP_DF)) {
++ if (skb->len+encap > dst_pmtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ /* Do not fragment multicasts. Alas, IPv4 does not
+ allow to send ICMP, so that packets will disappear
+ to blackhole.
+@@ -1164,7 +1178,7 @@
+ return;
+ }
+
+- encap += dev->hard_header_len;
++ encap += LL_RESERVED_SPACE(dev);
+
+ if (skb_headroom(skb) < encap || skb_cloned(skb) || !last)
+ skb2 = skb_realloc_headroom(skb, (encap + 15)&~15);
+@@ -1241,7 +1255,7 @@
+ if (vif_table[vif].dev != skb->dev) {
+ int true_vifi;
+
+- if (((struct rtable*)skb->dst)->key.iif == 0) {
++ if (((struct rtable*)skb->dst)->fl.iif == 0) {
+ /* It is our own packet, looped back.
+ Very complicated situation...
+
+@@ -1391,19 +1405,15 @@
+ struct net_device *reg_dev = NULL;
+
+ if (skb_is_nonlinear(skb)) {
+- if (skb_linearize(skb, GFP_ATOMIC) != 0) {
+- kfree_skb(skb);
+- return -ENOMEM;
+- }
++ if (skb_linearize(skb, GFP_ATOMIC) != 0)
++ goto drop;
+ pim = (struct igmphdr*)skb->h.raw;
+ }
+
+ if (!mroute_do_pim ||
+ skb->len < sizeof(*pim) + sizeof(*encap) ||
+- pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
++ goto drop;
+
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
+ /*
+@@ -1413,11 +1423,9 @@
+ c. packet is not truncated
+ */
+ if (!MULTICAST(encap->daddr) ||
+- ntohs(encap->tot_len) == 0 ||
+- ntohs(encap->tot_len) + sizeof(*pim) > skb->len) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ encap->tot_len == 0 ||
++ ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
++ goto drop;
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+@@ -1426,10 +1434,8 @@
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+- if (reg_dev == NULL) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ if (reg_dev == NULL)
++ goto drop;
+
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+@@ -1447,6 +1453,9 @@
+ netif_rx(skb);
+ dev_put(reg_dev);
+ return 0;
++ drop:
++ kfree_skb(skb);
++ return 0;
+ }
+ #endif
+
+@@ -1458,10 +1467,8 @@
+ struct net_device *reg_dev = NULL;
+
+ if (skb_is_nonlinear(skb)) {
+- if (skb_linearize(skb, GFP_ATOMIC) != 0) {
+- kfree_skb(skb);
+- return -ENOMEM;
+- }
++ if (skb_linearize(skb, GFP_ATOMIC) != 0)
++ goto drop;
+ pim = (struct pimreghdr*)skb->h.raw;
+ }
+
+@@ -1469,19 +1476,15 @@
+ pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
+ (pim->flags&PIM_NULL_REGISTER) ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+- ip_compute_csum((void *)pim, skb->len))) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ ip_compute_csum((void *)pim, skb->len)))
++ goto drop;
+
+ /* check if the inner packet is destined to mcast group */
+ encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
+ if (!MULTICAST(encap->daddr) ||
+- ntohs(encap->tot_len) == 0 ||
+- ntohs(encap->tot_len) + sizeof(*pim) > skb->len) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ encap->tot_len == 0 ||
++ ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
++ goto drop;
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+@@ -1490,10 +1493,8 @@
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+- if (reg_dev == NULL) {
+- kfree_skb(skb);
+- return -EINVAL;
+- }
++ if (reg_dev == NULL)
++ goto drop;
+
+ skb->mac.raw = skb->nh.raw;
+ skb_pull(skb, (u8*)encap - skb->data);
+@@ -1511,6 +1512,9 @@
+ netif_rx(skb);
+ dev_put(reg_dev);
+ return 0;
++ drop:
++ kfree_skb(skb);
++ return 0;
+ }
+ #endif
+
+@@ -1723,15 +1727,8 @@
+ #endif
+
+ #ifdef CONFIG_IP_PIMSM_V2
+-struct inet_protocol pim_protocol =
+-{
+- pim_rcv, /* PIM handler */
+- NULL, /* PIM error control */
+- NULL, /* next */
+- IPPROTO_PIM, /* protocol ID */
+- 0, /* copy */
+- NULL, /* data */
+- "PIM" /* name */
++static struct inet_protocol pim_protocol = {
++ .handler = pim_rcv,
+ };
+ #endif
+
+diff -Nru a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
+--- a/net/ipv4/ipvs/ip_vs_conn.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ipvs/ip_vs_conn.c 2005-02-13 21:25:09 +11:00
+@@ -606,17 +606,25 @@
+ struct iphdr *iph = skb->nh.iph;
+ u8 tos = iph->tos;
+ int mtu;
++ struct flowi fl = {
++ .oif = 0,
++ .nl_u = {
++ .ip4_u = {
++ .daddr = iph->daddr,
++ .saddr = 0,
++ .tos = RT_TOS(tos), } },
++ };
+
+ EnterFunction(10);
+
+- if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
++ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+ "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+ goto tx_error_icmp;
+ }
+
+ /* MTU checking */
+- mtu = rt->u.dst.pmtu;
++ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+@@ -642,8 +650,7 @@
+ #ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+ #endif /* CONFIG_NETFILTER_DEBUG */
+- skb->nfcache |= NFC_IPVS_PROPERTY;
+- ip_send(skb);
++ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+@@ -742,7 +749,7 @@
+ goto tx_error_icmp;
+
+ /* MTU checking */
+- mtu = rt->u.dst.pmtu;
++ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+@@ -814,8 +821,7 @@
+ #ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+ #endif /* CONFIG_NETFILTER_DEBUG */
+- skb->nfcache |= NFC_IPVS_PROPERTY;
+- ip_send(skb);
++ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+@@ -870,14 +876,14 @@
+
+ tdev = rt->u.dst.dev;
+
+- mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
++ mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+ goto tx_error;
+ }
+- if (skb->dst && mtu < skb->dst->pmtu)
+- skb->dst->pmtu = mtu;
++ if (skb->dst)
++ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+@@ -939,8 +945,7 @@
+ #ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+ #endif /* CONFIG_NETFILTER_DEBUG */
+- skb->nfcache |= NFC_IPVS_PROPERTY;
+- ip_send(skb);
++ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+
+@@ -969,7 +974,7 @@
+ goto tx_error_icmp;
+
+ /* MTU checking */
+- mtu = rt->u.dst.pmtu;
++ mtu = dst_pmtu(&rt->u.dst);
+ if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+@@ -995,8 +1000,7 @@
+ #ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+ #endif /* CONFIG_NETFILTER_DEBUG */
+- skb->nfcache |= NFC_IPVS_PROPERTY;
+- ip_send(skb);
++ IP_VS_XMIT(skb, rt);
+
+ #if 0000
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+diff -Nru a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
+--- a/net/ipv4/ipvs/ip_vs_core.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/ipvs/ip_vs_core.c 2005-02-13 21:25:09 +11:00
+@@ -953,7 +953,7 @@
+ goto tx_error_icmp;
+
+ /* MTU checking */
+- mtu = rt->u.dst.pmtu;
++ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+@@ -1001,7 +1001,7 @@
+ #ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+ #endif /* CONFIG_NETFILTER_DEBUG */
+- ip_send(skb);
++ IP_VS_XMIT(skb, rt);
+ ip_vs_conn_put(cp);
+ return NF_STOLEN;
+
+diff -Nru a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
+--- a/net/ipv4/netfilter/ip_conntrack_standalone.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ip_conntrack_standalone.c 2005-02-13 21:25:09 +11:00
+@@ -204,7 +204,7 @@
+ /* Local packets are never produced too large for their
+ interface. We degfragment them at LOCAL_OUT, however,
+ so we have to refragment them here. */
+- if ((*pskb)->len > rt->u.dst.pmtu) {
++ if ((*pskb)->len > dst_pmtu(&rt->u.dst)) {
+ /* No hook can be after us, so this should be OK. */
+ ip_fragment(*pskb, okfn);
+ return NF_STOLEN;
+diff -Nru a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c
+--- a/net/ipv4/netfilter/ip_fw_compat_masq.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ip_fw_compat_masq.c 2005-02-13 21:25:09 +11:00
+@@ -69,12 +69,13 @@
+ /* Setup the masquerade, if not already */
+ if (!info->initialized) {
+ u_int32_t newsrc;
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = iph->daddr } } };
+ struct rtable *rt;
+ struct ip_nat_multi_range range;
+
+ /* Pass 0 instead of saddr, since it's going to be changed
+ anyway. */
+- if (ip_route_output(&rt, iph->daddr, 0, 0, 0) != 0) {
++ if (ip_route_output_key(&rt, &fl) != 0) {
+ DEBUGP("ipnat_rule_masquerade: Can't reroute.\n");
+ return NF_DROP;
+ }
+diff -Nru a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
+--- a/net/ipv4/netfilter/ip_nat_core.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ip_nat_core.c 2005-02-13 21:25:09 +11:00
+@@ -203,10 +203,11 @@
+ static int
+ do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
+ {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
+ struct rtable *rt;
+
+ /* FIXME: IPTOS_TOS(iph->tos) --RR */
+- if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
++ if (ip_route_output_key(&rt, &fl) != 0) {
+ DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
+ NIPQUAD(var_ip));
+ return 0;
+diff -Nru a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
+--- a/net/ipv4/netfilter/ipt_MASQUERADE.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ipt_MASQUERADE.c 2005-02-13 21:25:09 +11:00
+@@ -69,7 +69,6 @@
+ struct ip_nat_multi_range newrange;
+ u_int32_t newsrc;
+ struct rtable *rt;
+- struct rt_key key;
+
+ IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+@@ -84,26 +83,29 @@
+
+ mr = targinfo;
+
+- key.dst = (*pskb)->nh.iph->daddr;
+- key.src = 0; /* Unknown: that's what we're trying to establish */
+- key.tos = RT_TOS((*pskb)->nh.iph->tos)|RTO_CONN;
+- key.oif = 0;
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = (*pskb)->nh.iph->daddr,
++ .tos = (RT_TOS((*pskb)->nh.iph->tos) |
++ RTO_CONN),
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- key.fwmark = (*pskb)->nfmark;
++ .fwmark = (*pskb)->nfmark
+ #endif
+- if (ip_route_output_key(&rt, &key) != 0) {
+- /* Funky routing can do this. */
+- if (net_ratelimit())
+- printk("MASQUERADE:"
+- " No route: Rusty's brain broke!\n");
+- return NF_DROP;
+- }
+- if (rt->u.dst.dev != out) {
+- if (net_ratelimit())
+- printk("MASQUERADE:"
+- " Route sent us somewhere else.\n");
++ } } };
++ if (ip_route_output_key(&rt, &fl) != 0) {
++ /* Funky routing can do this. */
++ if (net_ratelimit())
++ printk("MASQUERADE:"
++ " No route: Rusty's brain broke!\n");
++ return NF_DROP;
++ }
++ if (rt->u.dst.dev != out) {
++ if (net_ratelimit())
++ printk("MASQUERADE:"
++ " Route sent us somewhere else.\n");
+ ip_rt_put(rt);
+- return NF_DROP;
++ return NF_DROP;
++ }
+ }
+
+ newsrc = rt->rt_src;
+diff -Nru a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c
+--- a/net/ipv4/netfilter/ipt_MIRROR.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/netfilter/ipt_MIRROR.c 2005-02-13 21:25:10 +11:00
+@@ -44,21 +44,21 @@
+ {
+ struct iphdr *iph = skb->nh.iph;
+ struct dst_entry *odst;
+- struct rt_key key = {};
++ struct flowi fl = {};
+ struct rtable *rt;
+
+ if (local) {
+- key.dst = iph->saddr;
+- key.src = iph->daddr;
+- key.tos = RT_TOS(iph->tos);
++ fl.nl_u.ip4_u.daddr = iph->saddr;
++ fl.nl_u.ip4_u.saddr = iph->daddr;
++ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+
+- if (ip_route_output_key(&rt, &key) != 0)
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+- key.dst = iph->daddr;
+- if (ip_route_output_key(&rt, &key) != 0)
++ fl.nl_u.ip4_u.daddr = iph->daddr;
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+
+ odst = skb->dst;
+diff -Nru a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
+--- a/net/ipv4/netfilter/ipt_REJECT.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ipt_REJECT.c 2005-02-13 21:25:09 +11:00
+@@ -26,22 +26,22 @@
+ {
+ struct iphdr *iph = skb->nh.iph;
+ struct dst_entry *odst;
+- struct rt_key key = {};
++ struct flowi fl = {};
+ struct rtable *rt;
+
+ if (hook != NF_IP_FORWARD) {
+- key.dst = iph->saddr;
++ fl.nl_u.ip4_u.daddr = iph->saddr;
+ if (hook == NF_IP_LOCAL_IN)
+- key.src = iph->daddr;
+- key.tos = RT_TOS(iph->tos);
++ fl.nl_u.ip4_u.saddr = iph->daddr;
++ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+
+- if (ip_route_output_key(&rt, &key) != 0)
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+- key.dst = iph->daddr;
+- if (ip_route_output_key(&rt, &key) != 0)
++ fl.nl_u.ip4_u.daddr = iph->daddr;
++ if (ip_route_output_key(&rt, &fl) != 0)
+ return NULL;
+
+ odst = skb->dst;
+@@ -172,7 +172,7 @@
+ nskb->nh.iph->ihl);
+
+ /* "Never happens" */
+- if (nskb->len > nskb->dst->pmtu)
++ if (nskb->len > dst_pmtu(nskb->dst))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+@@ -252,14 +252,19 @@
+
+ tos = (iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL;
+
+- if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0))
+- return;
+-
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = iph->saddr,
++ .saddr = saddr,
++ .tos = RT_TOS(tos) } } };
++ if (ip_route_output_key(&rt, &fl))
++ return;
++ }
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+ length = skb_in->len + sizeof(struct iphdr) + sizeof(struct icmphdr);
+
+- if (length > rt->u.dst.pmtu)
+- length = rt->u.dst.pmtu;
++ if (length > dst_pmtu(&rt->u.dst))
++ length = dst_pmtu(&rt->u.dst);
+ if (length > 576)
+ length = 576;
+
+diff -Nru a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
+--- a/net/ipv4/netfilter/ipt_TCPMSS.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ipt_TCPMSS.c 2005-02-13 21:25:09 +11:00
+@@ -85,14 +85,14 @@
+ return NF_DROP; /* or IPT_CONTINUE ?? */
+ }
+
+- if((*pskb)->dst->pmtu <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
++ if(dst_pmtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+- "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", (*pskb)->dst->pmtu);
++ "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_pmtu((*pskb)->dst));
+ return NF_DROP; /* or IPT_CONTINUE ?? */
+ }
+
+- newmss = (*pskb)->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
++ newmss = dst_pmtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
+ } else
+ newmss = tcpmssinfo->mss;
+
+diff -Nru a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
+--- a/net/ipv4/netfilter/ipt_multiport.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/netfilter/ipt_multiport.c 2005-02-13 21:25:09 +11:00
+@@ -4,6 +4,7 @@
+ #include <linux/types.h>
+ #include <linux/udp.h>
+ #include <linux/skbuff.h>
++#include <linux/socket.h>
+
+ #include <linux/netfilter_ipv4/ipt_multiport.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+diff -Nru a/net/ipv4/proc.c b/net/ipv4/proc.c
+--- a/net/ipv4/proc.c 2005-02-13 21:25:08 +11:00
++++ b/net/ipv4/proc.c 2005-02-13 21:25:08 +11:00
+@@ -116,7 +116,6 @@
+
+ int snmp_get_info(char *buffer, char **start, off_t offset, int length)
+ {
+- extern int sysctl_ip_default_ttl;
+ int len, i;
+
+ len = sprintf (buffer,
+diff -Nru a/net/ipv4/protocol.c b/net/ipv4/protocol.c
+--- a/net/ipv4/protocol.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/protocol.c 2005-02-13 21:25:09 +11:00
+@@ -48,134 +48,52 @@
+ #include <net/ipip.h>
+ #include <linux/igmp.h>
+
+-#define IPPROTO_PREVIOUS NULL
+-
+-#ifdef CONFIG_IP_MULTICAST
+-
+-static struct inet_protocol igmp_protocol = {
+- handler: igmp_rcv,
+- next: IPPROTO_PREVIOUS,
+- protocol: IPPROTO_IGMP,
+- name: "IGMP"
+-};
+-
+-#undef IPPROTO_PREVIOUS
+-#define IPPROTO_PREVIOUS &igmp_protocol
+-
+-#endif
+-
+-static struct inet_protocol tcp_protocol = {
+- handler: tcp_v4_rcv,
+- err_handler: tcp_v4_err,
+- next: IPPROTO_PREVIOUS,
+- protocol: IPPROTO_TCP,
+- name: "TCP"
+-};
+-
+-#undef IPPROTO_PREVIOUS
+-#define IPPROTO_PREVIOUS &tcp_protocol
+-
+-static struct inet_protocol udp_protocol = {
+- handler: udp_rcv,
+- err_handler: udp_err,
+- next: IPPROTO_PREVIOUS,
+- protocol: IPPROTO_UDP,
+- name: "UDP"
+-};
+-
+-#undef IPPROTO_PREVIOUS
+-#define IPPROTO_PREVIOUS &udp_protocol
+-
+-static struct inet_protocol icmp_protocol = {
+- handler: icmp_rcv,
+- next: IPPROTO_PREVIOUS,
+- protocol: IPPROTO_ICMP,
+- name: "ICMP"
+-};
+-
+-#undef IPPROTO_PREVIOUS
+-#define IPPROTO_PREVIOUS &icmp_protocol
+-
+-
+-struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS;
+-
+ struct inet_protocol *inet_protos[MAX_INET_PROTOS];
+
+ /*
+ * Add a protocol handler to the hash tables
+ */
+
+-void inet_add_protocol(struct inet_protocol *prot)
++int inet_add_protocol(struct inet_protocol *prot, unsigned char protocol)
+ {
+- unsigned char hash;
+- struct inet_protocol *p2;
++ int hash, ret;
++
++ hash = protocol & (MAX_INET_PROTOS - 1);
+
+- hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+- prot ->next = inet_protos[hash];
+- inet_protos[hash] = prot;
+- prot->copy = 0;
+-
+- /*
+- * Set the copy bit if we need to.
+- */
+-
+- p2 = (struct inet_protocol *) prot->next;
+- while (p2) {
+- if (p2->protocol == prot->protocol) {
+- prot->copy = 1;
+- break;
+- }
+- p2 = (struct inet_protocol *) p2->next;
++
++ if (inet_protos[hash]) {
++ ret = -1;
++ } else {
++ inet_protos[hash] = prot;
++ ret = 0;
+ }
++
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
++
++ return ret;
+ }
+
+ /*
+ * Remove a protocol from the hash tables.
+ */
+
+-int inet_del_protocol(struct inet_protocol *prot)
++int inet_del_protocol(struct inet_protocol *prot, unsigned char protocol)
+ {
+- struct inet_protocol *p;
+- struct inet_protocol *lp = NULL;
+- unsigned char hash;
+-
+- hash = prot->protocol & (MAX_INET_PROTOS - 1);
+- br_write_lock_bh(BR_NETPROTO_LOCK);
+- if (prot == inet_protos[hash]) {
+- inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next;
+- br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return 0;
+- }
++ int hash, ret;
+
+- p = (struct inet_protocol *) inet_protos[hash];
++ hash = protocol & (MAX_INET_PROTOS - 1);
+
+- if (p != NULL && p->protocol == prot->protocol)
+- lp = p;
+-
+- while (p) {
+- /*
+- * We have to worry if the protocol being deleted is
+- * the last one on the list, then we may need to reset
+- * someone's copied bit.
+- */
+- if (p->next && p->next == prot) {
+- /*
+- * if we are the last one with this protocol and
+- * there is a previous one, reset its copy bit.
+- */
+- if (prot->copy == 0 && lp != NULL)
+- lp->copy = 0;
+- p->next = prot->next;
+- br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return 0;
+- }
+- if (p->next != NULL && p->next->protocol == prot->protocol)
+- lp = p->next;
++ br_write_lock_bh(BR_NETPROTO_LOCK);
+
+- p = (struct inet_protocol *) p->next;
++ if (inet_protos[hash] == prot) {
++ inet_protos[hash] = NULL;
++ ret = 0;
++ } else {
++ ret = -1;
+ }
++
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return -1;
++
++ return ret;
+ }
+diff -Nru a/net/ipv4/raw.c b/net/ipv4/raw.c
+--- a/net/ipv4/raw.c 2005-02-13 21:25:08 +11:00
++++ b/net/ipv4/raw.c 2005-02-13 21:25:08 +11:00
+@@ -64,6 +64,8 @@
+ #include <net/raw.h>
+ #include <net/inet_common.h>
+ #include <net/checksum.h>
++#include <net/xfrm.h>
++#include <linux/netfilter_ipv4.h>
+
+ struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
+ rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED;
+@@ -132,13 +134,12 @@
+ }
+
+ /* IP input processing comes here for RAW socket delivery.
+- * This is fun as to avoid copies we want to make no surplus
+- * copies.
++ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+-struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
++void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+ {
+ struct sock *sk;
+
+@@ -150,28 +151,19 @@
+ skb->dev->ifindex);
+
+ while (sk) {
+- struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
+- iph->saddr, iph->daddr,
+- skb->dev->ifindex);
+- if (iph->protocol != IPPROTO_ICMP ||
+- !icmp_filter(sk, skb)) {
+- struct sk_buff *clone;
++ if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
++ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+- if (!sknext)
+- break;
+- clone = skb_clone(skb, GFP_ATOMIC);
+ /* Not releasing hash table! */
+ if (clone)
+ raw_rcv(sk, clone);
+ }
+- sk = sknext;
++ sk = __raw_v4_lookup(sk->next, iph->protocol,
++ iph->saddr, iph->daddr,
++ skb->dev->ifindex);
+ }
+ out:
+- if (sk)
+- sock_hold(sk);
+ read_unlock(&raw_v4_lock);
+-
+- return sk;
+ }
+
+ void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
+@@ -244,71 +236,137 @@
+
+ int raw_rcv(struct sock *sk, struct sk_buff *skb)
+ {
++ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return NET_RX_DROP;
++ }
++
+ skb_push(skb, skb->data - skb->nh.raw);
+
+ raw_rcv_skb(sk, skb);
+ return 0;
+ }
+
+-struct rawfakehdr
+-{
+- struct iovec *iov;
+- u32 saddr;
+- struct dst_entry *dst;
+-};
++static int raw_send_hdrinc(struct sock *sk, void *from, int length,
++ struct rtable *rt,
++ unsigned int flags)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ int hh_len;
++ struct iphdr *iph;
++ struct sk_buff *skb;
++ int err;
+
+-/*
+- * Send a RAW IP packet.
+- */
++ if (length > rt->u.dst.dev->mtu) {
++ ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport,
++ rt->u.dst.dev->mtu);
++ return -EMSGSIZE;
++ }
++ if (flags&MSG_PROBE)
++ goto out;
+
+-/*
+- * Callback support is trivial for SOCK_RAW
+- */
+-
+-static int raw_getfrag(const void *p, char *to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
+-{
+- struct rawfakehdr *rfh = (struct rawfakehdr *) p;
+- return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
+-}
++ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+-/*
+- * IPPROTO_RAW needs extra work.
+- */
+-
+-static int raw_getrawfrag(const void *p, char *to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
+-{
+- struct rawfakehdr *rfh = (struct rawfakehdr *) p;
++ skb = sock_alloc_send_skb(sk, length+hh_len+15,
++ flags&MSG_DONTWAIT, &err);
++ if (skb == NULL)
++ goto error;
++ skb_reserve(skb, hh_len);
++
++ skb->priority = sk->priority;
++ skb->dst = dst_clone(&rt->u.dst);
++
++ skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+
+- if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
+- return -EFAULT;
++ skb->ip_summed = CHECKSUM_NONE;
+
+- if (!offset) {
+- struct iphdr *iph = (struct iphdr *)to;
++ skb->h.raw = skb->nh.raw;
++ err = memcpy_fromiovecend((void *)iph, from, 0, length);
++ if (err)
++ goto error_fault;
++
++ /* We don't modify invalid header */
++ if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
+ if (!iph->saddr)
+- iph->saddr = rfh->saddr;
++ iph->saddr = rt->rt_src;
+ iph->check = 0;
+- iph->tot_len = htons(fraglen); /* This is right as you can't
+- frag RAW packets */
+- /*
+- * Deliberate breach of modularity to keep
+- * ip_build_xmit clean (well less messy).
+- */
++ iph->tot_len = htons(length);
+ if (!iph->id)
+- ip_select_ident(iph, rfh->dst, NULL);
++ ip_select_ident(iph, &rt->u.dst, NULL);
++
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
++
++ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
++ dst_output);
++ if (err > 0)
++ err = inet->recverr ? net_xmit_errno(err) : 0;
++ if (err)
++ goto error;
++out:
+ return 0;
++
++error_fault:
++ err = -EFAULT;
++ kfree_skb(skb);
++error:
++ IP_INC_STATS(IpOutDiscards);
++ return err;
++}
++
++static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
++{
++ struct iovec *iov;
++ u8 __user *type = NULL;
++ u8 __user *code = NULL;
++ int probed = 0;
++ int i;
++
++ if (!msg->msg_iov)
++ return;
++
++ for (i = 0; i < msg->msg_iovlen; i++) {
++ iov = &msg->msg_iov[i];
++ if (!iov)
++ continue;
++
++ switch (fl->proto) {
++ case IPPROTO_ICMP:
++ /* check if one-byte field is readable or not. */
++ if (iov->iov_base && iov->iov_len < 1)
++ break;
++
++ if (!type) {
++ type = iov->iov_base;
++ /* check if code field is readable or not. */
++ if (iov->iov_len > 1)
++ code = type + 1;
++ } else if (!code)
++ code = iov->iov_base;
++
++ if (type && code) {
++ get_user(fl->fl_icmp_type, type);
++ __get_user(fl->fl_icmp_code, code);
++ probed = 1;
++ }
++ break;
++ default:
++ probed = 1;
++ break;
++ }
++ if (probed)
++ break;
++ }
+ }
+
+ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+ {
++ struct inet_opt *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+- struct rawfakehdr rfh;
+ struct rtable *rt = NULL;
+ int free = 0;
+ u32 daddr;
++ u32 saddr;
+ u8 tos;
+ int err;
+
+@@ -378,7 +436,7 @@
+ free = 1;
+ }
+
+- rfh.saddr = ipc.addr;
++ saddr = ipc.addr;
+ ipc.addr = daddr;
+
+ if (!ipc.opt)
+@@ -404,12 +462,22 @@
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = sk->protinfo.af_inet.mc_index;
+- if (!rfh.saddr)
+- rfh.saddr = sk->protinfo.af_inet.mc_addr;
++ if (!saddr)
++ saddr = sk->protinfo.af_inet.mc_addr;
+ }
+
+- err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);
++ {
++ struct flowi fl = { .oif = ipc.oif,
++ .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = saddr,
++ .tos = tos } },
++ .proto = inet->hdrincl ? IPPROTO_RAW : sk->protocol };
++ if (!inet->hdrincl)
++ raw_probe_proto_opt(&fl, msg);
+
++ err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
++ }
+ if (err)
+ goto done;
+
+@@ -421,14 +489,22 @@
+ goto do_confirm;
+ back_from_confirm:
+
+- rfh.iov = msg->msg_iov;
+- rfh.saddr = rt->rt_src;
+- rfh.dst = &rt->u.dst;
+- if (!ipc.addr)
+- ipc.addr = rt->rt_dst;
+- err = ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag :
+- raw_getfrag, &rfh, len, &ipc, rt, msg->msg_flags);
+-
++ if (inet->hdrincl)
++ err = raw_send_hdrinc(sk, msg->msg_iov, len,
++ rt, msg->msg_flags);
++
++ else {
++ if (!ipc.addr)
++ ipc.addr = rt->rt_dst;
++ lock_sock(sk);
++ err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
++ &ipc, rt, msg->msg_flags);
++ if (err)
++ ip_flush_pending_frames(sk);
++ else if (!(msg->msg_flags & MSG_MORE))
++ err = ip_push_pending_frames(sk);
++ release_sock(sk);
++ }
+ done:
+ if (free)
+ kfree(ipc.opt);
+diff -Nru a/net/ipv4/route.c b/net/ipv4/route.c
+--- a/net/ipv4/route.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv4/route.c 2005-02-13 21:25:10 +11:00
+@@ -95,6 +95,7 @@
+ #include <net/arp.h>
+ #include <net/tcp.h>
+ #include <net/icmp.h>
++#include <net/xfrm.h>
+ #ifdef CONFIG_SYSCTL
+ #include <linux/sysctl.h>
+ #endif
+@@ -132,11 +133,10 @@
+ */
+
+ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+-static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
+- struct sk_buff *skb);
+ static void ipv4_dst_destroy(struct dst_entry *dst);
+ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+ static void ipv4_link_failure(struct sk_buff *skb);
++static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+ static int rt_garbage_collect(void);
+
+
+@@ -145,10 +145,10 @@
+ protocol: __constant_htons(ETH_P_IP),
+ gc: rt_garbage_collect,
+ check: ipv4_dst_check,
+- reroute: ipv4_dst_reroute,
+ destroy: ipv4_dst_destroy,
+ negative_advice: ipv4_negative_advice,
+ link_failure: ipv4_link_failure,
++ update_pmtu: ip_rt_update_pmtu,
+ entry_size: sizeof(struct rtable),
+ };
+
+@@ -248,11 +248,12 @@
+ r->u.dst.__use,
+ 0,
+ (unsigned long)r->rt_src,
+- (r->u.dst.advmss ?
+- (int) r->u.dst.advmss + 40 : 0),
+- r->u.dst.window,
+- (int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
+- r->key.tos,
++ (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
++ (int) dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
++ dst_metric(&r->u.dst, RTAX_WINDOW),
++ (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3)
++ + dst_metric(&r->u.dst, RTAX_RTTVAR)),
++ r->fl.fl4_tos,
+ r->u.dst.hh ?
+ atomic_read(&r->u.dst.hh->hh_refcnt) :
+ -1,
+@@ -338,7 +339,7 @@
+ /* Kill broadcast/multicast entries very aggresively, if they
+ collide in hash table with more useful entries */
+ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
+- rth->key.iif && rth->u.rt_next;
++ rth->fl.iif && rth->u.rt_next;
+ }
+
+ static __inline__ int rt_valuable(struct rtable *rth)
+@@ -383,7 +384,7 @@
+ if (rt_valuable(rt))
+ score |= (1<<31);
+
+- if (!rt->key.iif ||
++ if (!rt->fl.iif ||
+ !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
+ score |= (1<<30);
+
+@@ -648,6 +649,13 @@
+ out: return 0;
+ }
+
++static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
++{
++ return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
++ fl1->oif == fl2->oif &&
++ fl1->iif == fl2->iif;
++}
++
+ static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+ {
+ struct rtable *rth, **rthp;
+@@ -668,7 +676,7 @@
+
+ write_lock_bh(&rt_hash_table[hash].lock);
+ while ((rth = *rthp) != NULL) {
+- if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
++ if (compare_keys(&rth->fl, &rt->fl)) {
+ /* Put it first */
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = rt_hash_table[hash].chain;
+@@ -715,7 +723,7 @@
+ /* Try to bind route to arp only if it is output
+ route or unicast forwarding path.
+ */
+- if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
++ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+ int err = arp_bind_neighbour(&rt->u.dst);
+ if (err) {
+ write_unlock_bh(&rt_hash_table[hash].lock);
+@@ -878,11 +886,11 @@
+ while ((rth = *rthp) != NULL) {
+ struct rtable *rt;
+
+- if (rth->key.dst != daddr ||
+- rth->key.src != skeys[i] ||
+- rth->key.tos != tos ||
+- rth->key.oif != ikeys[k] ||
+- rth->key.iif != 0) {
++ if (rth->fl.fl4_dst != daddr ||
++ rth->fl.fl4_src != skeys[i] ||
++ rth->fl.fl4_tos != tos ||
++ rth->fl.oif != ikeys[k] ||
++ rth->fl.iif != 0) {
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+@@ -908,12 +916,15 @@
+ *rt = *rth;
+ rt->u.dst.__use = 1;
+ atomic_set(&rt->u.dst.__refcnt, 1);
++ rt->u.dst.child = NULL;
+ if (rt->u.dst.dev)
+ dev_hold(rt->u.dst.dev);
++ rt->u.dst.obsolete = 0;
+ rt->u.dst.lastuse = jiffies;
++ rt->u.dst.path = &rt->u.dst;
+ rt->u.dst.neighbour = NULL;
+ rt->u.dst.hh = NULL;
+- rt->u.dst.obsolete = 0;
++ rt->u.dst.xfrm = NULL;
+
+ rt->rt_flags |= RTCF_REDIRECTED;
+
+@@ -973,14 +984,14 @@
+ ret = NULL;
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->u.dst.expires) {
+- unsigned hash = rt_hash_code(rt->key.dst,
+- rt->key.src ^
+- (rt->key.oif << 5),
+- rt->key.tos);
++ unsigned hash = rt_hash_code(rt->fl.fl4_dst,
++ rt->fl.fl4_src ^
++ (rt->fl.oif << 5),
++ rt->fl.fl4_tos);
+ #if RT_CACHE_DEBUG >= 1
+ printk(KERN_DEBUG "ip_rt_advice: redirect to "
+ "%u.%u.%u.%u/%02x dropped\n",
+- NIPQUAD(rt->rt_dst), rt->key.tos);
++ NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+ #endif
+ rt_del(hash, rt);
+ ret = NULL;
+@@ -1125,34 +1136,34 @@
+ read_lock(&rt_hash_table[hash].lock);
+ for (rth = rt_hash_table[hash].chain; rth;
+ rth = rth->u.rt_next) {
+- if (rth->key.dst == daddr &&
+- rth->key.src == skeys[i] &&
++ if (rth->fl.fl4_dst == daddr &&
++ rth->fl.fl4_src == skeys[i] &&
+ rth->rt_dst == daddr &&
+ rth->rt_src == iph->saddr &&
+- rth->key.tos == tos &&
+- rth->key.iif == 0 &&
+- !(rth->u.dst.mxlock & (1 << RTAX_MTU))) {
++ rth->fl.fl4_tos == tos &&
++ rth->fl.iif == 0 &&
++ !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
+ unsigned short mtu = new_mtu;
+
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+
+ /* BSD 4.2 compatibility hack :-( */
+ if (mtu == 0 &&
+- old_mtu >= rth->u.dst.pmtu &&
++ old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+
+ mtu = guess_mtu(old_mtu);
+ }
+- if (mtu <= rth->u.dst.pmtu) {
+- if (mtu < rth->u.dst.pmtu) {
++ if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
++ if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
+ dst_confirm(&rth->u.dst);
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+- rth->u.dst.mxlock |=
++ rth->u.dst.metrics[RTAX_LOCK-1] |=
+ (1 << RTAX_MTU);
+ }
+- rth->u.dst.pmtu = mtu;
++ rth->u.dst.metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(&rth->u.dst,
+ ip_rt_mtu_expires);
+ }
+@@ -1165,15 +1176,15 @@
+ return est_mtu ? : new_mtu;
+ }
+
+-void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
++static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+ {
+- if (dst->pmtu > mtu && mtu >= 68 &&
+- !(dst->mxlock & (1 << RTAX_MTU))) {
++ if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
++ !(dst_metric_locked(dst, RTAX_MTU))) {
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+- dst->mxlock |= (1 << RTAX_MTU);
++ dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+ }
+- dst->pmtu = mtu;
++ dst->metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(dst, ip_rt_mtu_expires);
+ }
+ }
+@@ -1184,12 +1195,6 @@
+ return NULL;
+ }
+
+-static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
+- struct sk_buff *skb)
+-{
+- return NULL;
+-}
+-
+ static void ipv4_dst_destroy(struct dst_entry *dst)
+ {
+ struct rtable *rt = (struct rtable *) dst;
+@@ -1235,9 +1240,9 @@
+ u32 src;
+ struct fib_result res;
+
+- if (rt->key.iif == 0)
++ if (rt->fl.iif == 0)
+ src = rt->rt_src;
+- else if (fib_lookup(&rt->key, &res) == 0) {
++ else if (fib_lookup(&rt->fl, &res) == 0) {
+ #ifdef CONFIG_IP_ROUTE_NAT
+ if (res.type == RTN_NAT)
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+@@ -1270,28 +1275,30 @@
+ if (FIB_RES_GW(*res) &&
+ FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = FIB_RES_GW(*res);
+- memcpy(&rt->u.dst.mxlock, fi->fib_metrics,
+- sizeof(fi->fib_metrics));
++ memcpy(rt->u.dst.metrics, fi->fib_metrics,
++ sizeof(rt->u.dst.metrics));
+ if (fi->fib_mtu == 0) {
+- rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+- if (rt->u.dst.mxlock & (1 << RTAX_MTU) &&
++ rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
++ if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+- rt->u.dst.pmtu > 576)
+- rt->u.dst.pmtu = 576;
++ rt->u.dst.dev->mtu > 576)
++ rt->u.dst.metrics[RTAX_MTU-1] = 576;
+ }
+ #ifdef CONFIG_NET_CLS_ROUTE
+ rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+ #endif
+ } else
+- rt->u.dst.pmtu = rt->u.dst.dev->mtu;
++ rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+
+- if (rt->u.dst.pmtu > IP_MAX_MTU)
+- rt->u.dst.pmtu = IP_MAX_MTU;
+- if (rt->u.dst.advmss == 0)
+- rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
++ if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
++ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
++ if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
++ rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
++ if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+ ip_rt_min_advmss);
+- if (rt->u.dst.advmss > 65535 - 40)
+- rt->u.dst.advmss = 65535 - 40;
++ if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+
+ #ifdef CONFIG_NET_CLS_ROUTE
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+@@ -1336,13 +1343,15 @@
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+- rth->key.dst = daddr;
++ if (in_dev->cnf.no_policy)
++ rth->u.dst.flags |= DST_NOPOLICY;
++ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+- rth->key.tos = tos;
++ rth->fl.fl4_tos = tos;
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark = skb->nfmark;
++ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
+- rth->key.src = saddr;
++ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ #ifdef CONFIG_IP_ROUTE_NAT
+ rth->rt_dst_map = daddr;
+@@ -1352,10 +1361,10 @@
+ rth->u.dst.tclassid = itag;
+ #endif
+ rth->rt_iif =
+- rth->key.iif = dev->ifindex;
++ rth->fl.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
+- rth->key.oif = 0;
++ rth->fl.oif = 0;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->rt_type = RTN_MULTICAST;
+@@ -1397,10 +1406,19 @@
+ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct net_device *dev)
+ {
+- struct rt_key key;
+ struct fib_result res;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *out_dev = NULL;
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = saddr,
++ .tos = tos,
++ .scope = RT_SCOPE_UNIVERSE,
++#ifdef CONFIG_IP_ROUTE_FWMARK
++ .fwmark = skb->nfmark
++#endif
++ } },
++ .iif = dev->ifindex };
+ unsigned flags = 0;
+ u32 itag = 0;
+ struct rtable * rth;
+@@ -1414,17 +1432,7 @@
+ if (!in_dev)
+ goto out;
+
+- key.dst = daddr;
+- key.src = saddr;
+- key.tos = tos;
+-#ifdef CONFIG_IP_ROUTE_FWMARK
+- key.fwmark = skb->nfmark;
+-#endif
+- key.iif = dev->ifindex;
+- key.oif = 0;
+- key.scope = RT_SCOPE_UNIVERSE;
+-
+- hash = rt_hash_code(daddr, saddr ^ (key.iif << 5), tos);
++ hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
+
+ /* Check for the most weird martians, which can be not detected
+ by fib_lookup.
+@@ -1448,7 +1456,7 @@
+ /*
+ * Now we are ready to route packet.
+ */
+- if ((err = fib_lookup(&key, &res)) != 0) {
++ if ((err = fib_lookup(&fl, &res)) != 0) {
+ if (!IN_DEV_FORWARD(in_dev))
+ goto e_inval;
+ goto no_route;
+@@ -1468,17 +1476,17 @@
+ src_map = fib_rules_policy(saddr, &res, &flags);
+
+ if (res.type == RTN_NAT) {
+- key.dst = fib_rules_map_destination(daddr, &res);
++ fl.fl4_dst = fib_rules_map_destination(daddr, &res);
+ fib_res_put(&res);
+ free_res = 0;
+- if (fib_lookup(&key, &res))
++ if (fib_lookup(&fl, &res))
+ goto e_inval;
+ free_res = 1;
+ if (res.type != RTN_UNICAST)
+ goto e_inval;
+ flags |= RTCF_DNAT;
+ }
+- key.src = src_map;
++ fl.fl4_src = src_map;
+ }
+ #endif
+
+@@ -1504,8 +1512,8 @@
+ goto martian_destination;
+
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH
+- if (res.fi->fib_nhs > 1 && key.oif == 0)
+- fib_select_multipath(&key, &res);
++ if (res.fi->fib_nhs > 1 && fl.oif == 0)
++ fib_select_multipath(&fl, &res);
+ #endif
+ out_dev = in_dev_get(FIB_RES_DEV(res));
+ if (out_dev == NULL) {
+@@ -1542,26 +1550,30 @@
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+- rth->key.dst = daddr;
++ if (in_dev->cnf.no_policy)
++ rth->u.dst.flags |= DST_NOPOLICY;
++ if (in_dev->cnf.no_xfrm)
++ rth->u.dst.flags |= DST_NOXFRM;
++ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+- rth->key.tos = tos;
++ rth->fl.fl4_tos = tos;
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark = skb->nfmark;
++ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
+- rth->key.src = saddr;
++ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ rth->rt_gateway = daddr;
+ #ifdef CONFIG_IP_ROUTE_NAT
+- rth->rt_src_map = key.src;
+- rth->rt_dst_map = key.dst;
++ rth->rt_src_map = fl.fl4_src;
++ rth->rt_dst_map = fl.fl4_dst;
+ if (flags&RTCF_DNAT)
+- rth->rt_gateway = key.dst;
++ rth->rt_gateway = fl.fl4_dst;
+ #endif
+ rth->rt_iif =
+- rth->key.iif = dev->ifindex;
++ rth->fl.iif = dev->ifindex;
+ rth->u.dst.dev = out_dev->dev;
+ dev_hold(rth->u.dst.dev);
+- rth->key.oif = 0;
++ rth->fl.oif = 0;
+ rth->rt_spec_dst= spec_dst;
+
+ rth->u.dst.input = ip_forward;
+@@ -1619,26 +1631,27 @@
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+- rth->key.dst = daddr;
++ if (in_dev->cnf.no_policy)
++ rth->u.dst.flags |= DST_NOPOLICY;
++ rth->fl.fl4_dst = daddr;
+ rth->rt_dst = daddr;
+- rth->key.tos = tos;
++ rth->fl.fl4_tos = tos;
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark = skb->nfmark;
++ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
+- rth->key.src = saddr;
++ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ #ifdef CONFIG_IP_ROUTE_NAT
+- rth->rt_dst_map = key.dst;
+- rth->rt_src_map = key.src;
++ rth->rt_dst_map = fl.fl4_dst;
++ rth->rt_src_map = fl.fl4_src;
+ #endif
+ #ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+ #endif
+ rth->rt_iif =
+- rth->key.iif = dev->ifindex;
++ rth->fl.iif = dev->ifindex;
+ rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
+- rth->key.oif = 0;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= spec_dst;
+ rth->u.dst.input= ip_local_deliver;
+@@ -1716,14 +1729,14 @@
+
+ read_lock(&rt_hash_table[hash].lock);
+ for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
+- if (rth->key.dst == daddr &&
+- rth->key.src == saddr &&
+- rth->key.iif == iif &&
+- rth->key.oif == 0 &&
++ if (rth->fl.fl4_dst == daddr &&
++ rth->fl.fl4_src == saddr &&
++ rth->fl.iif == iif &&
++ rth->fl.oif == 0 &&
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark == skb->nfmark &&
++ rth->fl.fl4_fwmark == skb->nfmark &&
+ #endif
+- rth->key.tos == tos) {
++ rth->fl.fl4_tos == tos) {
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
+@@ -1773,43 +1786,45 @@
+ * Major route resolver routine.
+ */
+
+-int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
++int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
+ {
+- struct rt_key key;
++ u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = oldflp->fl4_dst,
++ .saddr = oldflp->fl4_src,
++ .tos = tos & IPTOS_RT_MASK,
++ .scope = ((tos & RTO_ONLINK) ?
++ RT_SCOPE_LINK :
++ RT_SCOPE_UNIVERSE),
++#ifdef CONFIG_IP_ROUTE_FWMARK
++ .fwmark = oldflp->fl4_fwmark
++#endif
++ } },
++ .iif = loopback_dev.ifindex,
++ .oif = oldflp->oif };
+ struct fib_result res;
+ unsigned flags = 0;
+ struct rtable *rth;
+ struct net_device *dev_out = NULL;
++ struct in_device *in_dev = NULL;
+ unsigned hash;
+ int free_res = 0;
+ int err;
+- u32 tos;
+
+- tos = oldkey->tos & (IPTOS_RT_MASK | RTO_ONLINK);
+- key.dst = oldkey->dst;
+- key.src = oldkey->src;
+- key.tos = tos & IPTOS_RT_MASK;
+- key.iif = loopback_dev.ifindex;
+- key.oif = oldkey->oif;
+-#ifdef CONFIG_IP_ROUTE_FWMARK
+- key.fwmark = oldkey->fwmark;
+-#endif
+- key.scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK :
+- RT_SCOPE_UNIVERSE;
+ res.fi = NULL;
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+ #endif
+
+- if (oldkey->src) {
++ if (oldflp->fl4_src) {
+ err = -EINVAL;
+- if (MULTICAST(oldkey->src) ||
+- BADCLASS(oldkey->src) ||
+- ZERONET(oldkey->src))
++ if (MULTICAST(oldflp->fl4_src) ||
++ BADCLASS(oldflp->fl4_src) ||
++ ZERONET(oldflp->fl4_src))
+ goto out;
+
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+- dev_out = ip_dev_find(oldkey->src);
++ dev_out = ip_dev_find(oldflp->fl4_src);
+ if (dev_out == NULL)
+ goto out;
+
+@@ -1821,8 +1836,8 @@
+ of another iface. --ANK
+ */
+
+- if (oldkey->oif == 0
+- && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
++ if (oldflp->oif == 0
++ && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
+ /* Special hack: user can direct multicasts
+ and limited broadcast via necessary interface
+ without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+@@ -1838,15 +1853,15 @@
+ Luckily, this hack is good workaround.
+ */
+
+- key.oif = dev_out->ifindex;
++ fl.oif = dev_out->ifindex;
+ goto make_route;
+ }
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = NULL;
+ }
+- if (oldkey->oif) {
+- dev_out = dev_get_by_index(oldkey->oif);
++ if (oldflp->oif) {
++ dev_out = dev_get_by_index(oldflp->oif);
+ err = -ENODEV;
+ if (dev_out == NULL)
+ goto out;
+@@ -1855,39 +1870,39 @@
+ goto out; /* Wrong error code */
+ }
+
+- if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
+- if (!key.src)
+- key.src = inet_select_addr(dev_out, 0,
+- RT_SCOPE_LINK);
++ if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
++ if (!fl.fl4_src)
++ fl.fl4_src = inet_select_addr(dev_out, 0,
++ RT_SCOPE_LINK);
+ goto make_route;
+ }
+- if (!key.src) {
+- if (MULTICAST(oldkey->dst))
+- key.src = inet_select_addr(dev_out, 0,
+- key.scope);
+- else if (!oldkey->dst)
+- key.src = inet_select_addr(dev_out, 0,
+- RT_SCOPE_HOST);
++ if (!fl.fl4_src) {
++ if (MULTICAST(oldflp->fl4_dst))
++ fl.fl4_src = inet_select_addr(dev_out, 0,
++ fl.fl4_scope);
++ else if (!oldflp->fl4_dst)
++ fl.fl4_src = inet_select_addr(dev_out, 0,
++ RT_SCOPE_HOST);
+ }
+ }
+
+- if (!key.dst) {
+- key.dst = key.src;
+- if (!key.dst)
+- key.dst = key.src = htonl(INADDR_LOOPBACK);
++ if (!fl.fl4_dst) {
++ fl.fl4_dst = fl.fl4_src;
++ if (!fl.fl4_dst)
++ fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = &loopback_dev;
+ dev_hold(dev_out);
+- key.oif = loopback_dev.ifindex;
++ fl.oif = loopback_dev.ifindex;
+ res.type = RTN_LOCAL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+- if (fib_lookup(&key, &res)) {
++ if (fib_lookup(&fl, &res)) {
+ res.fi = NULL;
+- if (oldkey->oif) {
++ if (oldflp->oif) {
+ /* Apparently, routing tables are wrong. Assume,
+ that the destination is on link.
+
+@@ -1906,9 +1921,9 @@
+ likely IPv6, but we do not.
+ */
+
+- if (key.src == 0)
+- key.src = inet_select_addr(dev_out, 0,
+- RT_SCOPE_LINK);
++ if (fl.fl4_src == 0)
++ fl.fl4_src = inet_select_addr(dev_out, 0,
++ RT_SCOPE_LINK);
+ res.type = RTN_UNICAST;
+ goto make_route;
+ }
+@@ -1923,13 +1938,13 @@
+ goto e_inval;
+
+ if (res.type == RTN_LOCAL) {
+- if (!key.src)
+- key.src = key.dst;
++ if (!fl.fl4_src)
++ fl.fl4_src = fl.fl4_dst;
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = &loopback_dev;
+ dev_hold(dev_out);
+- key.oif = dev_out->ifindex;
++ fl.oif = dev_out->ifindex;
+ if (res.fi)
+ fib_info_put(res.fi);
+ res.fi = NULL;
+@@ -1938,36 +1953,40 @@
+ }
+
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH
+- if (res.fi->fib_nhs > 1 && key.oif == 0)
+- fib_select_multipath(&key, &res);
++ if (res.fi->fib_nhs > 1 && fl.oif == 0)
++ fib_select_multipath(&fl, &res);
+ else
+ #endif
+- if (!res.prefixlen && res.type == RTN_UNICAST && !key.oif)
+- fib_select_default(&key, &res);
++ if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
++ fib_select_default(&fl, &res);
+
+- if (!key.src)
+- key.src = FIB_RES_PREFSRC(res);
++ if (!fl.fl4_src)
++ fl.fl4_src = FIB_RES_PREFSRC(res);
+
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = FIB_RES_DEV(res);
+ dev_hold(dev_out);
+- key.oif = dev_out->ifindex;
++ fl.oif = dev_out->ifindex;
+
+ make_route:
+- if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
++ if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ goto e_inval;
+
+- if (key.dst == 0xFFFFFFFF)
++ if (fl.fl4_dst == 0xFFFFFFFF)
+ res.type = RTN_BROADCAST;
+- else if (MULTICAST(key.dst))
++ else if (MULTICAST(fl.fl4_dst))
+ res.type = RTN_MULTICAST;
+- else if (BADCLASS(key.dst) || ZERONET(key.dst))
++ else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst))
+ goto e_inval;
+
+ if (dev_out->flags & IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
++ in_dev = in_dev_get(dev_out);
++ if (!in_dev)
++ goto e_inval;
++
+ if (res.type == RTN_BROADCAST) {
+ flags |= RTCF_BROADCAST | RTCF_LOCAL;
+ if (res.fi) {
+@@ -1976,11 +1995,8 @@
+ }
+ } else if (res.type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST|RTCF_LOCAL;
+- read_lock(&inetdev_lock);
+- if (!__in_dev_get(dev_out) ||
+- !ip_check_mc(__in_dev_get(dev_out),oldkey->dst,oldkey->src))
++ if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src))
+ flags &= ~RTCF_LOCAL;
+- read_unlock(&inetdev_lock);
+ /* If multicast route do not exist use
+ default one, but do not gateway in this case.
+ Yes, it is hack.
+@@ -1997,25 +2013,28 @@
+
+ atomic_set(&rth->u.dst.__refcnt, 1);
+ rth->u.dst.flags= DST_HOST;
+- rth->key.dst = oldkey->dst;
+- rth->key.tos = tos;
+- rth->key.src = oldkey->src;
+- rth->key.iif = 0;
+- rth->key.oif = oldkey->oif;
++ if (in_dev->cnf.no_xfrm)
++ rth->u.dst.flags |= DST_NOXFRM;
++ if (in_dev->cnf.no_policy)
++ rth->u.dst.flags |= DST_NOPOLICY;
++ rth->fl.fl4_dst = oldflp->fl4_dst;
++ rth->fl.fl4_tos = tos;
++ rth->fl.fl4_src = oldflp->fl4_src;
++ rth->fl.oif = oldflp->oif;
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark = oldkey->fwmark;
++ rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
+ #endif
+- rth->rt_dst = key.dst;
+- rth->rt_src = key.src;
++ rth->rt_dst = fl.fl4_dst;
++ rth->rt_src = fl.fl4_src;
+ #ifdef CONFIG_IP_ROUTE_NAT
+- rth->rt_dst_map = key.dst;
+- rth->rt_src_map = key.src;
++ rth->rt_dst_map = fl.fl4_dst;
++ rth->rt_src_map = fl.fl4_src;
+ #endif
+- rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
++ rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
+ rth->u.dst.dev = dev_out;
+ dev_hold(dev_out);
+- rth->rt_gateway = key.dst;
+- rth->rt_spec_dst= key.src;
++ rth->rt_gateway = fl.fl4_dst;
++ rth->rt_spec_dst= fl.fl4_src;
+
+ rth->u.dst.output=ip_output;
+
+@@ -2023,40 +2042,39 @@
+
+ if (flags & RTCF_LOCAL) {
+ rth->u.dst.input = ip_local_deliver;
+- rth->rt_spec_dst = key.dst;
++ rth->rt_spec_dst = fl.fl4_dst;
+ }
+ if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+- rth->rt_spec_dst = key.src;
++ rth->rt_spec_dst = fl.fl4_src;
+ if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) {
+ rth->u.dst.output = ip_mc_output;
+ rt_cache_stat[smp_processor_id()].out_slow_mc++;
+ }
+ #ifdef CONFIG_IP_MROUTE
+ if (res.type == RTN_MULTICAST) {
+- struct in_device *in_dev = in_dev_get(dev_out);
+- if (in_dev) {
+- if (IN_DEV_MFORWARD(in_dev) &&
+- !LOCAL_MCAST(oldkey->dst)) {
+- rth->u.dst.input = ip_mr_input;
+- rth->u.dst.output = ip_mc_output;
+- }
+- in_dev_put(in_dev);
++ if (IN_DEV_MFORWARD(in_dev) &&
++ !LOCAL_MCAST(oldflp->fl4_dst)) {
++ rth->u.dst.input = ip_mr_input;
++ rth->u.dst.output = ip_mc_output;
+ }
+ }
+ #endif
+ }
+
+ rt_set_nexthop(rth, &res, 0);
++
+
+ rth->rt_flags = flags;
+
+- hash = rt_hash_code(oldkey->dst, oldkey->src ^ (oldkey->oif << 5), tos);
++ hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos);
+ err = rt_intern_hash(hash, rth, rp);
+ done:
+ if (free_res)
+ fib_res_put(&res);
+ if (dev_out)
+ dev_put(dev_out);
++ if (in_dev)
++ in_dev_put(in_dev);
+ out: return err;
+
+ e_inval:
+@@ -2067,23 +2085,23 @@
+ goto done;
+ }
+
+-int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
++int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+ {
+ unsigned hash;
+ struct rtable *rth;
+
+- hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
++ hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
+
+ read_lock_bh(&rt_hash_table[hash].lock);
+ for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
+- if (rth->key.dst == key->dst &&
+- rth->key.src == key->src &&
+- rth->key.iif == 0 &&
+- rth->key.oif == key->oif &&
++ if (rth->fl.fl4_dst == flp->fl4_dst &&
++ rth->fl.fl4_src == flp->fl4_src &&
++ rth->fl.iif == 0 &&
++ rth->fl.oif == flp->oif &&
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+- rth->key.fwmark == key->fwmark &&
++ rth->fl.fl4_fwmark == flp->fl4_fwmark &&
+ #endif
+- !((rth->key.tos ^ key->tos) &
++ !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+ (IPTOS_RT_MASK | RTO_ONLINK))) {
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+@@ -2097,8 +2115,31 @@
+ }
+ read_unlock_bh(&rt_hash_table[hash].lock);
+
+- return ip_route_output_slow(rp, key);
+-}
++ return ip_route_output_slow(rp, flp);
++}
++
++int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
++{
++ int err;
++
++ if ((err = __ip_route_output_key(rp, flp)) != 0)
++ return err;
++
++ if (flp->proto) {
++ if (!flp->fl4_src)
++ flp->fl4_src = (*rp)->rt_src;
++ if (!flp->fl4_dst)
++ flp->fl4_dst = (*rp)->rt_dst;
++ return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
++ }
++
++ return 0;
++}
++
++int ip_route_output_key(struct rtable **rp, struct flowi *flp)
++{
++ return ip_route_output_flow(rp, flp, NULL, 0);
++}
+
+ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ int nowait)
+@@ -2117,7 +2158,7 @@
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 0;
+- r->rtm_tos = rt->key.tos;
++ r->rtm_tos = rt->fl.fl4_tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+@@ -2126,9 +2167,9 @@
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+ RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+- if (rt->key.src) {
++ if (rt->fl.fl4_src) {
+ r->rtm_src_len = 32;
+- RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
++ RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
+ }
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+@@ -2136,13 +2177,13 @@
+ if (rt->u.dst.tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+ #endif
+- if (rt->key.iif)
++ if (rt->fl.iif)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+- else if (rt->rt_src != rt->key.src)
++ else if (rt->rt_src != rt->fl.fl4_src)
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
+ if (rt->rt_dst != rt->rt_gateway)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+- if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
++ if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ goto rtattr_failure;
+ ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
+ ci.rta_used = rt->u.dst.__use;
+@@ -2164,7 +2205,7 @@
+ eptr = (struct rtattr*)skb->tail;
+ #endif
+ RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+- if (rt->key.iif) {
++ if (rt->fl.iif) {
+ #ifdef CONFIG_IP_MROUTE
+ u32 dst = rt->rt_dst;
+
+@@ -2184,7 +2225,7 @@
+ }
+ } else
+ #endif
+- RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
++ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+@@ -2238,10 +2279,14 @@
+ if (!err && rt->u.dst.error)
+ err = -rt->u.dst.error;
+ } else {
++ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
++ .saddr = src,
++ .tos = rtm->rtm_tos } } };
+ int oif = 0;
+ if (rta[RTA_OIF - 1])
+ memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
+- err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
++ fl.oif = oif;
++ err = ip_route_output_key(&rt, &fl);
+ }
+ if (err)
+ goto out_free;
+@@ -2630,5 +2675,9 @@
+ rt_cache_stat_get_info);
+ #ifdef CONFIG_NET_CLS_ROUTE
+ create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
++#endif
++#ifdef CONFIG_XFRM
++ xfrm_init();
++ xfrm4_init();
+ #endif
+ }
+diff -Nru a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
+--- a/net/ipv4/syncookies.c 2005-02-13 21:25:08 +11:00
++++ b/net/ipv4/syncookies.c 2005-02-13 21:25:08 +11:00
+@@ -169,18 +169,25 @@
+ * hasn't changed since we received the original syn, but I see
+ * no easy way to do this.
+ */
+- if (ip_route_output(&rt,
+- opt &&
+- opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
+- req->af.v4_req.loc_addr,
+- RT_CONN_FLAGS(sk),
+- 0)) {
+- tcp_openreq_free(req);
+- goto out;
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = ((opt && opt->srr) ?
++ opt->faddr :
++ req->af.v4_req.rmt_addr),
++ .saddr = req->af.v4_req.loc_addr,
++ .tos = RT_CONN_FLAGS(sk) } },
++ .proto = IPPROTO_TCP,
++ .uli_u = { .ports =
++ { .sport = skb->h.th->dest,
++ .dport = skb->h.th->source } } };
++ if (ip_route_output_key(&rt, &fl)) {
++ tcp_openreq_free(req);
++ goto out;
++ }
+ }
+
+ /* Try to redo what tcp_v4_send_synack did. */
+- req->window_clamp = rt->u.dst.window;
++ req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ 0, &rcv_wscale);
+diff -Nru a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
+--- a/net/ipv4/sysctl_net_ipv4.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/sysctl_net_ipv4.c 2005-02-13 21:25:09 +11:00
+@@ -82,14 +82,39 @@
+ void *newval, size_t newlen,
+ void **context)
+ {
++ int *valp = table->data;
+ int new;
++
++ if (!newval || !newlen)
++ return 0;
++
+ if (newlen != sizeof(int))
+ return -EINVAL;
+- if (get_user(new,(int *)newval))
+- return -EFAULT;
+- if (new != ipv4_devconf.forwarding)
+- inet_forward_change(new);
+- return 0; /* caller does change again and handles handles oldval */
++
++ if (get_user(new, (int *)newval))
++ return -EFAULT;
++
++ if (new == *valp)
++ return 0;
++
++ if (oldval && oldlenp) {
++ size_t len;
++
++ if (get_user(len, oldlenp))
++ return -EFAULT;
++
++ if (len) {
++ if (len > table->maxlen)
++ len = table->maxlen;
++ if (copy_to_user(oldval, valp, len))
++ return -EFAULT;
++ if (put_user(len, oldlenp))
++ return -EFAULT;
++ }
++ }
++
++ inet_forward_change(new);
++ return 1;
+ }
+
+ ctl_table ipv4_table[] = {
+@@ -110,7 +135,7 @@
+ &ipv4_sysctl_forward,&ipv4_sysctl_forward_strategy},
+ {NET_IPV4_DEFAULT_TTL, "ip_default_ttl",
+ &sysctl_ip_default_ttl, sizeof(int), 0644, NULL,
+- &proc_dointvec},
++ &ipv4_doint_and_flush, &ipv4_doint_and_flush_strategy},
+ {NET_IPV4_AUTOCONFIG, "ip_autoconfig",
+ &ipv4_config.autoconfig, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+diff -Nru a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+--- a/net/ipv4/tcp.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/tcp.c 2005-02-13 21:25:09 +11:00
+@@ -204,6 +204,8 @@
+ * Andi Kleen : Make poll agree with SIGIO
+ * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
+ * lingertime == 0 (RFC 793 ABORT Call)
++ * Hirokazu Takahashi : Use copy_from_user() instead of
++ * csum_and_copy_from_user() if possible.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+@@ -256,6 +258,7 @@
+
+ #include <net/icmp.h>
+ #include <net/tcp.h>
++#include <net/xfrm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+@@ -955,8 +958,8 @@
+ return res;
+ }
+
+-#define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page)
+-#define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off)
++#define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
++#define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
+
+ static inline int
+ tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
+@@ -965,18 +968,22 @@
+ int err = 0;
+ unsigned int csum;
+
+- csum = csum_and_copy_from_user(from, page_address(page)+off,
++ if (skb->ip_summed == CHECKSUM_NONE) {
++ csum = csum_and_copy_from_user(from, page_address(page) + off,
+ copy, 0, &err);
+- if (!err) {
+- if (skb->ip_summed == CHECKSUM_NONE)
+- skb->csum = csum_block_add(skb->csum, csum, skb->len);
+- skb->len += copy;
+- skb->data_len += copy;
+- skb->truesize += copy;
+- sk->wmem_queued += copy;
+- sk->forward_alloc -= copy;
++ if (err) return err;
++ skb->csum = csum_block_add(skb->csum, csum, skb->len);
++ } else {
++ if (copy_from_user(page_address(page) + off, from, copy))
++ return -EFAULT;
+ }
+- return err;
++
++ skb->len += copy;
++ skb->data_len += copy;
++ skb->truesize += copy;
++ sk->wmem_queued += copy;
++ sk->forward_alloc -= copy;
++ return 0;
+ }
+
+ static inline int
+@@ -986,11 +993,16 @@
+ unsigned int csum;
+ int off = skb->len;
+
+- csum = csum_and_copy_from_user(from, skb_put(skb, copy),
++ if (skb->ip_summed == CHECKSUM_NONE) {
++ csum = csum_and_copy_from_user(from, skb_put(skb, copy),
+ copy, 0, &err);
+- if (!err) {
+- skb->csum = csum_block_add(skb->csum, csum, off);
+- return 0;
++ if (!err) {
++ skb->csum = csum_block_add(skb->csum, csum, off);
++ return 0;
++ }
++ } else {
++ if (!copy_from_user(skb_put(skb, copy), from, copy))
++ return 0;
+ }
+
+ __skb_trim(skb, off);
+@@ -1072,6 +1084,12 @@
+ if (skb == NULL)
+ goto wait_for_memory;
+
++ /*
++ * Check whether we can use HW checksum.
++ */
++ if (sk->route_caps & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))
++ skb->ip_summed = CHECKSUM_HW;
++
+ skb_entail(sk, tp, skb);
+ copy = mss_now;
+ }
+@@ -1896,6 +1914,8 @@
+ sk->prot->destroy(sk);
+
+ tcp_kill_sk_queues(sk);
++
++ xfrm_sk_free_policy(sk);
+
+ #ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&sk->refcnt) != 1) {
+diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+--- a/net/ipv4/tcp_input.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/tcp_input.c 2005-02-13 21:25:09 +11:00
+@@ -727,25 +727,25 @@
+ * Probably, no packets returned in time.
+ * Reset our results.
+ */
+- if (!(dst->mxlock&(1<<RTAX_RTT)))
+- dst->rtt = 0;
++ if (!(dst_metric_locked(dst, RTAX_RTT)))
++ dst->metrics[RTAX_RTT-1] = 0;
+ return;
+ }
+
+- m = dst->rtt - tp->srtt;
++ m = dst_metric(dst, RTAX_RTT) - tp->srtt;
+
+ /* If newly calculated rtt larger than stored one,
+ * store new one. Otherwise, use EWMA. Remember,
+ * rtt overestimation is always better than underestimation.
+ */
+- if (!(dst->mxlock&(1<<RTAX_RTT))) {
++ if (!(dst_metric_locked(dst, RTAX_RTT))) {
+ if (m <= 0)
+- dst->rtt = tp->srtt;
++ dst->metrics[RTAX_RTT-1] = tp->srtt;
+ else
+- dst->rtt -= (m>>3);
++ dst->metrics[RTAX_RTT-1] -= (m>>3);
+ }
+
+- if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
++ if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+ if (m < 0)
+ m = -m;
+
+@@ -754,67 +754,61 @@
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+- if (m >= dst->rttvar)
+- dst->rttvar = m;
++ if (m >= dst_metric(dst, RTAX_RTTVAR))
++ dst->metrics[RTAX_RTTVAR-1] = m;
+ else
+- dst->rttvar -= (dst->rttvar - m)>>2;
++ dst->metrics[RTAX_RTTVAR-1] -=
++ (dst->metrics[RTAX_RTTVAR-1] - m)>>2;
+ }
+
+ if (tp->snd_ssthresh >= 0xFFFF) {
+ /* Slow start still did not finish. */
+- if (dst->ssthresh &&
+- !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+- (tp->snd_cwnd>>1) > dst->ssthresh)
+- dst->ssthresh = (tp->snd_cwnd>>1);
+- if (!(dst->mxlock&(1<<RTAX_CWND)) &&
+- tp->snd_cwnd > dst->cwnd)
+- dst->cwnd = tp->snd_cwnd;
++ if (dst_metric(dst, RTAX_SSTHRESH) &&
++ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
++ (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
++ dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
++ if (!dst_metric_locked(dst, RTAX_CWND) &&
++ tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
++ dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ tp->ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+- if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
+- dst->ssthresh = max(tp->snd_cwnd>>1, tp->snd_ssthresh);
+- if (!(dst->mxlock&(1<<RTAX_CWND)))
+- dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
++ if (!dst_metric_locked(dst, RTAX_SSTHRESH))
++ dst->metrics[RTAX_SSTHRESH-1] =
++ max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
++ if (!dst_metric_locked(dst, RTAX_CWND))
++ dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ ssthresh may be also invalid.
+ */
+- if (!(dst->mxlock&(1<<RTAX_CWND)))
+- dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
+- if (dst->ssthresh &&
+- !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+- tp->snd_ssthresh > dst->ssthresh)
+- dst->ssthresh = tp->snd_ssthresh;
++ if (!dst_metric_locked(dst, RTAX_CWND))
++ dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
++ if (dst->metrics[RTAX_SSTHRESH-1] &&
++ !dst_metric_locked(dst, RTAX_SSTHRESH) &&
++ tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
++ dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+ }
+
+- if (!(dst->mxlock&(1<<RTAX_REORDERING))) {
+- if (dst->reordering < tp->reordering &&
++ if (!dst_metric_locked(dst, RTAX_REORDERING)) {
++ if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+- dst->reordering = tp->reordering;
++ dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+ }
+ }
+ }
+
+-/* Increase initial CWND conservatively: if estimated
+- * RTT is low enough (<20msec) or if we have some preset ssthresh.
+- *
+- * Numbers are taken from RFC2414.
+- */
+-__u32 tcp_init_cwnd(struct tcp_opt *tp)
++/* Numbers are taken from RFC2414. */
++__u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst)
+ {
+- __u32 cwnd;
+-
+- if (tp->mss_cache > 1460)
+- return 2;
+-
+- cwnd = (tp->mss_cache > 1095) ? 3 : 4;
+-
+- if (!tp->srtt || (tp->snd_ssthresh >= 0xFFFF && tp->srtt > ((HZ/50)<<3)))
+- cwnd = 2;
+- else if (cwnd > tp->snd_ssthresh)
+- cwnd = tp->snd_ssthresh;
++ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
++ if (!cwnd) {
++ if (tp->mss_cache > 1460)
++ cwnd = 2;
++ else
++ cwnd = (tp->mss_cache > 1095) ? 3 : 4;
++ }
+ return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+ }
+
+@@ -830,22 +824,23 @@
+
+ dst_confirm(dst);
+
+- if (dst->mxlock&(1<<RTAX_CWND))
+- tp->snd_cwnd_clamp = dst->cwnd;
+- if (dst->ssthresh) {
+- tp->snd_ssthresh = dst->ssthresh;
++ if (dst_metric_locked(dst, RTAX_CWND))
++ tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
++ if (dst_metric(dst, RTAX_SSTHRESH)) {
++ tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+- if (dst->reordering && tp->reordering != dst->reordering) {
++ if (dst_metric(dst, RTAX_REORDERING) &&
++ tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+ tp->sack_ok &= ~2;
+- tp->reordering = dst->reordering;
++ tp->reordering = dst_metric(dst, RTAX_REORDERING);
+ }
+
+- if (dst->rtt == 0)
++ if (dst_metric(dst, RTAX_RTT) == 0)
+ goto reset;
+
+- if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3))
++ if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+@@ -862,19 +857,19 @@
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+- if (dst->rtt > tp->srtt) {
+- tp->srtt = dst->rtt;
++ if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
++ tp->srtt = dst_metric(dst, RTAX_RTT);
+ tp->rtt_seq = tp->snd_nxt;
+ }
+- if (dst->rttvar > tp->mdev) {
+- tp->mdev = dst->rttvar;
++ if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
++ tp->mdev = dst_metric(dst, RTAX_RTTVAR);
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ }
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp)
+ goto reset;
+- tp->snd_cwnd = tcp_init_cwnd(tp);
++ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ return;
+
+@@ -4430,7 +4425,24 @@
+
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
++
++ /* Remember, tcp_poll() does not lock socket!
++ * Change state from SYN-SENT only after copied_seq
++ * is initialized. */
++ tp->copied_seq = tp->rcv_nxt;
++ mb();
++ tcp_set_state(sk, TCP_ESTABLISHED);
++
++ /* Make sure socket is routed, for correct metrics. */
++ tp->af_specific->rebuild_header(sk);
++
+ tcp_init_metrics(sk);
++
++ /* Prevent spurious tcp_cwnd_restart() on first data
++ * packet.
++ */
++ tp->lsndtime = tcp_time_stamp;
++
+ tcp_init_buffer_space(sk);
+
+ if (sk->keepopen)
+@@ -4441,13 +4453,6 @@
+ else
+ tp->pred_flags = 0;
+
+- /* Remember, tcp_poll() does not lock socket!
+- * Change state from SYN-SENT only after copied_seq
+- * is initialized. */
+- tp->copied_seq = tp->rcv_nxt;
+- mb();
+- tcp_set_state(sk, TCP_ESTABLISHED);
+-
+ if(!sk->dead) {
+ sk->state_change(sk);
+ sk_wake_async(sk, 0, POLL_OUT);
+@@ -4695,7 +4700,18 @@
+ if (tp->tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
++ /* Make sure socket is routed, for
++ * correct metrics.
++ */
++ tp->af_specific->rebuild_header(sk);
++
+ tcp_init_metrics(sk);
++
++ /* Prevent spurious tcp_cwnd_restart() on
++ * first data packet.
++ */
++ tp->lsndtime = tcp_time_stamp;
++
+ tcp_initialize_rcv_mss(sk);
+ tcp_init_buffer_space(sk);
+ tcp_fast_path_on(tp);
+diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+--- a/net/ipv4/tcp_ipv4.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/tcp_ipv4.c 2005-02-13 21:25:09 +11:00
+@@ -63,13 +63,12 @@
+ #include <net/tcp.h>
+ #include <net/ipv6.h>
+ #include <net/inet_common.h>
++#include <net/xfrm.h>
+
+ #include <linux/inet.h>
+ #include <linux/stddef.h>
+-#include <linux/ipsec.h>
+
+ extern int sysctl_ip_dynaddr;
+-extern int sysctl_ip_default_ttl;
+ int sysctl_tcp_tw_reuse = 0;
+ int sysctl_tcp_low_latency = 0;
+
+@@ -785,7 +784,9 @@
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, sk->saddr,
+- RT_CONN_FLAGS(sk), sk->bound_dev_if);
++ RT_CONN_FLAGS(sk), sk->bound_dev_if,
++ IPPROTO_TCP,
++ sk->sport, usin->sin_port, sk);
+ if (tmp < 0)
+ return tmp;
+
+@@ -794,9 +795,6 @@
+ return -ENETUNREACH;
+ }
+
+- __sk_dst_set(sk, &rt->u.dst);
+- sk->route_caps = rt->u.dst.dev->features;
+-
+ if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
+ daddr = rt->rt_dst;
+
+@@ -846,6 +844,15 @@
+ if (err)
+ goto failure;
+
++ err = ip_route_newports(&rt, sk->sport, sk->dport, sk);
++ if (err)
++ goto failure;
++
++ /* OK, now commit destination to socket. */
++ __sk_dst_set(sk, &rt->u.dst);
++ sk->route_caps = rt->u.dst.dev->features;
++ tp->ext2_header_len = rt->u.dst.header_len;
++
+ if (!tp->write_seq)
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ sk->sport, usin->sin_port);
+@@ -853,14 +860,16 @@
+ sk->protinfo.af_inet.id = tp->write_seq^jiffies;
+
+ err = tcp_connect(sk);
++ rt = NULL;
+ if (err)
+ goto failure;
+
+ return 0;
+
+ failure:
++ /* This unhashes the socket and releases the local port, if necessary. */
+ tcp_set_state(sk, TCP_CLOSE);
+- __sk_dst_reset(sk);
++ ip_rt_put(rt);
+ sk->route_caps = 0;
+ sk->dport = 0;
+ return err;
+@@ -922,7 +931,7 @@
+ /*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+-static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
++static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, u32 mtu)
+ {
+ struct dst_entry *dst;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+@@ -943,17 +952,19 @@
+ if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ return;
+
+- ip_rt_update_pmtu(dst, mtu);
++ dst->ops->update_pmtu(dst, mtu);
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+- if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
++ if (mtu < dst_pmtu(dst) && ip_dont_fragment(sk, dst))
+ sk->err_soft = EMSGSIZE;
+
++ mtu = dst_pmtu(dst);
++
+ if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
+- tp->pmtu_cookie > dst->pmtu) {
+- tcp_sync_mss(sk, dst->pmtu);
++ tp->pmtu_cookie > mtu) {
++ tcp_sync_mss(sk, mtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+@@ -1187,10 +1198,8 @@
+ sizeof(struct tcphdr),
+ IPPROTO_TCP,
+ 0);
+- arg.n_iov = 1;
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+- tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
+ ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
+
+ TCP_INC_STATS_BH(TcpOutSegs);
+@@ -1215,7 +1224,6 @@
+
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+- arg.n_iov = 1;
+ if (ts) {
+ rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+@@ -1266,14 +1274,20 @@
+ static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
+ {
+ struct rtable *rt;
+- struct ip_options *opt;
++ struct ip_options *opt = req->af.v4_req.opt;
++ struct flowi fl = { .oif = sk->bound_dev_if,
++ .nl_u = { .ip4_u =
++ { .daddr = ((opt && opt->srr) ?
++ opt->faddr :
++ req->af.v4_req.rmt_addr),
++ .saddr = req->af.v4_req.loc_addr,
++ .tos = RT_CONN_FLAGS(sk) } },
++ .proto = IPPROTO_TCP,
++ .uli_u = { .ports =
++ { .sport = sk->sport,
++ .dport = req->rmt_port } } };
+
+- opt = req->af.v4_req.opt;
+- if(ip_route_output(&rt, ((opt && opt->srr) ?
+- opt->faddr :
+- req->af.v4_req.rmt_addr),
+- req->af.v4_req.loc_addr,
+- RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
++ if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ IP_INC_STATS_BH(IpOutNoRoutes);
+ return NULL;
+ }
+@@ -1496,7 +1510,7 @@
+ (sysctl_max_syn_backlog - tcp_synq_len(sk)
+ < (sysctl_max_syn_backlog>>2)) &&
+ (!peer || !peer->tcp_ts_stamp) &&
+- (!dst || !dst->rtt)) {
++ (!dst || !dst_metric(dst, RTAX_RTT))) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations, proven to be alive.
+ * It means that we continue to communicate
+@@ -1568,10 +1582,11 @@
+ newtp->ext_header_len = 0;
+ if (newsk->protinfo.af_inet.opt)
+ newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
++ newtp->ext2_header_len = dst->header_len;
+ newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
+
+- tcp_sync_mss(newsk, dst->pmtu);
+- newtp->advmss = dst->advmss;
++ tcp_sync_mss(newsk, dst_pmtu(dst));
++ newtp->advmss = dst_metric(dst, RTAX_ADVMSS);;
+ tcp_initialize_rcv_mss(newsk);
+
+ __tcp_v4_hash(newsk, 0);
+@@ -1756,12 +1771,12 @@
+ goto no_tcp_socket;
+
+ process:
+- if(!ipsec_sk_policy(sk,skb))
+- goto discard_and_relse;
+-
+ if (sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
++ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
++ goto discard_and_relse;
++
+ if (sk_filter(sk, skb, 0))
+ goto discard_and_relse;
+
+@@ -1781,6 +1796,9 @@
+ return ret;
+
+ no_tcp_socket:
++ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
++ goto discard_it;
++
+ if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ bad_packet:
+ TCP_INC_STATS_BH(TcpInErrs);
+@@ -1798,6 +1816,9 @@
+ goto discard_it;
+
+ do_time_wait:
++ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
++ goto discard_and_relse;
++
+ if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ TCP_INC_STATS_BH(TcpInErrs);
+ tcp_tw_put((struct tcp_tw_bucket *) sk);
+@@ -1852,12 +1873,15 @@
+ /* Query new route. */
+ err = ip_route_connect(&rt, daddr, 0,
+ RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
+- sk->bound_dev_if);
++ sk->bound_dev_if,
++ IPPROTO_TCP,
++ sk->sport, sk->dport, sk);
+ if (err)
+ return err;
+
+ __sk_dst_set(sk, &rt->u.dst);
+ sk->route_caps = rt->u.dst.dev->features;
++ tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
+
+ new_saddr = rt->rt_src;
+
+@@ -1900,11 +1924,23 @@
+ if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
+ daddr = sk->protinfo.af_inet.opt->faddr;
+
+- err = ip_route_output(&rt, daddr, sk->saddr,
+- RT_CONN_FLAGS(sk), sk->bound_dev_if);
++ {
++ struct flowi fl = { .oif = sk->bound_dev_if,
++ .nl_u = { .ip4_u =
++ { .daddr = daddr,
++ .saddr = sk->saddr,
++ .tos = RT_CONN_FLAGS(sk) } },
++ .proto = IPPROTO_TCP,
++ .uli_u = { .ports =
++ { .sport = sk->sport,
++ .dport = sk->dport } } };
++
++ err = ip_route_output_flow(&rt, &fl, sk, 0);
++ }
+ if (!err) {
+ __sk_dst_set(sk, &rt->u.dst);
+ sk->route_caps = rt->u.dst.dev->features;
++ tcp_sk(sk)->ext2_header_len = rt->u.dst.header_len;
+ return 0;
+ }
+
+@@ -2066,8 +2102,8 @@
+ tcp_put_port(sk);
+
+ /* If sendmsg cached page exists, toss it. */
+- if (tp->sndmsg_page != NULL)
+- __free_page(tp->sndmsg_page);
++ if (inet_sk(sk)->sndmsg_page)
++ __free_page(inet_sk(sk)->sndmsg_page);
+
+ atomic_dec(&tcp_sockets_allocated);
+
+@@ -2325,7 +2361,7 @@
+ if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
+ panic("Failed to create the TCP control socket.\n");
+ tcp_socket->sk->allocation=GFP_ATOMIC;
+- tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
++ tcp_socket->sk->protinfo.af_inet.uc_ttl = -1;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+diff -Nru a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+--- a/net/ipv4/tcp_minisocks.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/tcp_minisocks.c 2005-02-13 21:25:09 +11:00
+@@ -25,6 +25,7 @@
+ #include <linux/sysctl.h>
+ #include <net/tcp.h>
+ #include <net/inet_common.h>
++#include <net/xfrm.h>
+
+ #ifdef CONFIG_SYSCTL
+ #define SYNC_INIT 0 /* let the user enable it */
+@@ -683,6 +684,13 @@
+ if ((filter = newsk->filter) != NULL)
+ sk_filter_charge(newsk, filter);
+ #endif
++ if (unlikely(xfrm_sk_clone_policy(newsk))) {
++ /* It is still raw copy of parent, so invalidate
++ * destructor and make plain sk_free() */
++ newsk->destruct = NULL;
++ sk_free(newsk);
++ return NULL;
++ }
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+diff -Nru a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+--- a/net/ipv4/tcp_output.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/tcp_output.c 2005-02-13 21:25:09 +11:00
+@@ -89,8 +89,8 @@
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss = tp->advmss;
+
+- if (dst && dst->advmss < mss) {
+- mss = dst->advmss;
++ if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
++ mss = dst_metric(dst, RTAX_ADVMSS);
+ tp->advmss = mss;
+ }
+
+@@ -99,10 +99,10 @@
+
+ /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
+ * This is the first part of cwnd validation mechanism. */
+-static void tcp_cwnd_restart(struct tcp_opt *tp)
++static void tcp_cwnd_restart(struct tcp_opt *tp, struct dst_entry *dst)
+ {
+ s32 delta = tcp_time_stamp - tp->lsndtime;
+- u32 restart_cwnd = tcp_init_cwnd(tp);
++ u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 cwnd = tp->snd_cwnd;
+
+ if (tcp_is_vegas(tp))
+@@ -118,12 +118,12 @@
+ tp->snd_cwnd_used = 0;
+ }
+
+-static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
++static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb, struct sock *sk)
+ {
+ u32 now = tcp_time_stamp;
+
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
+- tcp_cwnd_restart(tp);
++ tcp_cwnd_restart(tp, __sk_dst_get(sk));
+
+ tp->lsndtime = now;
+
+@@ -287,7 +287,7 @@
+ tcp_event_ack_sent(sk);
+
+ if (skb->len != tcp_header_size)
+- tcp_event_data_sent(tp, skb);
++ tcp_event_data_sent(tp, skb, sk);
+
+ TCP_INC_STATS(TcpOutSegs);
+
+@@ -518,13 +518,16 @@
+
+ int tcp_sync_mss(struct sock *sk, u32 pmtu)
+ {
+- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
++ struct tcp_opt *tp = tcp_sk(sk);
++ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss_now;
+
++ if (dst && dst->ops->get_mss)
++ pmtu = dst->ops->get_mss(dst, pmtu);
++
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+-
+ mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+@@ -532,7 +535,7 @@
+ mss_now = tp->mss_clamp;
+
+ /* Now subtract optional transport overhead */
+- mss_now -= tp->ext_header_len;
++ mss_now -= tp->ext_header_len + tp->ext2_header_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+@@ -1147,10 +1150,10 @@
+ if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+ __u8 rcv_wscale;
+ /* Set this up on the first call only */
+- req->window_clamp = tp->window_clamp ? : dst->window;
++ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+- dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
++ dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ req->wscale_ok,
+@@ -1162,7 +1165,7 @@
+ th->window = htons(req->rcv_wnd);
+
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+- tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
++ tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
+ req->sack_ok, req->wscale_ok, req->rcv_wscale,
+ TCP_SKB_CB(skb)->when,
+ req->ts_recent);
+@@ -1191,11 +1194,11 @@
+ if (tp->user_mss)
+ tp->mss_clamp = tp->user_mss;
+ tp->max_window = 0;
+- tcp_sync_mss(sk, dst->pmtu);
++ tcp_sync_mss(sk, dst_pmtu(dst));
+
+ if (!tp->window_clamp)
+- tp->window_clamp = dst->window;
+- tp->advmss = dst->advmss;
++ tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
++ tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ tcp_initialize_rcv_mss(sk);
+ tcp_ca_init(tp);
+
+diff -Nru a/net/ipv4/udp.c b/net/ipv4/udp.c
+--- a/net/ipv4/udp.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv4/udp.c 2005-02-13 21:25:09 +11:00
+@@ -11,6 +11,7 @@
+ * Fred N. van Kempen, <waltje at uWalt.NL.Mugnet.ORG>
+ * Arnt Gulbrandsen, <agulbra at nvg.unit.no>
+ * Alan Cox, <Alan.Cox at linux.org>
++ * Hirokazu Takahashi, <taka at valinux.co.jp>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() calls
+@@ -64,6 +65,10 @@
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
++ * Hirokazu Takahashi : HW checksumming for outgoing UDP
++ * datagrams.
++ * Hirokazu Takahashi : sendfile() on UDP works now.
+ * Derek Atkins <derek at ihtfp.com>: Add Encapsulation Support
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+@@ -97,6 +102,7 @@
+ #include <net/route.h>
+ #include <net/inet_common.h>
+ #include <net/checksum.h>
++#include <net/xfrm.h>
+
+ /*
+ * Snmp MIB for the UDP layer
+@@ -371,81 +377,119 @@
+ sock_put(sk);
+ }
+
+-static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
+-{
+- return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+-}
+-
+-struct udpfakehdr
+-{
+- struct udphdr uh;
+- u32 saddr;
+- u32 daddr;
+- struct iovec *iov;
+- u32 wcheck;
+-};
+-
+ /*
+- * Copy and checksum a UDP packet from user space into a buffer.
++ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+-
+-static int udp_getfrag(const void *p, char * to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
++static void udp_flush_pending_frames(struct sock *sk)
+ {
+- struct udpfakehdr *ufh = (struct udpfakehdr *)p;
+- if (offset==0) {
+- if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
+- fraglen-sizeof(struct udphdr), &ufh->wcheck))
+- return -EFAULT;
+- ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr),
+- ufh->wcheck);
+- ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr,
+- ntohs(ufh->uh.len),
+- IPPROTO_UDP, ufh->wcheck);
+- if (ufh->uh.check == 0)
+- ufh->uh.check = -1;
+- memcpy(to, ufh, sizeof(struct udphdr));
+- return 0;
++ struct udp_opt *up = udp_sk(sk);
++
++ if (up->pending) {
++ up->len = 0;
++ up->pending = 0;
++ ip_flush_pending_frames(sk);
+ }
+- if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
+- fraglen, &ufh->wcheck))
+- return -EFAULT;
+- return 0;
+ }
+
+ /*
+- * Copy a UDP packet from user space into a buffer without checksumming.
++ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+-
+-static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset,
+- unsigned int fraglen, struct sk_buff *skb)
++static int udp_push_pending_frames(struct sock *sk, struct udp_opt *up)
+ {
+- struct udpfakehdr *ufh = (struct udpfakehdr *)p;
++ struct sk_buff *skb;
++ struct udphdr *uh;
++ int err = 0;
++
++ /* Grab the skbuff where UDP header space exists. */
++ if ((skb = skb_peek(&sk->write_queue)) == NULL)
++ goto out;
+
+- if (offset==0) {
+- memcpy(to, ufh, sizeof(struct udphdr));
+- return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset,
+- fraglen-sizeof(struct udphdr));
++ /*
++ * Create a UDP header
++ */
++ uh = skb->h.uh;
++ uh->source = up->sport;
++ uh->dest = up->dport;
++ uh->len = htons(up->len);
++ uh->check = 0;
++
++ if (sk->no_check == UDP_CSUM_NOXMIT) {
++ skb->ip_summed = CHECKSUM_NONE;
++ goto send;
+ }
+- return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr),
+- fraglen);
++
++ if (skb_queue_len(&sk->write_queue) == 1) {
++ /*
++ * Only one fragment on the socket.
++ */
++ if (skb->ip_summed == CHECKSUM_HW) {
++ skb->csum = offsetof(struct udphdr, check);
++ uh->check = ~csum_tcpudp_magic(up->saddr, up->daddr,
++ up->len, IPPROTO_UDP, 0);
++ } else {
++ skb->csum = csum_partial((char *)uh,
++ sizeof(struct udphdr), skb->csum);
++ uh->check = csum_tcpudp_magic(up->saddr, up->daddr,
++ up->len, IPPROTO_UDP, skb->csum);
++ if (uh->check == 0)
++ uh->check = -1;
++ }
++ } else {
++ unsigned int csum = 0;
++ /*
++ * HW-checksum won't work as there are two or more
++ * fragments on the socket so that all csums of sk_buffs
++ * should be together.
++ */
++ if (skb->ip_summed == CHECKSUM_HW) {
++ int offset = (unsigned char *)uh - skb->data;
++ skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
++
++ skb->ip_summed = CHECKSUM_NONE;
++ } else {
++ skb->csum = csum_partial((char *)uh,
++ sizeof(struct udphdr), skb->csum);
++ }
++
++ skb_queue_walk(&sk->write_queue, skb) {
++ csum = csum_add(csum, skb->csum);
++ }
++ uh->check = csum_tcpudp_magic(up->saddr, up->daddr,
++ up->len, IPPROTO_UDP, csum);
++ if (uh->check == 0)
++ uh->check = -1;
++ }
++send:
++ err = ip_push_pending_frames(sk);
++out:
++ up->len = 0;
++ up->pending = 0;
++ return err;
++}
++
++
++static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
++{
++ return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+ }
+
+ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+ {
+- int ulen = len + sizeof(struct udphdr);
++ struct udp_opt *up = udp_sk(sk);
++ int ulen = len;
+ struct ipcm_cookie ipc;
+- struct udpfakehdr ufh;
+ struct rtable *rt = NULL;
+ int free = 0;
+ int connected = 0;
+- u32 daddr;
++ u32 daddr, faddr, saddr;
++ u16 dport;
+ u8 tos;
+ int err;
++ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+
+ /* This check is ONLY to check for arithmetic overflow
+ on integer(!) len. Not more! Real check will be made
+- in ip_build_xmit --ANK
++ in ip_append_* --ANK
+
+ BTW socket.c -> af_*.c -> ... make multiple
+ invalid conversions size_t -> int. We MUST repair it f.e.
+@@ -464,10 +508,23 @@
+ if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
++ ipc.opt = NULL;
++
++ if (up->pending) {
++ /*
++ * There are pending frames.
++ * The socket lock must be held while it's corked.
++ */
++ lock_sock(sk);
++ if (likely(up->pending))
++ goto do_append_data;
++ release_sock(sk);
++ }
++ ulen += sizeof(struct udphdr);
++
+ /*
+ * Get and verify the address.
+ */
+-
+ if (msg->msg_name) {
+ struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
+ if (msg->msg_namelen < sizeof(*usin))
+@@ -477,24 +534,22 @@
+ return -EINVAL;
+ }
+
+- ufh.daddr = usin->sin_addr.s_addr;
+- ufh.uh.dest = usin->sin_port;
+- if (ufh.uh.dest == 0)
++ daddr = usin->sin_addr.s_addr;
++ dport = usin->sin_port;
++ if (dport == 0)
+ return -EINVAL;
+ } else {
+ if (sk->state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+- ufh.daddr = sk->daddr;
+- ufh.uh.dest = sk->dport;
++ daddr = sk->daddr;
++ dport = sk->dport;
+ /* Open fast path for connected socket.
+ Route will not be used, if at least one option is set.
+ */
+ connected = 1;
+ }
+ ipc.addr = sk->saddr;
+- ufh.uh.source = sk->sport;
+
+- ipc.opt = NULL;
+ ipc.oif = sk->bound_dev_if;
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(msg, &ipc);
+@@ -507,13 +562,13 @@
+ if (!ipc.opt)
+ ipc.opt = sk->protinfo.af_inet.opt;
+
+- ufh.saddr = ipc.addr;
+- ipc.addr = daddr = ufh.daddr;
++ saddr = ipc.addr;
++ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->srr) {
+ if (!daddr)
+ return -EINVAL;
+- daddr = ipc.opt->faddr;
++ faddr = ipc.opt->faddr;
+ connected = 0;
+ }
+ tos = RT_TOS(sk->protinfo.af_inet.tos);
+@@ -526,8 +581,8 @@
+ if (MULTICAST(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = sk->protinfo.af_inet.mc_index;
+- if (!ufh.saddr)
+- ufh.saddr = sk->protinfo.af_inet.mc_addr;
++ if (!saddr)
++ saddr = sk->protinfo.af_inet.mc_addr;
+ connected = 0;
+ }
+
+@@ -535,7 +590,16 @@
+ rt = (struct rtable*)sk_dst_check(sk, 0);
+
+ if (rt == NULL) {
+- err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif);
++ struct flowi fl = { .oif = ipc.oif,
++ .nl_u = { .ip4_u =
++ { .daddr = faddr,
++ .saddr = saddr,
++ .tos = tos } },
++ .proto = IPPROTO_UDP,
++ .uli_u = { .ports =
++ { .sport = sk->sport,
++ .dport = dport } } };
++ err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+ if (err)
+ goto out;
+
+@@ -550,23 +614,39 @@
+ goto do_confirm;
+ back_from_confirm:
+
+- ufh.saddr = rt->rt_src;
++ saddr = rt->rt_src;
+ if (!ipc.addr)
+- ufh.daddr = ipc.addr = rt->rt_dst;
+- ufh.uh.len = htons(ulen);
+- ufh.uh.check = 0;
+- ufh.iov = msg->msg_iov;
+- ufh.wcheck = 0;
+-
+- /* RFC1122: OK. Provides the checksumming facility (MUST) as per */
+- /* 4.1.3.4. It's configurable by the application via setsockopt() */
+- /* (MAY) and it defaults to on (MUST). */
+-
+- err = ip_build_xmit(sk,
+- (sk->no_check == UDP_CSUM_NOXMIT ?
+- udp_getfrag_nosum :
+- udp_getfrag),
+- &ufh, ulen, &ipc, rt, msg->msg_flags);
++ daddr = ipc.addr = rt->rt_dst;
++
++ lock_sock(sk);
++ if (unlikely(up->pending)) {
++ /* The socket is already corked while preparing it. */
++ /* ... which is an evident application bug. --ANK */
++ release_sock(sk);
++
++ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
++ err = -EINVAL;
++ goto out;
++ }
++ /*
++ * Now cork the socket to pend data.
++ */
++ up->daddr = daddr;
++ up->dport = dport;
++ up->saddr = saddr;
++ up->sport = sk->sport;
++ up->pending = 1;
++
++do_append_data:
++ up->len += ulen;
++ err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen,
++ sizeof(struct udphdr), &ipc, rt,
++ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
++ if (err)
++ udp_flush_pending_frames(sk);
++ else if (!corkreq)
++ err = udp_push_pending_frames(sk, up);
++ release_sock(sk);
+
+ out:
+ ip_rt_put(rt);
+@@ -586,6 +666,52 @@
+ goto out;
+ }
+
++int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags)
++{
++ struct udp_opt *up = udp_sk(sk);
++ int ret;
++
++ if (!up->pending) {
++ struct msghdr msg = { .msg_flags = flags|MSG_MORE };
++
++ /* Call udp_sendmsg to specify destination address which
++ * sendpage interface can't pass.
++ * This will succeed only when the socket is connected.
++ */
++ ret = udp_sendmsg(sk, &msg, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ lock_sock(sk);
++
++ if (unlikely(!up->pending)) {
++ release_sock(sk);
++
++ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
++ return -EINVAL;
++ }
++
++ ret = ip_append_page(sk, page, offset, size, flags);
++ if (ret == -EOPNOTSUPP) {
++ release_sock(sk);
++ return sock_no_sendpage(sk->socket, page, offset, size, flags);
++ }
++ if (ret < 0) {
++ udp_flush_pending_frames(sk);
++ goto out;
++ }
++
++ up->len += size;
++ if (!(up->corkflag || (flags&MSG_MORE)))
++ ret = udp_push_pending_frames(sk, up);
++ if (!ret)
++ ret = size;
++out:
++ release_sock(sk);
++ return ret;
++}
++
+ /*
+ * IOCTL requests applicable to the UDP protocol
+ */
+@@ -807,7 +933,9 @@
+ saddr = sk->protinfo.af_inet.mc_addr;
+ }
+ err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+- RT_CONN_FLAGS(sk), oif);
++ RT_CONN_FLAGS(sk), oif,
++ IPPROTO_UDP,
++ sk->sport, usin->sin_port, sk);
+ if (err)
+ return err;
+ if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) {
+@@ -858,11 +986,138 @@
+ inet_sock_release(sk);
+ }
+
++/* return:
++ * 1 if the UDP system should process it
++ * 0 if we should drop this packet
++ * -1 if it should get processed by xfrm4_rcv_encap
++ */
++static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
++{
++#ifndef CONFIG_XFRM
++ return 1;
++#else
++ struct udp_opt *up = udp_sk(sk);
++ struct udphdr *uh = skb->h.uh;
++ struct iphdr *iph;
++ int iphlen, len;
++
++ __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr);
++ __u32 *udpdata32 = (__u32 *)udpdata;
++ __u16 encap_type = up->encap_type;
++
++ /* if we're overly short, let UDP handle it */
++ if (udpdata > skb->tail)
++ return 1;
++
++ /* if this is not encapsulated socket, then just return now */
++ if (!encap_type)
++ return 1;
++
++ len = skb->tail - udpdata;
++
++ switch (encap_type) {
++ default:
++ case UDP_ENCAP_ESPINUDP:
++ /* Check if this is a keepalive packet. If so, eat it. */
++ if (len == 1 && udpdata[0] == 0xff) {
++ return 0;
++ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
++ /* ESP Packet without Non-ESP header */
++ len = sizeof(struct udphdr);
++ } else
++ /* Must be an IKE packet.. pass it through */
++ return 1;
++ break;
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ /* Check if this is a keepalive packet. If so, eat it. */
++ if (len == 1 && udpdata[0] == 0xff) {
++ return 0;
++ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
++ udpdata32[0] == 0 && udpdata32[1] == 0) {
++
++ /* ESP Packet with Non-IKE marker */
++ len = sizeof(struct udphdr) + 2 * sizeof(u32);
++ } else
++ /* Must be an IKE packet.. pass it through */
++ return 1;
++ break;
++ }
++
++ /* At this point we are sure that this is an ESPinUDP packet,
++ * so we need to remove 'len' bytes from the packet (the UDP
++ * header and optional ESP marker bytes) and then modify the
++ * protocol to ESP, and then call into the transform receiver.
++ */
++
++ /* Now we can update and verify the packet length... */
++ iph = skb->nh.iph;
++ iphlen = iph->ihl << 2;
++ iph->tot_len = htons(ntohs(iph->tot_len) - len);
++ if (skb->len < iphlen + len) {
++ /* packet is too small!?! */
++ return 0;
++ }
++
++ /* pull the data buffer up to the ESP header and set the
++ * transport header to point to ESP. Keep UDP on the stack
++ * for later.
++ */
++ skb->h.raw = skb_pull(skb, len);
++
++ /* modify the protocol (it's ESP!) */
++ iph->protocol = IPPROTO_ESP;
++
++ /* and let the caller know to send this into the ESP processor... */
++ return -1;
++#endif
++}
++
++/* returns:
++ * -1: error
++ * 0: success
++ * >0: "udp encap" protocol resubmission
++ *
++ * Note that in the success and error cases, the skb is assumed to
++ * have either been requeued or freed.
++ */
+ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+ {
++ struct udp_opt *up = udp_sk(sk);
++
+ /*
+ * Charge it to the socket, dropping if the queue is full.
+ */
++ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return -1;
++ }
++
++ if (up->encap_type) {
++ /*
++ * This is an encapsulation socket, so let's see if this is
++ * an encapsulated packet.
++ * If it's a keepalive packet, then just eat it.
++ * If it's an encapsulated packet, then pass it to the
++ * IPsec xfrm input and return the response
++ * appropriately. Otherwise, just fall through and
++ * pass this up the UDP socket.
++ */
++ int ret;
++
++ ret = udp_encap_rcv(sk, skb);
++ if (ret == 0) {
++ /* Eat the packet .. */
++ kfree_skb(skb);
++ return 0;
++ }
++ if (ret < 0) {
++ /* process the ESP packet */
++ ret = xfrm4_rcv_encap(skb, up->encap_type);
++ UDP_INC_STATS_BH(UdpInDatagrams);
++ return -ret;
++ }
++ /* FALLTHROUGH -- it's a UDP Packet */
++ }
+
+ #if defined(CONFIG_FILTER)
+ if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+@@ -915,8 +1170,13 @@
+ if(sknext)
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+
+- if(skb1)
+- udp_queue_rcv_skb(sk, skb1);
++ if(skb1) {
++ int ret = udp_queue_rcv_skb(sk, skb1);
++ if (ret > 0)
++ /* we should probably re-process instead
++ * of dropping packets here. */
++ kfree_skb(skb1);
++ }
+ sk = sknext;
+ } while(sknext);
+ } else
+@@ -991,11 +1251,20 @@
+ sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
+
+ if (sk != NULL) {
+- udp_queue_rcv_skb(sk, skb);
++ int ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
++
++ /* a return value > 0 means to resubmit the input, but
++ * if it wants the return to be -protocol, or 0
++ */
++ if (ret > 0)
++ return -ret;
+ return 0;
+ }
+
++ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
++ goto drop;
++
+ /* No socket. Drop packet silently, if checksum is wrong */
+ if (udp_checksum_complete(skb))
+ goto csum_error;
+@@ -1036,6 +1305,7 @@
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen));
++drop:
+ UDP_INC_STATS_BH(UdpInErrors);
+ kfree_skb(skb);
+ return(0);
+@@ -1100,16 +1370,116 @@
+ return len;
+ }
+
++static int udp_destroy_sock(struct sock *sk)
++{
++ lock_sock(sk);
++ udp_flush_pending_frames(sk);
++ release_sock(sk);
++ return 0;
++}
++
++/*
++ * Socket option code for UDP
++ */
++static int udp_setsockopt(struct sock *sk, int level, int optname,
++ char *optval, int optlen)
++{
++ struct udp_opt *up = udp_sk(sk);
++ int val;
++ int err = 0;
++
++ if (level != SOL_UDP)
++ return ip_setsockopt(sk, level, optname, optval, optlen);
++
++ if(optlen<sizeof(int))
++ return -EINVAL;
++
++ if (get_user(val, (int *)optval))
++ return -EFAULT;
++
++ switch(optname) {
++ case UDP_CORK:
++ if (val != 0) {
++ up->corkflag = 1;
++ } else {
++ up->corkflag = 0;
++ lock_sock(sk);
++ udp_push_pending_frames(sk, up);
++ release_sock(sk);
++ }
++ break;
++
++ case UDP_ENCAP:
++ switch (val) {
++ case 0:
++ case UDP_ENCAP_ESPINUDP:
++ case UDP_ENCAP_ESPINUDP_NON_IKE:
++ up->encap_type = val;
++ break;
++ default:
++ err = -ENOPROTOOPT;
++ break;
++ }
++ break;
++
++ default:
++ err = -ENOPROTOOPT;
++ break;
++ };
++
++ return err;
++}
++
++static int udp_getsockopt(struct sock *sk, int level, int optname,
++ char *optval, int *optlen)
++{
++ struct udp_opt *up = udp_sk(sk);
++ int val, len;
++
++ if (level != SOL_UDP)
++ return ip_getsockopt(sk, level, optname, optval, optlen);
++
++ if(get_user(len,optlen))
++ return -EFAULT;
++
++ len = min_t(unsigned int, len, sizeof(int));
++
++ if(len < 0)
++ return -EINVAL;
++
++ switch(optname) {
++ case UDP_CORK:
++ val = up->corkflag;
++ break;
++
++ case UDP_ENCAP:
++ val = up->encap_type;
++ break;
++
++ default:
++ return -ENOPROTOOPT;
++ };
++
++ if(put_user(len, optlen))
++ return -EFAULT;
++ if(copy_to_user(optval, &val,len))
++ return -EFAULT;
++ return 0;
++}
++
++
+ struct proto udp_prot = {
+ name: "UDP",
+ close: udp_close,
+ connect: udp_connect,
+ disconnect: udp_disconnect,
+ ioctl: udp_ioctl,
+- setsockopt: ip_setsockopt,
+- getsockopt: ip_getsockopt,
++ destroy: udp_destroy_sock,
++ setsockopt: udp_setsockopt,
++ getsockopt: udp_getsockopt,
+ sendmsg: udp_sendmsg,
+ recvmsg: udp_recvmsg,
++ sendpage: udp_sendpage,
+ backlog_rcv: udp_queue_rcv_skb,
+ hash: udp_v4_hash,
+ unhash: udp_v4_unhash,
+diff -Nru a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/xfrm4_input.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,159 @@
++/*
++ * xfrm4_input.c
++ *
++ * Changes:
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific portion
++ * Derek Atkins <derek at ihtfp.com>
++ * Add Encapsulation support
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <net/inet_ecn.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++
++int xfrm4_rcv(struct sk_buff *skb)
++{
++ return xfrm4_rcv_encap(skb, 0);
++}
++
++EXPORT_SYMBOL(xfrm4_rcv);
++
++static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
++{
++ struct iphdr *outer_iph = skb->nh.iph;
++ struct iphdr *inner_iph = skb->h.ipiph;
++
++ if (INET_ECN_is_ce(outer_iph->tos) &&
++ INET_ECN_is_not_ce(inner_iph->tos))
++ IP_ECN_set_ce(inner_iph);
++}
++
++static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
++{
++ switch (nexthdr) {
++ case IPPROTO_IPIP:
++ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
++ return -EINVAL;
++ *spi = skb->nh.iph->saddr;
++ *seq = 0;
++ return 0;
++ }
++
++ return xfrm_parse_spi(skb, nexthdr, spi, seq);
++}
++
++int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
++{
++ int err;
++ u32 spi, seq;
++ struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
++ struct xfrm_state *x;
++ int xfrm_nr = 0;
++ int decaps = 0;
++
++ if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
++ goto drop;
++
++ do {
++ struct iphdr *iph = skb->nh.iph;
++
++ if (xfrm_nr == XFRM_MAX_DEPTH)
++ goto drop;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
++ if (x == NULL)
++ goto drop;
++
++ spin_lock(&x->lock);
++ if (unlikely(x->km.state != XFRM_STATE_VALID))
++ goto drop_unlock;
++
++ if (x->props.replay_window && xfrm_replay_check(x, seq))
++ goto drop_unlock;
++
++ if (xfrm_state_check_expire(x))
++ goto drop_unlock;
++
++ xfrm_vec[xfrm_nr].decap.decap_type = encap_type;
++ if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb))
++ goto drop_unlock;
++
++ /* only the first xfrm gets the encap type */
++ encap_type = 0;
++
++ if (x->props.replay_window)
++ xfrm_replay_advance(x, seq);
++
++ x->curlft.bytes += skb->len;
++ x->curlft.packets++;
++
++ spin_unlock(&x->lock);
++
++ xfrm_vec[xfrm_nr++].xvec = x;
++
++ iph = skb->nh.iph;
++
++ if (x->props.mode) {
++ if (iph->protocol != IPPROTO_IPIP)
++ goto drop;
++ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
++ goto drop;
++ if (skb_cloned(skb) &&
++ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ goto drop;
++ if (!(x->props.flags & XFRM_STATE_NOECN))
++ ipip_ecn_decapsulate(skb);
++ skb->mac.raw = memmove(skb->data - skb->mac_len,
++ skb->mac.raw, skb->mac_len);
++ skb->nh.raw = skb->data;
++ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
++ decaps = 1;
++ break;
++ }
++
++ if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
++ goto drop;
++ } while (!err);
++
++ /* Allocate new secpath or COW existing one. */
++
++ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
++ struct sec_path *sp;
++ sp = secpath_dup(skb->sp);
++ if (!sp)
++ goto drop;
++ if (skb->sp)
++ secpath_put(skb->sp);
++ skb->sp = sp;
++ }
++ if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
++ goto drop;
++
++ memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
++ skb->sp->len += xfrm_nr;
++
++ if (decaps) {
++ if (!(skb->dev->flags&IFF_LOOPBACK)) {
++ dst_release(skb->dst);
++ skb->dst = NULL;
++ }
++ netif_rx(skb);
++ return 0;
++ } else {
++ return -skb->nh.iph->protocol;
++ }
++
++drop_unlock:
++ spin_unlock(&x->lock);
++ xfrm_state_put(x);
++drop:
++ while (--xfrm_nr >= 0)
++ xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
++
++ kfree_skb(skb);
++ return 0;
++}
+diff -Nru a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/xfrm4_output.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,138 @@
++/*
++ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
++ * Copyright (c) 2004 Herbert Xu <herbert at gondor.apana.org.au>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <linux/spinlock.h>
++#include <net/inet_ecn.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/icmp.h>
++
++/* Add encapsulation header.
++ *
++ * In transport mode, the IP header will be moved forward to make space
++ * for the encapsulation header.
++ *
++ * In tunnel mode, the top IP header will be constructed per RFC 2401.
++ * The following fields in it shall be filled in by x->type->output:
++ * tot_len
++ * check
++ *
++ * On exit, skb->h will be set to the start of the payload to be processed
++ * by x->type->output and skb->nh will be set to the top IP header.
++ */
++static void xfrm4_encap(struct sk_buff *skb)
++{
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct iphdr *iph, *top_iph;
++
++ iph = skb->nh.iph;
++ skb->h.ipiph = iph;
++
++ skb->nh.raw = skb_push(skb, x->props.header_len);
++ top_iph = skb->nh.iph;
++
++ if (!x->props.mode) {
++ skb->h.raw += iph->ihl*4;
++ memmove(top_iph, iph, iph->ihl*4);
++ return;
++ }
++
++ top_iph->ihl = 5;
++ top_iph->version = 4;
++
++ /* DS disclosed */
++ top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
++ if (x->props.flags & XFRM_STATE_NOECN)
++ IP_ECN_clear(top_iph);
++
++ top_iph->frag_off = iph->frag_off & htons(IP_DF);
++ if (!top_iph->frag_off)
++ __ip_select_ident(top_iph, dst);
++
++ top_iph->ttl = dst_path_metric(dst, RTAX_HOPLIMIT);
++
++ top_iph->saddr = x->props.saddr.a4;
++ top_iph->daddr = x->id.daddr.a4;
++ top_iph->protocol = IPPROTO_IPIP;
++
++ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
++}
++
++static int xfrm4_tunnel_check_size(struct sk_buff *skb)
++{
++ int mtu, ret = 0;
++ struct dst_entry *dst;
++ struct iphdr *iph = skb->nh.iph;
++
++ if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
++ goto out;
++
++ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
++
++ if (!(iph->frag_off & htons(IP_DF)))
++ goto out;
++
++ dst = skb->dst;
++ mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len;
++ if (skb->len > mtu) {
++ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
++ ret = -EMSGSIZE;
++ }
++out:
++ return ret;
++}
++
++int xfrm4_output(struct sk_buff *skb)
++{
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ int err;
++
++ if (skb->ip_summed == CHECKSUM_HW)
++ skb_checksum_help(skb);
++
++ spin_lock_bh(&x->lock);
++ err = xfrm_state_check(x, skb);
++ if (err)
++ goto error;
++
++ if (x->props.mode) {
++ err = xfrm4_tunnel_check_size(skb);
++ if (err)
++ goto error;
++ }
++
++ xfrm4_encap(skb);
++
++ err = x->type->output(skb);
++ if (err)
++ goto error;
++
++ x->curlft.bytes += skb->len;
++ x->curlft.packets++;
++
++ spin_unlock_bh(&x->lock);
++
++ if (!(skb->dst = dst_pop(dst))) {
++ err = -EHOSTUNREACH;
++ goto error_nolock;
++ }
++ err = NET_XMIT_BYPASS;
++
++out_exit:
++ return err;
++error:
++ spin_unlock_bh(&x->lock);
++error_nolock:
++ kfree_skb(skb);
++ goto out_exit;
++}
+diff -Nru a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/xfrm4_policy.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,288 @@
++/*
++ * xfrm4_policy.c
++ *
++ * Changes:
++ * Kazunori MIYAZAWA @USAGI
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific portion
++ *
++ */
++
++#include <linux/config.h>
++#include <net/xfrm.h>
++#include <net/ip.h>
++
++static struct dst_ops xfrm4_dst_ops;
++static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
++
++static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
++
++static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
++{
++ return __ip_route_output_key((struct rtable**)dst, fl);
++}
++
++/* Check that the bundle accepts the flow and its components are
++ * still valid.
++ */
++
++static int __xfrm4_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
++{
++ do {
++ if (xdst->u.dst.ops != &xfrm4_dst_ops)
++ return 1;
++
++ if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET))
++ return 0;
++ if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
++ xdst->u.dst.path->obsolete > 0)
++ return 0;
++ xdst = (struct xfrm_dst*)xdst->u.dst.child;
++ } while (xdst);
++ return 0;
++}
++
++static struct dst_entry *
++__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
++{
++ struct dst_entry *dst;
++
++ read_lock_bh(&policy->lock);
++ for (dst = policy->bundles; dst; dst = dst->next) {
++ struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
++ if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
++ xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
++ xdst->u.rt.fl.fl4_src == fl->fl4_src &&
++ __xfrm4_bundle_ok(xdst, fl)) {
++ dst_clone(dst);
++ break;
++ }
++ }
++ read_unlock_bh(&policy->lock);
++ return dst;
++}
++
++/* Allocate chain of dst_entry's, attach known xfrm's, calculate
++ * all the metrics... Shortly, bundle a bundle.
++ */
++
++static int
++__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
++ struct flowi *fl, struct dst_entry **dst_p)
++{
++ struct dst_entry *dst, *dst_prev;
++ struct rtable *rt0 = (struct rtable*)(*dst_p);
++ struct rtable *rt = rt0;
++ u32 remote = fl->fl4_dst;
++ u32 local = fl->fl4_src;
++ int i;
++ int err;
++ int header_len = 0;
++ int trailer_len = 0;
++
++ dst = dst_prev = NULL;
++
++ for (i = 0; i < nx; i++) {
++ struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
++
++ if (unlikely(dst1 == NULL)) {
++ err = -ENOBUFS;
++ goto error;
++ }
++
++ if (!dst)
++ dst = dst1;
++ else {
++ dst_prev->child = dst1;
++ dst1->flags |= DST_NOHASH;
++ dst_clone(dst1);
++ }
++ dst_prev = dst1;
++ if (xfrm[i]->props.mode) {
++ remote = xfrm[i]->id.daddr.a4;
++ local = xfrm[i]->props.saddr.a4;
++ }
++ header_len += xfrm[i]->props.header_len;
++ trailer_len += xfrm[i]->props.trailer_len;
++ }
++
++ if (remote != fl->fl4_dst) {
++ struct flowi fl_tunnel = { .nl_u = { .ip4_u =
++ { .daddr = remote,
++ .saddr = local }
++ }
++ };
++ err = xfrm_dst_lookup((struct xfrm_dst**)&rt, &fl_tunnel, AF_INET);
++ if (err)
++ goto error;
++ } else {
++ dst_hold(&rt->u.dst);
++ }
++ dst_prev->child = &rt->u.dst;
++ i = 0;
++ for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
++ struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
++ x->u.rt.fl = *fl;
++
++ dst_prev->xfrm = xfrm[i++];
++ dst_prev->dev = rt->u.dst.dev;
++ if (rt->u.dst.dev)
++ dev_hold(rt->u.dst.dev);
++ dst_prev->obsolete = -1;
++ dst_prev->flags |= DST_HOST;
++ dst_prev->lastuse = jiffies;
++ dst_prev->header_len = header_len;
++ dst_prev->trailer_len = trailer_len;
++ memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
++ dst_prev->path = &rt->u.dst;
++
++ /* Copy neighbour for reachability confirmation */
++ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
++ dst_prev->input = rt->u.dst.input;
++ dst_prev->output = xfrm4_output;
++ if (rt->peer)
++ atomic_inc(&rt->peer->refcnt);
++ x->u.rt.peer = rt->peer;
++ /* Sheit... I remember I did this right. Apparently,
++ * it was magically lost, so this code needs audit */
++ x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
++ x->u.rt.rt_type = rt->rt_type;
++ x->u.rt.rt_src = rt0->rt_src;
++ x->u.rt.rt_dst = rt0->rt_dst;
++ x->u.rt.rt_gateway = rt->rt_gateway;
++ x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
++ header_len -= x->u.dst.xfrm->props.header_len;
++ trailer_len -= x->u.dst.xfrm->props.trailer_len;
++ }
++ *dst_p = dst;
++ return 0;
++
++error:
++ if (dst)
++ dst_free(dst);
++ return err;
++}
++
++static void
++_decode_session4(struct sk_buff *skb, struct flowi *fl)
++{
++ struct iphdr *iph = skb->nh.iph;
++ u8 *xprth = skb->nh.raw + iph->ihl*4;
++
++ memset(fl, 0, sizeof(struct flowi));
++ if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
++ switch (iph->protocol) {
++ case IPPROTO_UDP:
++ case IPPROTO_TCP:
++ case IPPROTO_SCTP:
++ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
++ u16 *ports = (u16 *)xprth;
++
++ fl->fl_ip_sport = ports[0];
++ fl->fl_ip_dport = ports[1];
++ }
++ break;
++
++ case IPPROTO_ICMP:
++ if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
++ u8 *icmp = xprth;
++
++ fl->fl_icmp_type = icmp[0];
++ fl->fl_icmp_code = icmp[1];
++ }
++ break;
++
++ case IPPROTO_ESP:
++ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
++ u32 *ehdr = (u32 *)xprth;
++
++ fl->fl_ipsec_spi = ehdr[0];
++ }
++ break;
++
++ case IPPROTO_AH:
++ if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
++ u32 *ah_hdr = (u32*)xprth;
++
++ fl->fl_ipsec_spi = ah_hdr[1];
++ }
++ break;
++
++ case IPPROTO_COMP:
++ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
++ u16 *ipcomp_hdr = (u16 *)xprth;
++
++ fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1]));
++ }
++ break;
++ default:
++ fl->fl_ipsec_spi = 0;
++ break;
++ };
++ }
++ fl->proto = iph->protocol;
++ fl->fl4_dst = iph->daddr;
++ fl->fl4_src = iph->saddr;
++}
++
++static inline int xfrm4_garbage_collect(void)
++{
++ read_lock(&xfrm4_policy_afinfo.lock);
++ xfrm4_policy_afinfo.garbage_collect();
++ read_unlock(&xfrm4_policy_afinfo.lock);
++ return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
++}
++
++static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
++{
++ struct dst_entry *path = dst->path;
++
++ if (mtu < 68 + dst->header_len)
++ return;
++
++ path->ops->update_pmtu(path, mtu);
++}
++
++static struct dst_ops xfrm4_dst_ops = {
++ .family = AF_INET,
++ .protocol = __constant_htons(ETH_P_IP),
++ .gc = xfrm4_garbage_collect,
++ .update_pmtu = xfrm4_update_pmtu,
++ .gc_thresh = 1024,
++ .entry_size = sizeof(struct xfrm_dst),
++};
++
++static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
++ .family = AF_INET,
++ .lock = RW_LOCK_UNLOCKED,
++ .type_map = &xfrm4_type_map,
++ .dst_ops = &xfrm4_dst_ops,
++ .dst_lookup = xfrm4_dst_lookup,
++ .find_bundle = __xfrm4_find_bundle,
++ .bundle_create = __xfrm4_bundle_create,
++ .decode_session = _decode_session4,
++};
++
++static void __init xfrm4_policy_init(void)
++{
++ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
++}
++
++static void __exit xfrm4_policy_fini(void)
++{
++ xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
++}
++
++void __init xfrm4_init(void)
++{
++ xfrm4_state_init();
++ xfrm4_policy_init();
++}
++
++void __exit xfrm4_fini(void)
++{
++ //xfrm4_input_fini();
++ xfrm4_policy_fini();
++ xfrm4_state_fini();
++}
++
+diff -Nru a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/xfrm4_state.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,126 @@
++/*
++ * xfrm4_state.c
++ *
++ * Changes:
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific portion
++ *
++ */
++
++#include <net/xfrm.h>
++#include <linux/pfkeyv2.h>
++#include <linux/ipsec.h>
++
++extern struct xfrm_state_afinfo xfrm4_state_afinfo;
++
++static void
++__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
++ struct xfrm_tmpl *tmpl,
++ xfrm_address_t *daddr, xfrm_address_t *saddr)
++{
++ x->sel.daddr.a4 = fl->fl4_dst;
++ x->sel.saddr.a4 = fl->fl4_src;
++ x->sel.dport = fl->fl_ip_dport;
++ x->sel.dport_mask = ~0;
++ x->sel.sport = fl->fl_ip_sport;
++ x->sel.sport_mask = ~0;
++ x->sel.prefixlen_d = 32;
++ x->sel.prefixlen_s = 32;
++ x->sel.proto = fl->proto;
++ x->sel.ifindex = fl->oif;
++ x->id = tmpl->id;
++ if (x->id.daddr.a4 == 0)
++ x->id.daddr.a4 = daddr->a4;
++ x->props.saddr = tmpl->saddr;
++ if (x->props.saddr.a4 == 0)
++ x->props.saddr.a4 = saddr->a4;
++ x->props.mode = tmpl->mode;
++ x->props.reqid = tmpl->reqid;
++ x->props.family = AF_INET;
++}
++
++static struct xfrm_state *
++__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
++{
++ unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
++ struct xfrm_state *x;
++
++ list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
++ if (x->props.family == AF_INET &&
++ spi == x->id.spi &&
++ daddr->a4 == x->id.daddr.a4 &&
++ proto == x->id.proto) {
++ xfrm_state_hold(x);
++ return x;
++ }
++ }
++ return NULL;
++}
++
++static struct xfrm_state *
++__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ int create)
++{
++ struct xfrm_state *x, *x0;
++ unsigned h = __xfrm4_dst_hash(daddr);
++
++ x0 = NULL;
++
++ list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
++ if (x->props.family == AF_INET &&
++ daddr->a4 == x->id.daddr.a4 &&
++ mode == x->props.mode &&
++ proto == x->id.proto &&
++ saddr->a4 == x->props.saddr.a4 &&
++ reqid == x->props.reqid &&
++ x->km.state == XFRM_STATE_ACQ &&
++ !x->id.spi) {
++ x0 = x;
++ break;
++ }
++ }
++ if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
++ x0->sel.daddr.a4 = daddr->a4;
++ x0->sel.saddr.a4 = saddr->a4;
++ x0->sel.prefixlen_d = 32;
++ x0->sel.prefixlen_s = 32;
++ x0->props.saddr.a4 = saddr->a4;
++ x0->km.state = XFRM_STATE_ACQ;
++ x0->id.daddr.a4 = daddr->a4;
++ x0->id.proto = proto;
++ x0->props.family = AF_INET;
++ x0->props.mode = mode;
++ x0->props.reqid = reqid;
++ x0->props.family = AF_INET;
++ x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
++ xfrm_state_hold(x0);
++ x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
++ add_timer(&x0->timer);
++ xfrm_state_hold(x0);
++ list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
++ wake_up(&km_waitq);
++ }
++ if (x0)
++ xfrm_state_hold(x0);
++ return x0;
++}
++
++static struct xfrm_state_afinfo xfrm4_state_afinfo = {
++ .family = AF_INET,
++ .lock = RW_LOCK_UNLOCKED,
++ .init_tempsel = __xfrm4_init_tempsel,
++ .state_lookup = __xfrm4_state_lookup,
++ .find_acq = __xfrm4_find_acq,
++};
++
++void __init xfrm4_state_init(void)
++{
++ xfrm_state_register_afinfo(&xfrm4_state_afinfo);
++}
++
++void __exit xfrm4_state_fini(void)
++{
++ xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
++}
++
+diff -Nru a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv4/xfrm4_tunnel.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,144 @@
++/* xfrm4_tunnel.c: Generic IP tunnel transformer.
++ *
++ * Copyright (C) 2003 David S. Miller (davem at redhat.com)
++ */
++
++#include <linux/skbuff.h>
++#include <linux/module.h>
++#include <net/xfrm.h>
++#include <net/ip.h>
++#include <net/protocol.h>
++
++static int ipip_output(struct sk_buff *skb)
++{
++ struct iphdr *iph;
++
++ iph = skb->nh.iph;
++ iph->tot_len = htons(skb->len);
++ ip_send_check(iph);
++
++ return 0;
++}
++
++static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ return 0;
++}
++
++static struct xfrm_tunnel *ipip_handler;
++static DECLARE_MUTEX(xfrm4_tunnel_sem);
++
++int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
++{
++ int ret;
++
++ down(&xfrm4_tunnel_sem);
++ ret = 0;
++ if (ipip_handler != NULL)
++ ret = -EINVAL;
++ if (!ret)
++ ipip_handler = handler;
++ up(&xfrm4_tunnel_sem);
++
++ return ret;
++}
++
++EXPORT_SYMBOL(xfrm4_tunnel_register);
++
++int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
++{
++ int ret;
++
++ down(&xfrm4_tunnel_sem);
++ ret = 0;
++ if (ipip_handler != handler)
++ ret = -EINVAL;
++ if (!ret)
++ ipip_handler = NULL;
++ up(&xfrm4_tunnel_sem);
++
++ synchronize_net();
++
++ return ret;
++}
++
++EXPORT_SYMBOL(xfrm4_tunnel_deregister);
++
++static int ipip_rcv(struct sk_buff *skb)
++{
++ struct xfrm_tunnel *handler = ipip_handler;
++
++ /* Tunnel devices take precedence. */
++ if (handler && handler->handler(skb) == 0)
++ return 0;
++
++ return xfrm4_rcv(skb);
++}
++
++static void ipip_err(struct sk_buff *skb, u32 info)
++{
++ struct xfrm_tunnel *handler = ipip_handler;
++ u32 arg = info;
++
++ if (handler)
++ handler->err_handler(skb, &arg);
++}
++
++static int ipip_init_state(struct xfrm_state *x, void *args)
++{
++ if (!x->props.mode)
++ return -EINVAL;
++
++ if (x->encap)
++ return -EINVAL;
++
++ x->props.header_len = sizeof(struct iphdr);
++
++ return 0;
++}
++
++static void ipip_destroy(struct xfrm_state *x)
++{
++}
++
++static struct xfrm_type ipip_type = {
++ .description = "IPIP",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_IPIP,
++ .init_state = ipip_init_state,
++ .destructor = ipip_destroy,
++ .input = ipip_xfrm_rcv,
++ .output = ipip_output
++};
++
++static struct inet_protocol ipip_protocol = {
++ .handler = ipip_rcv,
++ .err_handler = ipip_err,
++ .no_policy = 1,
++};
++
++static int __init ipip_init(void)
++{
++ if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
++ printk(KERN_INFO "ipip init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) {
++ printk(KERN_INFO "ipip init: can't add protocol\n");
++ xfrm_unregister_type(&ipip_type, AF_INET);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++static void __exit ipip_fini(void)
++{
++ if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0)
++ printk(KERN_INFO "ipip close: can't remove protocol\n");
++ if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
++ printk(KERN_INFO "ipip close: can't remove xfrm type\n");
++}
++
++module_init(ipip_init);
++module_exit(ipip_fini);
++MODULE_LICENSE("GPL");
+diff -Nru a/net/ipv6/Config.in b/net/ipv6/Config.in
+--- a/net/ipv6/Config.in 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/Config.in 2005-02-13 21:25:09 +11:00
+@@ -2,9 +2,23 @@
+ # IPv6 configuration
+ #
+
+-#bool ' IPv6: flow policy support' CONFIG_RT6_POLICY
+-#bool ' IPv6: firewall support' CONFIG_IPV6_FIREWALL
++bool 'IPv6: Privacy Extensions (RFC 3041) support' CONFIG_IPV6_PRIVACY
+
+ if [ "$CONFIG_NETFILTER" != "n" ]; then
+ source net/ipv6/netfilter/Config.in
++fi
++
++tristate 'IPv6: AH transformation' CONFIG_INET6_AH
++tristate 'IPv6: ESP transformation' CONFIG_INET6_ESP
++tristate 'IPv6: IPComp transformation' CONFIG_INET6_IPCOMP
++
++tristate 'IPv6: IPv6-in-IPv6 tunnel' CONFIG_IPV6_TUNNEL
++if [ "$CONFIG_IPV6_TUNNEL" = "y" -o "$CONFIG_INET6_IPCOMP" = "y" ]; then
++ define_tristate CONFIG_INET6_TUNNEL y
++else
++ if [ "$CONFIG_IPV6_TUNNEL" = "m" -o "$CONFIG_INET6_IPCOMP" = "m" ]; then
++ define_tristate CONFIG_INET6_TUNNEL m
++ else
++ tristate 'IPv6: tunnel transformation' CONFIG_INET6_TUNNEL
++ fi
+ fi
+diff -Nru a/net/ipv6/Makefile b/net/ipv6/Makefile
+--- a/net/ipv6/Makefile 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/Makefile 2005-02-13 21:25:09 +11:00
+@@ -9,16 +9,45 @@
+
+ O_TARGET := ipv6.o
+
+-obj-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
++mod-subdirs := netfilter
++
++ifeq ($(CONFIG_IPV6),m)
++obj-m += ipv6.o
++endif
++
++ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
+ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
+ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
+ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
+ ip6_flowlabel.o ipv6_syms.o
+
+-export-objs := ipv6_syms.o
++export-objs := ipv6_syms.o xfrm6_input.o xfrm6_tunnel.o
++
++ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
++ xfrm6_output.o
++ipv6-objs += $(ipv6-y)
++
++obj-$(CONFIG_INET6_AH) += ah6.o
++obj-$(CONFIG_INET6_ESP) += esp6.o
++obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
++obj-$(CONFIG_INET6_TUNNEL) += xfrm6_tunnel.o
+
+-obj-m := $(O_TARGET)
++obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+
+-#obj-$(CONFIG_IPV6_FIREWALL) += ip6_fw.o
++subdir-$(CONFIG_NETFILTER) += netfilter
++
++ifeq ($(CONFIG_NETFILTER),y)
++obj-y += netfilter/netfilter.o
++endif
++
++ifeq ($(CONFIG_IPV6),y)
++obj-y += $(ipv6-objs)
++endif
+
+ include $(TOPDIR)/Rules.make
++
++
++ifeq ($(CONFIG_IPV6),m)
++ipv6.o: $(ipv6-objs)
++ $(LD) -r -o $@ $(ipv6-objs)
++endif
+diff -Nru a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+--- a/net/ipv6/addrconf.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv6/addrconf.c 2005-02-13 21:25:10 +11:00
+@@ -28,6 +28,8 @@
+ * packets.
+ * YOSHIFUJI Hideaki @USAGI : improved accuracy of
+ * address validation timer.
++ * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041)
++ * support.
+ * Yuji SEKIYA @USAGI : Don't assign a same IPv6
+ * address on a same interface.
+ * YOSHIFUJI Hideaki @USAGI : ARCnet support
+@@ -66,6 +68,12 @@
+ #include <linux/if_tunnel.h>
+ #include <linux/rtnetlink.h>
+
++#ifdef CONFIG_IPV6_PRIVACY
++#include <linux/random.h>
++#include <linux/crypto.h>
++#include <asm/scatterlist.h>
++#endif
++
+ #include <asm/uaccess.h>
+
+ #define IPV6_MAX_ADDRESSES 16
+@@ -87,6 +95,18 @@
+ int inet6_dev_count;
+ int inet6_ifa_count;
+
++#ifdef CONFIG_IPV6_PRIVACY
++static int __ipv6_regen_rndid(struct inet6_dev *idev);
++static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
++static void ipv6_regen_rndid(unsigned long data);
++
++static int desync_factor = MAX_DESYNC_FACTOR * HZ;
++static struct crypto_tfm *md5_tfm;
++static spinlock_t md5_tfm_lock = SPIN_LOCK_UNLOCKED;
++#endif
++
++static int ipv6_count_addresses(struct inet6_dev *idev);
++
+ /*
+ * Configured unicast address hash table
+ */
+@@ -125,6 +145,13 @@
+ MAX_RTR_SOLICITATIONS, /* router solicits */
+ RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */
+ MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */
++#ifdef CONFIG_IPV6_PRIVACY
++ .use_tempaddr = 0,
++ .temp_valid_lft = TEMP_VALID_LIFETIME,
++ .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
++ .regen_max_retry = REGEN_MAX_RETRY,
++ .max_desync_factor = MAX_DESYNC_FACTOR,
++#endif
+ };
+
+ static struct ipv6_devconf ipv6_devconf_dflt =
+@@ -139,6 +166,13 @@
+ MAX_RTR_SOLICITATIONS, /* router solicits */
+ RTR_SOLICITATION_INTERVAL, /* rtr solicit interval */
+ MAX_RTR_SOLICITATION_DELAY, /* rtr solicit delay */
++#ifdef CONFIG_IPV6_PRIVACY
++ .use_tempaddr = 0,
++ .temp_valid_lft = TEMP_VALID_LIFETIME,
++ .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
++ .regen_max_retry = REGEN_MAX_RETRY,
++ .max_desync_factor = MAX_DESYNC_FACTOR,
++#endif
+ };
+
+ /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+@@ -170,15 +204,8 @@
+ };
+ return type;
+ }
+- /* check for reserved anycast addresses */
+-
+- if ((st & htonl(0xE0000000)) &&
+- ((addr->s6_addr32[2] == htonl(0xFDFFFFFF) &&
+- (addr->s6_addr32[3] | htonl(0x7F)) == (u32)~0) ||
+- (addr->s6_addr32[2] == 0 && addr->s6_addr32[3] == 0)))
+- type = IPV6_ADDR_ANYCAST;
+- else
+- type = IPV6_ADDR_UNICAST;
++
++ type = IPV6_ADDR_UNICAST;
+
+ /* Consider all addresses with the first three bits different of
+ 000 and 111 as finished.
+@@ -299,10 +326,32 @@
+ /* We refer to the device */
+ dev_hold(dev);
+
++ /* One reference from device. We must do this before
++ * we invoke __ipv6_regen_rndid().
++ */
++ in6_dev_hold(ndev);
++
++#ifdef CONFIG_IPV6_PRIVACY
++ get_random_bytes(ndev->rndid, sizeof(ndev->rndid));
++ get_random_bytes(ndev->entropy, sizeof(ndev->entropy));
++ init_timer(&ndev->regen_timer);
++ ndev->regen_timer.function = ipv6_regen_rndid;
++ ndev->regen_timer.data = (unsigned long) ndev;
++ if ((dev->flags&IFF_LOOPBACK) ||
++ dev->type == ARPHRD_TUNNEL ||
++ dev->type == ARPHRD_SIT) {
++ printk(KERN_INFO
++ "Disabled Privacy Extensions on device %p(%s)\n",
++ dev, dev->name);
++ ndev->cnf.use_tempaddr = -1;
++ } else {
++ in6_dev_hold(ndev);
++ ipv6_regen_rndid((unsigned long) ndev);
++ }
++#endif
++
+ write_lock_bh(&addrconf_lock);
+ dev->ip6_ptr = ndev;
+- /* One reference from device */
+- in6_dev_hold(ndev);
+ write_unlock_bh(&addrconf_lock);
+
+ ipv6_mc_init_dev(ndev);
+@@ -330,38 +379,6 @@
+ return idev;
+ }
+
+-void ipv6_addr_prefix(struct in6_addr *prefix,
+- struct in6_addr *addr, int prefix_len)
+-{
+- unsigned long mask;
+- int ncopy, nbits;
+-
+- memset(prefix, 0, sizeof(*prefix));
+-
+- if (prefix_len <= 0)
+- return;
+- if (prefix_len > 128)
+- prefix_len = 128;
+-
+- ncopy = prefix_len / 32;
+- switch (ncopy) {
+- case 4: prefix->s6_addr32[3] = addr->s6_addr32[3];
+- case 3: prefix->s6_addr32[2] = addr->s6_addr32[2];
+- case 2: prefix->s6_addr32[1] = addr->s6_addr32[1];
+- case 1: prefix->s6_addr32[0] = addr->s6_addr32[0];
+- case 0: break;
+- }
+- nbits = prefix_len % 32;
+- if (nbits == 0)
+- return;
+-
+- mask = ~((1 << (32 - nbits)) - 1);
+- mask = htonl(mask);
+-
+- prefix->s6_addr32[ncopy] = addr->s6_addr32[ncopy] & mask;
+-}
+-
+-
+ static void dev_forward_change(struct inet6_dev *idev)
+ {
+ struct net_device *dev;
+@@ -501,6 +518,18 @@
+ /* Add to inet6_dev unicast addr list. */
+ ifa->if_next = idev->addr_list;
+ idev->addr_list = ifa;
++
++#ifdef CONFIG_IPV6_PRIVACY
++ ifa->regen_count = 0;
++ if (ifa->flags&IFA_F_TEMPORARY) {
++ ifa->tmp_next = idev->tempaddr_list;
++ idev->tempaddr_list = ifa;
++ in6_ifa_hold(ifa);
++ } else {
++ ifa->tmp_next = NULL;
++ }
++#endif
++
+ in6_ifa_hold(ifa);
+ write_unlock_bh(&idev->lock);
+ read_unlock(&addrconf_lock);
+@@ -523,6 +552,15 @@
+
+ ifp->dead = 1;
+
++#ifdef CONFIG_IPV6_PRIVACY
++ spin_lock_bh(&ifp->lock);
++ if (ifp->ifpub) {
++ __in6_ifa_put(ifp->ifpub);
++ ifp->ifpub = NULL;
++ }
++ spin_unlock_bh(&ifp->lock);
++#endif
++
+ write_lock_bh(&addrconf_hash_lock);
+ for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL;
+ ifap = &ifa->lst_next) {
+@@ -536,6 +574,24 @@
+ write_unlock_bh(&addrconf_hash_lock);
+
+ write_lock_bh(&idev->lock);
++#ifdef CONFIG_IPV6_PRIVACY
++ if (ifp->flags&IFA_F_TEMPORARY) {
++ for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL;
++ ifap = &ifa->tmp_next) {
++ if (ifa == ifp) {
++ *ifap = ifa->tmp_next;
++ if (ifp->ifpub) {
++ __in6_ifa_put(ifp->ifpub);
++ ifp->ifpub = NULL;
++ }
++ __in6_ifa_put(ifp);
++ ifa->tmp_next = NULL;
++ break;
++ }
++ }
++ }
++#endif
++
+ for (ifap = &idev->addr_list; (ifa=*ifap) != NULL;
+ ifap = &ifa->if_next) {
+ if (ifa == ifp) {
+@@ -556,6 +612,96 @@
+ in6_ifa_put(ifp);
+ }
+
++#ifdef CONFIG_IPV6_PRIVACY
++static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift)
++{
++ struct inet6_dev *idev;
++ struct in6_addr addr, *tmpaddr;
++ unsigned long tmp_prefered_lft, tmp_valid_lft;
++ int tmp_plen;
++ int ret = 0;
++
++ if (ift) {
++ spin_lock_bh(&ift->lock);
++ memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
++ spin_unlock_bh(&ift->lock);
++ tmpaddr = &addr;
++ } else {
++ tmpaddr = NULL;
++ }
++retry:
++ spin_lock_bh(&ifp->lock);
++ in6_ifa_hold(ifp);
++ idev = ifp->idev;
++ in6_dev_hold(idev);
++ memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
++ write_lock(&idev->lock);
++ if (idev->cnf.use_tempaddr <= 0) {
++ write_unlock(&idev->lock);
++ spin_unlock_bh(&ifp->lock);
++ printk(KERN_INFO
++ "ipv6_create_tempaddr(): use_tempaddr is disabled.\n");
++ in6_dev_put(idev);
++ in6_ifa_put(ifp);
++ ret = -1;
++ goto out;
++ }
++ if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
++ idev->cnf.use_tempaddr = -1; /*XXX*/
++ write_unlock(&idev->lock);
++ spin_unlock_bh(&ifp->lock);
++ printk(KERN_WARNING
++ "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n");
++ in6_dev_put(idev);
++ in6_ifa_put(ifp);
++ ret = -1;
++ goto out;
++ }
++ if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) {
++ write_unlock(&idev->lock);
++ spin_unlock_bh(&ifp->lock);
++ printk(KERN_WARNING
++ "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n");
++ in6_dev_put(idev);
++ in6_ifa_put(ifp);
++ ret = -1;
++ goto out;
++ }
++ memcpy(&addr.s6_addr[8], idev->rndid, 8);
++ tmp_valid_lft = min_t(__u32,
++ ifp->valid_lft,
++ idev->cnf.temp_valid_lft);
++ tmp_prefered_lft = min_t(__u32,
++ ifp->prefered_lft,
++ idev->cnf.temp_prefered_lft - desync_factor / HZ);
++ tmp_plen = ifp->prefix_len;
++ write_unlock(&idev->lock);
++ spin_unlock_bh(&ifp->lock);
++ ift = ipv6_count_addresses(idev) < IPV6_MAX_ADDRESSES ?
++ ipv6_add_addr(idev, &addr, tmp_plen,
++ ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : 0;
++ if (!ift || IS_ERR(ift)) {
++ in6_dev_put(idev);
++ in6_ifa_put(ifp);
++ printk(KERN_INFO
++ "ipv6_create_tempaddr(): retry temporary address regeneration.\n");
++ tmpaddr = &addr;
++ goto retry;
++ }
++ spin_lock_bh(&ift->lock);
++ ift->ifpub = ifp;
++ ift->valid_lft = tmp_valid_lft;
++ ift->prefered_lft = tmp_prefered_lft;
++ ift->tstamp = ifp->tstamp;
++ spin_unlock_bh(&ift->lock);
++ addrconf_dad_start(ift, 0);
++ in6_ifa_put(ift);
++ in6_dev_put(idev);
++out:
++ return ret;
++}
++#endif
++
+ /*
+ * Choose an apropriate source address
+ * should do:
+@@ -564,6 +710,22 @@
+ * an address of the attached interface
+ * iii) don't use deprecated addresses
+ */
++static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref)
++{
++ int pref;
++ pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2;
++#ifdef CONFIG_IPV6_PRIVACY
++ pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1;
++#endif
++ return pref;
++}
++
++#ifdef CONFIG_IPV6_PRIVACY
++#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3)
++#else
++#define IPV6_GET_SADDR_MAXSCORE(score) (score)
++#endif
++
+ int ipv6_dev_get_saddr(struct net_device *dev,
+ struct in6_addr *daddr, struct in6_addr *saddr, int onlink)
+ {
+@@ -572,6 +734,7 @@
+ struct inet6_dev *idev;
+ int scope;
+ int err;
++ int hiscore = -1, score;
+
+
+ if (!onlink)
+@@ -594,17 +757,27 @@
+ read_lock_bh(&idev->lock);
+ for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
+ if (ifp->scope == scope) {
+- if (!(ifp->flags & (IFA_F_DEPRECATED|IFA_F_TENTATIVE))) {
+- in6_ifa_hold(ifp);
++ if (ifp->flags&IFA_F_TENTATIVE)
++ continue;
++#ifdef CONFIG_IPV6_PRIVACY
++ score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0);
++#else
++ score = ipv6_saddr_pref(ifp, 0);
++#endif
++ if (score <= hiscore)
++ continue;
++
++ if (match)
++ in6_ifa_put(match);
++ match = ifp;
++ hiscore = score;
++ in6_ifa_hold(ifp);
++
++ if (IPV6_GET_SADDR_MAXSCORE(score)) {
+ read_unlock_bh(&idev->lock);
+ read_unlock(&addrconf_lock);
+ goto out;
+ }
+-
+- if (!match && !(ifp->flags & IFA_F_TENTATIVE)) {
+- match = ifp;
+- in6_ifa_hold(ifp);
+- }
+ }
+ }
+ read_unlock_bh(&idev->lock);
+@@ -627,16 +800,26 @@
+ read_lock_bh(&idev->lock);
+ for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
+ if (ifp->scope == scope) {
+- if (!(ifp->flags&(IFA_F_DEPRECATED|IFA_F_TENTATIVE))) {
+- in6_ifa_hold(ifp);
++ if (ifp->flags&IFA_F_TENTATIVE)
++ continue;
++#ifdef CONFIG_IPV6_PRIVACY
++ score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0);
++#else
++ score = ipv6_saddr_pref(ifp, 0);
++#endif
++ if (score <= hiscore)
++ continue;
++
++ if (match)
++ in6_ifa_put(match);
++ match = ifp;
++ hiscore = score;
++ in6_ifa_hold(ifp);
++
++ if (IPV6_GET_SADDR_MAXSCORE(score)) {
+ read_unlock_bh(&idev->lock);
+ goto out_unlock_base;
+ }
+-
+- if (!match && !(ifp->flags&IFA_F_TENTATIVE)) {
+- match = ifp;
+- in6_ifa_hold(ifp);
+- }
+ }
+ }
+ read_unlock_bh(&idev->lock);
+@@ -648,24 +831,16 @@
+ read_unlock(&dev_base_lock);
+
+ out:
+- if (ifp == NULL) {
+- ifp = match;
+- match = NULL;
+- }
+-
+ err = -EADDRNOTAVAIL;
+- if (ifp) {
+- ipv6_addr_copy(saddr, &ifp->addr);
++ if (match) {
++ ipv6_addr_copy(saddr, &match->addr);
+ err = 0;
+- in6_ifa_put(ifp);
+- }
+- if (match)
+ in6_ifa_put(match);
++ }
+
+ return err;
+ }
+
+-
+ int ipv6_get_saddr(struct dst_entry *dst,
+ struct in6_addr *daddr, struct in6_addr *saddr)
+ {
+@@ -706,7 +881,7 @@
+ return err;
+ }
+
+-int ipv6_count_addresses(struct inet6_dev *idev)
++static int ipv6_count_addresses(struct inet6_dev *idev)
+ {
+ int cnt = 0;
+ struct inet6_ifaddr *ifp;
+@@ -785,6 +960,21 @@
+ ifp->flags |= IFA_F_TENTATIVE;
+ spin_unlock_bh(&ifp->lock);
+ in6_ifa_put(ifp);
++#ifdef CONFIG_IPV6_PRIVACY
++ } else if (ifp->flags&IFA_F_TEMPORARY) {
++ struct inet6_ifaddr *ifpub;
++ spin_lock_bh(&ifp->lock);
++ ifpub = ifp->ifpub;
++ if (ifpub) {
++ in6_ifa_hold(ifpub);
++ spin_unlock_bh(&ifp->lock);
++ ipv6_create_tempaddr(ifpub, ifp);
++ in6_ifa_put(ifpub);
++ } else {
++ spin_unlock_bh(&ifp->lock);
++ }
++ ipv6_del_addr(ifp);
++#endif
+ } else
+ ipv6_del_addr(ifp);
+ }
+@@ -857,6 +1047,110 @@
+ return err;
+ }
+
++#ifdef CONFIG_IPV6_PRIVACY
++/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
++static int __ipv6_regen_rndid(struct inet6_dev *idev)
++{
++ struct net_device *dev;
++ struct scatterlist sg[2];
++
++ sg[0].page = virt_to_page(idev->entropy);
++ sg[0].offset = ((long) idev->entropy & ~PAGE_MASK);
++ sg[0].length = 8;
++ sg[1].page = virt_to_page(idev->work_eui64);
++ sg[1].offset = ((long) idev->work_eui64 & ~PAGE_MASK);
++ sg[1].length = 8;
++
++ dev = idev->dev;
++
++ if (ipv6_generate_eui64(idev->work_eui64, dev)) {
++ printk(KERN_INFO
++ "__ipv6_regen_rndid(idev=%p): cannot get EUI64 identifier; use random bytes.\n",
++ idev);
++ get_random_bytes(idev->work_eui64, sizeof(idev->work_eui64));
++ }
++regen:
++ spin_lock(&md5_tfm_lock);
++ if (unlikely(md5_tfm == NULL)) {
++ spin_unlock(&md5_tfm_lock);
++ return -1;
++ }
++ crypto_digest_init(md5_tfm);
++ crypto_digest_update(md5_tfm, sg, 2);
++ crypto_digest_final(md5_tfm, idev->work_digest);
++ spin_unlock(&md5_tfm_lock);
++
++ memcpy(idev->rndid, &idev->work_digest[0], 8);
++ idev->rndid[0] &= ~0x02;
++ memcpy(idev->entropy, &idev->work_digest[8], 8);
++
++ /*
++ * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
++ * check if generated address is not inappropriate
++ *
++ * - Reserved subnet anycast (RFC 2526)
++ * 11111101 11....11 1xxxxxxx
++ * - ISATAP (draft-ietf-ngtrans-isatap-01.txt) 4.3
++ * 00-00-5E-FE-xx-xx-xx-xx
++ * - value 0
++ * - XXX: already assigned to an address on the device
++ */
++ if (idev->rndid[0] == 0xfd &&
++ (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) &&
++ (idev->rndid[7]&0x80))
++ goto regen;
++ if ((idev->rndid[0]|idev->rndid[1]) == 0) {
++ if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
++ goto regen;
++ if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
++ goto regen;
++ }
++
++ return 0;
++}
++
++static void ipv6_regen_rndid(unsigned long data)
++{
++ struct inet6_dev *idev = (struct inet6_dev *) data;
++ unsigned long expires;
++
++ read_lock_bh(&addrconf_lock);
++ write_lock_bh(&idev->lock);
++
++ if (idev->dead)
++ goto out;
++
++ if (__ipv6_regen_rndid(idev) < 0)
++ goto out;
++
++ expires = jiffies +
++ idev->cnf.temp_prefered_lft * HZ -
++ idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor;
++ if (time_before(expires, jiffies)) {
++ printk(KERN_WARNING
++ "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n",
++ idev->dev->name);
++ goto out;
++ }
++
++ if (!mod_timer(&idev->regen_timer, expires))
++ in6_dev_hold(idev);
++
++out:
++ write_unlock_bh(&idev->lock);
++ read_unlock_bh(&addrconf_lock);
++ in6_dev_put(idev);
++}
++
++static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) {
++ int ret = 0;
++
++ if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
++ ret = __ipv6_regen_rndid(idev);
++ return ret;
++}
++#endif
++
+ /*
+ * Add prefix route.
+ */
+@@ -883,7 +1177,7 @@
+ if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
+ rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
+
+- ip6_route_add(&rtmsg, NULL);
++ ip6_route_add(&rtmsg, NULL, NULL);
+ }
+
+ /* Create "default" multicast route to the interface */
+@@ -900,7 +1194,7 @@
+ rtmsg.rtmsg_ifindex = dev->ifindex;
+ rtmsg.rtmsg_flags = RTF_UP;
+ rtmsg.rtmsg_type = RTMSG_NEWROUTE;
+- ip6_route_add(&rtmsg, NULL);
++ ip6_route_add(&rtmsg, NULL, NULL);
+ }
+
+ static void sit_route_add(struct net_device *dev)
+@@ -917,7 +1211,7 @@
+ rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
+ rtmsg.rtmsg_ifindex = dev->ifindex;
+
+- ip6_route_add(&rtmsg, NULL);
++ ip6_route_add(&rtmsg, NULL, NULL);
+ }
+
+ static void addrconf_add_lroute(struct net_device *dev)
+@@ -948,7 +1242,6 @@
+ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
+ {
+ struct prefix_info *pinfo;
+- struct rt6_info *rt;
+ __u32 valid_lft;
+ __u32 prefered_lft;
+ int addr_type;
+@@ -1004,32 +1297,33 @@
+ else
+ rt_expires = jiffies + valid_lft * HZ;
+
+- rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1);
+-
+- if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
+- if (rt->rt6i_flags&RTF_EXPIRES) {
+- if (pinfo->onlink == 0 || valid_lft == 0) {
+- ip6_del_rt(rt, NULL);
+- rt = NULL;
+- } else {
+- rt->rt6i_expires = rt_expires;
++ if (pinfo->onlink) {
++ struct rt6_info *rt;
++ rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1);
++
++ if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
++ if (rt->rt6i_flags&RTF_EXPIRES) {
++ if (valid_lft == 0) {
++ ip6_del_rt(rt, NULL, NULL);
++ rt = NULL;
++ } else {
++ rt->rt6i_expires = rt_expires;
++ }
+ }
++ } else if (valid_lft) {
++ addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
++ dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
+ }
+- } else if (pinfo->onlink && valid_lft) {
+- addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
+- dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT);
++ if (rt)
++ dst_release(&rt->u.dst);
+ }
+- if (rt)
+- dst_release(&rt->u.dst);
+
+ /* Try to figure out our local address for this prefix */
+
+ if (pinfo->autoconf && in6_dev->cnf.autoconf) {
+ struct inet6_ifaddr * ifp;
+ struct in6_addr addr;
+- int plen;
+-
+- plen = pinfo->prefix_len >> 3;
++ int create = 0, update_lft = 0;
+
+ if (pinfo->prefix_len == 64) {
+ memcpy(&addr, &pinfo->prefix, 8);
+@@ -1058,33 +1352,95 @@
+ ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len,
+ addr_type&IPV6_ADDR_SCOPE_MASK, 0);
+
+- if (IS_ERR(ifp)) {
++ if (!ifp || IS_ERR(ifp)) {
+ in6_dev_put(in6_dev);
+ return;
+ }
+
++ update_lft = create = 1;
+ addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT);
+ }
+
+- if (ifp && valid_lft == 0) {
+- ipv6_del_addr(ifp);
+- ifp = NULL;
+- }
+-
+ if (ifp) {
+ int flags;
++ unsigned long now;
++#ifdef CONFIG_IPV6_PRIVACY
++ struct inet6_ifaddr *ift;
++#endif
++ u32 stored_lft;
+
++ /* update lifetime (RFC2462 5.5.3 e) */
+ spin_lock(&ifp->lock);
+- ifp->valid_lft = valid_lft;
+- ifp->prefered_lft = prefered_lft;
+- ifp->tstamp = jiffies;
+- flags = ifp->flags;
+- ifp->flags &= ~IFA_F_DEPRECATED;
+- spin_unlock(&ifp->lock);
+-
+- if (!(flags&IFA_F_TENTATIVE))
+- ipv6_ifa_notify((flags&IFA_F_DEPRECATED) ?
+- 0 : RTM_NEWADDR, ifp);
++ now = jiffies;
++ if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
++ stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
++ else
++ stored_lft = 0;
++ if (!update_lft && stored_lft) {
++ if (valid_lft > MIN_VALID_LIFETIME ||
++ valid_lft > stored_lft)
++ update_lft = 1;
++ else if (stored_lft <= MIN_VALID_LIFETIME) {
++ /* valid_lft <= stored_lft is always true */
++ /* XXX: IPsec */
++ update_lft = 0;
++ } else {
++ valid_lft = MIN_VALID_LIFETIME;
++ if (valid_lft < prefered_lft)
++ prefered_lft = valid_lft;
++ update_lft = 1;
++ }
++ }
++
++ if (update_lft) {
++ ifp->valid_lft = valid_lft;
++ ifp->prefered_lft = prefered_lft;
++ ifp->tstamp = now;
++ flags = ifp->flags;
++ ifp->flags &= ~IFA_F_DEPRECATED;
++ spin_unlock(&ifp->lock);
++
++ if (!(flags&IFA_F_TENTATIVE))
++ ipv6_ifa_notify((flags&IFA_F_DEPRECATED) ?
++ 0 : RTM_NEWADDR, ifp);
++ } else
++ spin_unlock(&ifp->lock);
++
++#ifdef CONFIG_IPV6_PRIVACY
++ read_lock_bh(&in6_dev->lock);
++ /* update all temporary addresses in the list */
++ for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) {
++ /*
++ * When adjusting the lifetimes of an existing
++ * temporary address, only lower the lifetimes.
++ * Implementations must not increase the
++ * lifetimes of an existing temporary address
++ * when processing a Prefix Information Option.
++ */
++ spin_lock(&ift->lock);
++ flags = ift->flags;
++ if (ift->valid_lft > valid_lft &&
++ ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ)
++ ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ;
++ if (ift->prefered_lft > prefered_lft &&
++ ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ)
++ ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ;
++ spin_unlock(&ift->lock);
++ if (!(flags&IFA_F_TENTATIVE))
++ ipv6_ifa_notify(0, ift);
++ }
++
++ if (create && in6_dev->cnf.use_tempaddr > 0) {
++ /*
++ * When a new public address is created as described in [ADDRCONF],
++ * also create a new temporary address.
++ */
++ read_unlock_bh(&in6_dev->lock);
++ ipv6_create_tempaddr(ifp, NULL);
++ } else {
++ read_unlock_bh(&in6_dev->lock);
++ }
++#endif
+ in6_ifa_put(ifp);
+ addrconf_verify(0);
+ }
+@@ -1407,6 +1763,54 @@
+ sit_route_add(dev);
+ }
+
++static inline int
++ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
++{
++ struct in6_addr lladdr;
++
++ if (!ipv6_get_lladdr(link_dev, &lladdr)) {
++ addrconf_add_linklocal(idev, &lladdr);
++ return 0;
++ }
++ return -1;
++}
++
++static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
++{
++ struct net_device *link_dev;
++
++ /* first try to inherit the link-local address from the link device */
++ if (idev->dev->iflink &&
++ (link_dev = __dev_get_by_index(idev->dev->iflink))) {
++ if (!ipv6_inherit_linklocal(idev, link_dev))
++ return;
++ }
++ /* then try to inherit it from any device */
++ for (link_dev = dev_base; link_dev; link_dev = link_dev->next) {
++ if (!ipv6_inherit_linklocal(idev, link_dev))
++ return;
++ }
++ printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n");
++}
++
++/*
++ * Autoconfigure tunnel with a link-local address so routing protocols,
++ * DHCPv6, MLD etc. can be run over the virtual link
++ */
++
++static void addrconf_ip6_tnl_config(struct net_device *dev)
++{
++ struct inet6_dev *idev;
++
++ ASSERT_RTNL();
++
++ if ((idev = addrconf_add_dev(dev)) == NULL) {
++ printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n");
++ return;
++ }
++ ip6_tnl_add_linklocal(idev);
++ addrconf_add_mroute(dev);
++}
+
+ int addrconf_notify(struct notifier_block *this, unsigned long event,
+ void * data)
+@@ -1420,7 +1824,9 @@
+ case ARPHRD_SIT:
+ addrconf_sit_config(dev);
+ break;
+-
++ case ARPHRD_TUNNEL6:
++ addrconf_ip6_tnl_config(dev);
++ break;
+ case ARPHRD_LOOPBACK:
+ init_loopback(dev);
+ break;
+@@ -1515,6 +1921,27 @@
+ /* Step 3: clear address list */
+
+ write_lock_bh(&idev->lock);
++#ifdef CONFIG_IPV6_PRIVACY
++ if (how == 1 && del_timer(&idev->regen_timer))
++ in6_dev_put(idev);
++
++ /* clear tempaddr list */
++ while ((ifa = idev->tempaddr_list) != NULL) {
++ idev->tempaddr_list = ifa->tmp_next;
++ ifa->tmp_next = NULL;
++ ifa->dead = 1;
++ write_unlock_bh(&idev->lock);
++ spin_lock_bh(&ifa->lock);
++
++ if (ifa->ifpub) {
++ in6_ifa_put(ifa->ifpub);
++ ifa->ifpub = NULL;
++ }
++ spin_unlock_bh(&ifa->lock);
++ in6_ifa_put(ifa);
++ write_lock_bh(&idev->lock);
++ }
++#endif
+ while ((ifa = idev->addr_list) != NULL) {
+ idev->addr_list = ifa->if_next;
+ ifa->if_next = NULL;
+@@ -1539,10 +1966,11 @@
+ /* Shot the device (if unregistered) */
+
+ if (how == 1) {
+- neigh_parms_release(&nd_tbl, idev->nd_parms);
+ #ifdef CONFIG_SYSCTL
+ addrconf_sysctl_unregister(&idev->cnf);
++ neigh_sysctl_unregister(idev->nd_parms);
+ #endif
++ neigh_parms_release(&nd_tbl, idev->nd_parms);
+ in6_dev_put(idev);
+ }
+ return 0;
+@@ -1592,7 +2020,7 @@
+
+ rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex;
+
+- ip6_route_add(&rtmsg, NULL);
++ ip6_route_add(&rtmsg, NULL, NULL);
+ }
+
+ out:
+@@ -1612,7 +2040,8 @@
+ addrconf_join_solict(dev, &ifp->addr);
+
+ if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT))
+- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, flags);
++ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0,
++ flags);
+
+ net_srandom(ifp->addr.s6_addr32[3]);
+ rand_num = net_random() % (ifp->idev->cnf.rtr_solicit_delay ? : 1);
+@@ -1682,6 +2111,7 @@
+ */
+
+ if (ifp->idev->cnf.forwarding == 0 &&
++ ifp->idev->cnf.rtr_solicits > 0 &&
+ (dev->flags&IFF_LOOPBACK) == 0 &&
+ (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
+ struct in6_addr all_routers;
+@@ -1787,6 +2217,9 @@
+ write_lock(&addrconf_hash_lock);
+ for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) {
+ unsigned long age;
++#ifdef CONFIG_IPV6_PRIVACY
++ unsigned long regen_advance;
++#endif
+
+ if (ifp->flags & IFA_F_PERMANENT)
+ continue;
+@@ -1794,6 +2227,12 @@
+ spin_lock(&ifp->lock);
+ age = (now - ifp->tstamp) / HZ;
+
++#ifdef CONFIG_IPV6_PRIVACY
++ regen_advance = ifp->idev->cnf.regen_max_retry *
++ ifp->idev->cnf.dad_transmits *
++ ifp->idev->nd_parms->retrans_time / HZ;
++#endif
++
+ if (age >= ifp->valid_lft) {
+ spin_unlock(&ifp->lock);
+ in6_ifa_hold(ifp);
+@@ -1822,6 +2261,28 @@
+ in6_ifa_put(ifp);
+ goto restart;
+ }
++#ifdef CONFIG_IPV6_PRIVACY
++ } else if ((ifp->flags&IFA_F_TEMPORARY) &&
++ !(ifp->flags&IFA_F_TENTATIVE)) {
++ if (age >= ifp->prefered_lft - regen_advance) {
++ struct inet6_ifaddr *ifpub = ifp->ifpub;
++ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
++ next = ifp->tstamp + ifp->prefered_lft * HZ;
++ if (!ifp->regen_count && ifpub) {
++ ifp->regen_count++;
++ in6_ifa_hold(ifp);
++ in6_ifa_hold(ifpub);
++ spin_unlock(&ifp->lock);
++ write_unlock(&addrconf_hash_lock);
++ ipv6_create_tempaddr(ifpub, ifp);
++ in6_ifa_put(ifpub);
++ in6_ifa_put(ifp);
++ goto restart;
++ }
++ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
++ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
++ spin_unlock(&ifp->lock);
++#endif
+ } else {
+ /* ifp->prefered_lft <= ifp->valid_lft */
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+@@ -2106,7 +2567,7 @@
+
+ switch (event) {
+ case RTM_NEWADDR:
+- ip6_rt_addr_add(&ifp->addr, ifp->idev->dev);
++ ip6_rt_addr_add(&ifp->addr, ifp->idev->dev, 0);
+ break;
+ case RTM_DELADDR:
+ addrconf_leave_solict(ifp->idev->dev, &ifp->addr);
+@@ -2157,7 +2618,7 @@
+ static struct addrconf_sysctl_table
+ {
+ struct ctl_table_header *sysctl_header;
+- ctl_table addrconf_vars[11];
++ ctl_table addrconf_vars[16];
+ ctl_table addrconf_dev[2];
+ ctl_table addrconf_conf_dir[2];
+ ctl_table addrconf_proto_dir[2];
+@@ -2204,6 +2665,28 @@
+ &ipv6_devconf.rtr_solicit_delay, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
+
++#ifdef CONFIG_IPV6_PRIVACY
++ {NET_IPV6_USE_TEMPADDR, "use_tempaddr",
++ &ipv6_devconf.use_tempaddr, sizeof(int), 0644, NULL,
++ &proc_dointvec},
++
++ {NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft",
++ &ipv6_devconf.temp_valid_lft, sizeof(int), 0644, NULL,
++ &proc_dointvec},
++
++ {NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft",
++ &ipv6_devconf.temp_prefered_lft, sizeof(int), 0644, NULL,
++ &proc_dointvec},
++
++ {NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry",
++ &ipv6_devconf.regen_max_retry, sizeof(int), 0644, NULL,
++ &proc_dointvec},
++
++ {NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor",
++ &ipv6_devconf.max_desync_factor, sizeof(int), 0644, NULL,
++ &proc_dointvec},
++#endif
++
+ {0}},
+
+ {{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, addrconf_sysctl.addrconf_vars},{0}},
+@@ -2222,7 +2705,7 @@
+ if (t == NULL)
+ return;
+ memcpy(t, &addrconf_sysctl, sizeof(*t));
+- for (i=0; i<sizeof(t->addrconf_vars)/sizeof(t->addrconf_vars[0])-1; i++) {
++ for (i=0; t->addrconf_vars[i].data; i++) {
+ t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf;
+ t->addrconf_vars[i].de = NULL;
+ t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
+@@ -2285,7 +2768,16 @@
+ {
+ #ifdef MODULE
+ struct net_device *dev;
++#endif
+
++#ifdef CONFIG_IPV6_PRIVACY
++ md5_tfm = crypto_alloc_tfm("md5", 0);
++ if (unlikely(md5_tfm == NULL))
++ printk(KERN_WARNING
++ "failed to load transform for md5\n");
++#endif
++
++#ifdef MODULE
+ /* This takes sense only during module load. */
+ rtnl_lock();
+ for (dev = dev_base; dev; dev = dev->next) {
+@@ -2370,6 +2862,13 @@
+ del_timer(&addr_chk_timer);
+
+ rtnl_unlock();
++
++#ifdef CONFIG_IPV6_PRIVACY
++ if (likely(md5_tfm != NULL)) {
++ crypto_free_tfm(md5_tfm);
++ md5_tfm = NULL;
++ }
++#endif
+
+ #ifdef CONFIG_PROC_FS
+ proc_net_remove("if_inet6");
+diff -Nru a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+--- a/net/ipv6/af_inet6.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/af_inet6.c 2005-02-13 21:25:09 +11:00
+@@ -58,6 +58,9 @@
+ #include <net/transp_v6.h>
+ #include <net/ip6_route.h>
+ #include <net/addrconf.h>
++#ifdef CONFIG_IPV6_TUNNEL
++#include <net/ip6_tunnel.h>
++#endif
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -181,7 +184,7 @@
+ /* Init the ipv4 part of the socket since we can have sockets
+ * using v6 API for ipv4.
+ */
+- sk->protinfo.af_inet.ttl = 64;
++ sk->protinfo.af_inet.uc_ttl = -1;
+
+ sk->protinfo.af_inet.mc_loop = 1;
+ sk->protinfo.af_inet.mc_ttl = 1;
+@@ -651,6 +654,11 @@
+ */
+ inet6_register_protosw(&rawv6_protosw);
+
++ /* Register the family here so that the init calls below will
++ * be able to create sockets. (?? is this dangerous ??)
++ */
++ (void) sock_register(&inet6_family_ops);
++
+ /*
+ * ipngwg API draft makes clear that the correct semantics
+ * for TCP and UDP is to consider one TCP and UDP instance
+@@ -667,6 +675,11 @@
+ err = ndisc_init(&inet6_family_ops);
+ if (err)
+ goto ndisc_fail;
++#ifdef CONFIG_IPV6_TUNNEL
++ err = ip6_tunnel_init();
++ if (err)
++ goto ip6_tunnel_fail;
++#endif
+ err = igmp6_init(&inet6_family_ops);
+ if (err)
+ goto igmp_fail;
+@@ -692,15 +705,17 @@
+ ip6_flowlabel_init();
+ addrconf_init();
+ sit_init();
++
++ /* Init v6 extension headers. */
++ ipv6_rthdr_init();
+ ipv6_frag_init();
++ ipv6_nodata_init();
++ ipv6_destopt_init();
+
+ /* Init v6 transport protocols. */
+ udpv6_init();
+ tcpv6_init();
+
+- /* Now the userspace is allowed to create INET6 sockets. */
+- (void) sock_register(&inet6_family_ops);
+-
+ return 0;
+
+ #ifdef CONFIG_PROC_FS
+@@ -718,6 +733,10 @@
+ igmp6_cleanup();
+ #endif
+ igmp_fail:
++#ifdef CONFIG_IPV6_TUNNEL
++ ip6_tunnel_cleanup();
++ip6_tunnel_fail:
++#endif
+ ndisc_cleanup();
+ ndisc_fail:
+ icmpv6_cleanup();
+@@ -751,6 +770,9 @@
+ ip6_route_cleanup();
+ ipv6_packet_cleanup();
+ igmp6_cleanup();
++#ifdef CONFIG_IPV6_TUNNEL
++ ip6_tunnel_cleanup();
++#endif
+ ndisc_cleanup();
+ icmpv6_cleanup();
+ #ifdef CONFIG_SYSCTL
+diff -Nru a/net/ipv6/ah6.c b/net/ipv6/ah6.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/ah6.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,481 @@
++/*
++ * Copyright (C)2002 USAGI/WIDE Project
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ * Authors
++ *
++ * Mitsuru KANDA @USAGI : IPv6 Support
++ * Kazunori MIYAZAWA @USAGI :
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ *
++ * This file is derived from net/ipv4/ah.c.
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/ah.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <linux/string.h>
++#include <net/icmp.h>
++#include <net/ipv6.h>
++#include <net/xfrm.h>
++#include <asm/scatterlist.h>
++
++static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
++{
++ u8 *opt = (u8 *)opthdr;
++ int len = ipv6_optlen(opthdr);
++ int off = 0;
++ int optlen = 0;
++
++ off += 2;
++ len -= 2;
++
++ while (len > 0) {
++
++ switch (opt[off]) {
++
++ case IPV6_TLV_PAD0:
++ optlen = 1;
++ break;
++ default:
++ if (len < 2)
++ goto bad;
++ optlen = opt[off+1]+2;
++ if (len < optlen)
++ goto bad;
++ if (opt[off] & 0x20)
++ memset(&opt[off+2], 0, opt[off+1]);
++ break;
++ }
++
++ off += optlen;
++ len -= optlen;
++ }
++ if (len == 0)
++ return 1;
++
++bad:
++ return 0;
++}
++
++/**
++ * ipv6_rearrange_rthdr - rearrange IPv6 routing header
++ * @iph: IPv6 header
++ * @rthdr: routing header
++ *
++ * Rearrange the destination address in @iph and the addresses in @rthdr
++ * so that they appear in the order they will at the final destination.
++ * See Appendix A2 of RFC 2402 for details.
++ */
++static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
++{
++ int segments, segments_left;
++ struct in6_addr *addrs;
++ struct in6_addr final_addr;
++
++ segments_left = rthdr->segments_left;
++ if (segments_left == 0)
++ return;
++ rthdr->segments_left = 0;
++
++ /* The value of rthdr->hdrlen has been verified either by the system
++ * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming
++ * packets. So we can assume that it is even and that segments is
++ * greater than or equal to segments_left.
++ *
++ * For the same reason we can assume that this option is of type 0.
++ */
++ segments = rthdr->hdrlen >> 1;
++
++ addrs = ((struct rt0_hdr *)rthdr)->addr;
++ ipv6_addr_copy(&final_addr, addrs + segments - 1);
++
++ addrs += segments - segments_left;
++ memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs));
++
++ ipv6_addr_copy(addrs, &iph->daddr);
++ ipv6_addr_copy(&iph->daddr, &final_addr);
++}
++
++static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len)
++{
++ union {
++ struct ipv6hdr *iph;
++ struct ipv6_opt_hdr *opth;
++ struct ipv6_rt_hdr *rth;
++ char *raw;
++ } exthdr = { .iph = iph };
++ char *end = exthdr.raw + len;
++ int nexthdr = iph->nexthdr;
++
++ exthdr.iph++;
++
++ while (exthdr.raw < end) {
++ switch (nexthdr) {
++ case NEXTHDR_HOP:
++ case NEXTHDR_DEST:
++ if (!zero_out_mutable_opts(exthdr.opth)) {
++ if (net_ratelimit())
++ printk(KERN_WARNING "overrun %sopts\n",
++ nexthdr == NEXTHDR_HOP ?
++ "hop" : "dest");
++ return -EINVAL;
++ }
++ break;
++
++ case NEXTHDR_ROUTING:
++ ipv6_rearrange_rthdr(iph, exthdr.rth);
++ break;
++
++ default :
++ return 0;
++ }
++
++ nexthdr = exthdr.opth->nexthdr;
++ exthdr.raw += ipv6_optlen(exthdr.opth);
++ }
++
++ return 0;
++}
++
++static int ah6_output(struct sk_buff *skb)
++{
++ int err;
++ int extlen;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct ipv6hdr *top_iph;
++ struct ip_auth_hdr *ah;
++ struct ah_data *ahp;
++ u8 nexthdr;
++ char tmp_base[8];
++ struct {
++ struct in6_addr daddr;
++ char hdrs[0];
++ } *tmp_ext;
++
++ top_iph = (struct ipv6hdr *)skb->data;
++ top_iph->payload_len = htons(skb->len - sizeof(*top_iph));
++
++ nexthdr = *skb->nh.raw;
++ *skb->nh.raw = IPPROTO_AH;
++
++ /* When there are no extension headers, we only need to save the first
++ * 8 bytes of the base IP header.
++ */
++ memcpy(tmp_base, top_iph, sizeof(tmp_base));
++
++ tmp_ext = NULL;
++ extlen = skb->h.raw - (unsigned char *)(top_iph + 1);
++ if (extlen) {
++ extlen += sizeof(*tmp_ext);
++ tmp_ext = kmalloc(extlen, GFP_ATOMIC);
++ if (!tmp_ext) {
++ err = -ENOMEM;
++ goto error;
++ }
++ memcpy(tmp_ext, &top_iph->daddr, extlen);
++ err = ipv6_clear_mutable_options(top_iph,
++ extlen - sizeof(*tmp_ext) +
++ sizeof(*top_iph));
++ if (err)
++ goto error_free_iph;
++ }
++
++ ah = (struct ip_auth_hdr *)skb->h.raw;
++ ah->nexthdr = nexthdr;
++
++ top_iph->priority = 0;
++ top_iph->flow_lbl[0] = 0;
++ top_iph->flow_lbl[1] = 0;
++ top_iph->flow_lbl[2] = 0;
++ top_iph->hop_limit = 0;
++
++ ahp = x->data;
++ ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) +
++ ahp->icv_trunc_len) >> 2) - 2;
++
++ ah->reserved = 0;
++ ah->spi = x->id.spi;
++ ah->seq_no = htonl(++x->replay.oseq);
++ ahp->icv(ahp, skb, ah->auth_data);
++
++ err = 0;
++
++ memcpy(top_iph, tmp_base, sizeof(tmp_base));
++ if (tmp_ext) {
++ memcpy(&top_iph->daddr, tmp_ext, extlen);
++error_free_iph:
++ kfree(tmp_ext);
++ }
++
++error:
++ return err;
++}
++
++static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ /*
++ * Before process AH
++ * [IPv6][Ext1][Ext2][AH][Dest][Payload]
++ * |<-------------->| hdr_len
++ *
++ * To erase AH:
++ * Keeping copy of cleared headers. After AH processing,
++ * Moving the pointer of skb->nh.raw by using skb_pull as long as AH
++ * header length. Then copy back the copy as long as hdr_len
++ * If destination header following AH exists, copy it into after [Ext2].
++ *
++ * |<>|[IPv6][Ext1][Ext2][Dest][Payload]
++ * There is offset of AH before IPv6 header after the process.
++ */
++
++ struct ipv6_auth_hdr *ah;
++ struct ah_data *ahp;
++ unsigned char *tmp_hdr = NULL;
++ u16 hdr_len;
++ u16 ah_hlen;
++ int nexthdr;
++
++ if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
++ goto out;
++
++ /* We are going to _remove_ AH header to keep sockets happy,
++ * so... Later this can change. */
++ if (skb_cloned(skb) &&
++ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ goto out;
++
++ hdr_len = skb->data - skb->nh.raw;
++ ah = (struct ipv6_auth_hdr*)skb->data;
++ ahp = x->data;
++ nexthdr = ah->nexthdr;
++ ah_hlen = (ah->hdrlen + 2) << 2;
++
++ if (ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_full_len) &&
++ ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len))
++ goto out;
++
++ if (!pskb_may_pull(skb, ah_hlen))
++ goto out;
++
++ tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
++ if (!tmp_hdr)
++ goto out;
++ memcpy(tmp_hdr, skb->nh.raw, hdr_len);
++ if (ipv6_clear_mutable_options(skb->nh.ipv6h, hdr_len))
++ goto free_out;
++ skb->nh.ipv6h->priority = 0;
++ skb->nh.ipv6h->flow_lbl[0] = 0;
++ skb->nh.ipv6h->flow_lbl[1] = 0;
++ skb->nh.ipv6h->flow_lbl[2] = 0;
++ skb->nh.ipv6h->hop_limit = 0;
++
++ {
++ u8 auth_data[MAX_AH_AUTH_LEN];
++
++ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
++ memset(ah->auth_data, 0, ahp->icv_trunc_len);
++ skb_push(skb, skb->data - skb->nh.raw);
++ ahp->icv(ahp, skb, ah->auth_data);
++ if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
++ if (net_ratelimit())
++ printk(KERN_WARNING "ipsec ah authentication error\n");
++ x->stats.integrity_failed++;
++ goto free_out;
++ }
++ }
++
++ skb->nh.raw = skb_pull(skb, ah_hlen);
++ memcpy(skb->nh.raw, tmp_hdr, hdr_len);
++ skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++ skb_pull(skb, hdr_len);
++ skb->h.raw = skb->data;
++
++
++ kfree(tmp_hdr);
++
++ return nexthdr;
++
++free_out:
++ kfree(tmp_hdr);
++out:
++ return -EINVAL;
++}
++
++static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info)
++{
++ struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
++ struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset);
++ struct xfrm_state *x;
++
++ if (type != ICMPV6_DEST_UNREACH &&
++ type != ICMPV6_PKT_TOOBIG)
++ return;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6);
++ if (!x)
++ return;
++
++ printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
++ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
++ ntohl(ah->spi), NIP6(iph->daddr));
++
++ xfrm_state_put(x);
++}
++
++static int ah6_init_state(struct xfrm_state *x, void *args)
++{
++ struct ah_data *ahp = NULL;
++ struct xfrm_algo_desc *aalg_desc;
++
++ if (!x->aalg)
++ goto error;
++
++ /* null auth can use a zero length key */
++ if (x->aalg->alg_key_len > 512)
++ goto error;
++
++ if (x->encap)
++ goto error;
++
++ ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
++ if (ahp == NULL)
++ return -ENOMEM;
++
++ memset(ahp, 0, sizeof(*ahp));
++
++ ahp->key = x->aalg->alg_key;
++ ahp->key_len = (x->aalg->alg_key_len+7)/8;
++ ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
++ if (!ahp->tfm)
++ goto error;
++ ahp->icv = ah_hmac_digest;
++
++ /*
++ * Lookup the algorithm description maintained by xfrm_algo,
++ * verify crypto transform properties, and store information
++ * we need for AH processing. This lookup cannot fail here
++ * after a successful crypto_alloc_tfm().
++ */
++ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name);
++ BUG_ON(!aalg_desc);
++
++ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
++ crypto_tfm_alg_digestsize(ahp->tfm)) {
++ printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
++ x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
++ aalg_desc->uinfo.auth.icv_fullbits/8);
++ goto error;
++ }
++
++ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
++ ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
++
++ BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
++
++ ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
++ if (!ahp->work_icv)
++ goto error;
++
++ x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len);
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct ipv6hdr);
++ x->data = ahp;
++
++ return 0;
++
++error:
++ if (ahp) {
++ if (ahp->work_icv)
++ kfree(ahp->work_icv);
++ if (ahp->tfm)
++ crypto_free_tfm(ahp->tfm);
++ kfree(ahp);
++ }
++ return -EINVAL;
++}
++
++static void ah6_destroy(struct xfrm_state *x)
++{
++ struct ah_data *ahp = x->data;
++
++ if (!ahp)
++ return;
++
++ if (ahp->work_icv) {
++ kfree(ahp->work_icv);
++ ahp->work_icv = NULL;
++ }
++ if (ahp->tfm) {
++ crypto_free_tfm(ahp->tfm);
++ ahp->tfm = NULL;
++ }
++ kfree(ahp);
++}
++
++static struct xfrm_type ah6_type =
++{
++ .description = "AH6",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_AH,
++ .init_state = ah6_init_state,
++ .destructor = ah6_destroy,
++ .input = ah6_input,
++ .output = ah6_output
++};
++
++static struct inet6_protocol ah6_protocol = {
++ .handler = xfrm6_rcv,
++ .err_handler = ah6_err,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++static int __init ah6_init(void)
++{
++ if (xfrm_register_type(&ah6_type, AF_INET6) < 0) {
++ printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++
++ if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) {
++ printk(KERN_INFO "ipv6 ah init: can't add protocol\n");
++ xfrm_unregister_type(&ah6_type, AF_INET6);
++ return -EAGAIN;
++ }
++
++ return 0;
++}
++
++static void __exit ah6_fini(void)
++{
++ if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0)
++ printk(KERN_INFO "ipv6 ah close: can't remove protocol\n");
++
++ if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
++ printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n");
++
++}
++
++module_init(ah6_init);
++module_exit(ah6_fini);
++
++MODULE_LICENSE("GPL");
+diff -Nru a/net/ipv6/anycast.c b/net/ipv6/anycast.c
+--- a/net/ipv6/anycast.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv6/anycast.c 2005-02-13 21:25:10 +11:00
+@@ -95,7 +95,6 @@
+ return onlink;
+ }
+
+-
+ /*
+ * socket join an anycast group
+ */
+@@ -109,8 +108,12 @@
+ int ishost = !ipv6_devconf.forwarding;
+ int err = 0;
+
++ if (!capable(CAP_NET_ADMIN))
++ return -EPERM;
+ if (ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST)
+ return -EINVAL;
++ if (ipv6_chk_addr(addr, NULL))
++ return -EINVAL;
+
+ pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
+ if (pac == NULL)
+@@ -160,21 +163,12 @@
+ * For hosts, allow link-local or matching prefix anycasts.
+ * This obviates the need for propagating anycast routes while
+ * still allowing some non-router anycast participation.
+- *
+- * allow anyone to join anycasts that don't require a special route
+- * and can't be spoofs of unicast addresses (reserved anycast only)
+ */
+ if (!ip6_onlink(addr, dev)) {
+ if (ishost)
+ err = -EADDRNOTAVAIL;
+- else if (!capable(CAP_NET_ADMIN))
+- err = -EPERM;
+ if (err)
+ goto out_dev_put;
+- } else if (!(ipv6_addr_type(addr) & IPV6_ADDR_ANYCAST) &&
+- !capable(CAP_NET_ADMIN)) {
+- err = -EPERM;
+- goto out_dev_put;
+ }
+
+ err = ipv6_dev_ac_inc(dev, addr);
+@@ -265,6 +259,13 @@
+ dev_put(dev);
+ }
+
++#if 0
++/* The function is not used, which is funny. Apparently, author
++ * supposed to use it to filter out datagrams inside udp/raw but forgot.
++ *
++ * It is OK, anycasts are not special comparing to delivery to unicasts.
++ */
++
+ int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex)
+ {
+ struct ipv6_ac_socklist *pac;
+@@ -285,6 +286,8 @@
+ return found;
+ }
+
++#endif
++
+ static void aca_put(struct ifacaddr6 *ac)
+ {
+ if (atomic_dec_and_test(&ac->aca_refcnt)) {
+@@ -346,7 +349,7 @@
+ idev->ac_list = aca;
+ write_unlock_bh(&idev->lock);
+
+- ip6_rt_addr_add(&aca->aca_addr, dev);
++ ip6_rt_addr_add(&aca->aca_addr, dev, 1);
+
+ addrconf_join_solict(dev, &aca->aca_addr);
+
+diff -Nru a/net/ipv6/datagram.c b/net/ipv6/datagram.c
+--- a/net/ipv6/datagram.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/datagram.c 2005-02-13 21:25:09 +11:00
+@@ -78,7 +78,7 @@
+
+ iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr));
+ skb->nh.ipv6h = iph;
+- memcpy(&iph->daddr, fl->fl6_dst, 16);
++ ipv6_addr_copy(&iph->daddr, &fl->fl6_dst);
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+@@ -89,7 +89,7 @@
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+- serr->port = fl->uli_u.ports.dport;
++ serr->port = fl->fl_ip_dport;
+
+ skb->h.raw = skb->tail;
+ __skb_pull(skb, skb->tail - skb->data);
+@@ -289,7 +289,8 @@
+ goto exit_f;
+ }
+
+- fl->fl6_src = &src_info->ipi6_addr;
++ ipv6_addr_copy(&fl->fl6_src,
++ &src_info->ipi6_addr);
+ }
+
+ break;
+diff -Nru a/net/ipv6/esp6.c b/net/ipv6/esp6.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/esp6.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,432 @@
++/*
++ * Copyright (C)2002 USAGI/WIDE Project
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ * Authors
++ *
++ * Mitsuru KANDA @USAGI : IPv6 Support
++ * Kazunori MIYAZAWA @USAGI :
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ *
++ * This file is derived from net/ipv4/esp.c
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/esp.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <linux/random.h>
++#include <net/icmp.h>
++#include <net/ipv6.h>
++#include <linux/icmpv6.h>
++
++static int esp6_output(struct sk_buff *skb)
++{
++ int err;
++ int hdr_len;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct ipv6hdr *top_iph;
++ struct ipv6_esp_hdr *esph;
++ struct crypto_tfm *tfm;
++ struct esp_data *esp;
++ struct sk_buff *trailer;
++ int blksize;
++ int clen;
++ int alen;
++ int nfrags;
++
++ esp = x->data;
++ hdr_len = skb->h.raw - skb->data +
++ sizeof(*esph) + esp->conf.ivlen;
++
++ /* Strip IP+ESP header. */
++ __skb_pull(skb, hdr_len);
++
++ /* Now skb is pure payload to encrypt */
++ err = -ENOMEM;
++
++ /* Round to block size */
++ clen = skb->len;
++
++ alen = esp->auth.icv_trunc_len;
++ tfm = esp->conf.tfm;
++ blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
++ clen = (clen + 2 + blksize-1)&~(blksize-1);
++ if (esp->conf.padlen)
++ clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
++
++ if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) {
++ goto error;
++ }
++
++ /* Fill padding... */
++ do {
++ int i;
++ for (i=0; i<clen-skb->len - 2; i++)
++ *(u8*)(trailer->tail + i) = i+1;
++ } while (0);
++ *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
++ pskb_put(skb, trailer, clen - skb->len);
++
++ top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len);
++ esph = (struct ipv6_esp_hdr *)skb->h.raw;
++ top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph));
++ *(u8*)(trailer->tail - 1) = *skb->nh.raw;
++ *skb->nh.raw = IPPROTO_ESP;
++
++ esph->spi = x->id.spi;
++ esph->seq_no = htonl(++x->replay.oseq);
++
++ if (esp->conf.ivlen)
++ crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++
++ do {
++ struct scatterlist *sg = &esp->sgbuf[0];
++
++ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
++ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
++ if (!sg)
++ goto error;
++ }
++ skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
++ crypto_cipher_encrypt(tfm, sg, sg, clen);
++ if (unlikely(sg != &esp->sgbuf[0]))
++ kfree(sg);
++ } while (0);
++
++ if (esp->conf.ivlen) {
++ memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++ crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
++ }
++
++ if (esp->auth.icv_full_len) {
++ esp->auth.icv(esp, skb, (u8*)esph-skb->data,
++ sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
++ pskb_put(skb, trailer, alen);
++ }
++
++ err = 0;
++
++error:
++ return err;
++}
++
++static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ struct ipv6hdr *iph;
++ struct ipv6_esp_hdr *esph;
++ struct esp_data *esp = x->data;
++ struct sk_buff *trailer;
++ int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
++ int alen = esp->auth.icv_trunc_len;
++ int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen;
++
++ int hdr_len = skb->h.raw - skb->nh.raw;
++ int nfrags;
++ unsigned char *tmp_hdr = NULL;
++ int ret = 0;
++
++ if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) {
++ ret = -EINVAL;
++ goto out_nofree;
++ }
++
++ if (elen <= 0 || (elen & (blksize-1))) {
++ ret = -EINVAL;
++ goto out_nofree;
++ }
++
++ tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
++ if (!tmp_hdr) {
++ ret = -ENOMEM;
++ goto out_nofree;
++ }
++ memcpy(tmp_hdr, skb->nh.raw, hdr_len);
++
++ /* If integrity check is required, do this. */
++ if (esp->auth.icv_full_len) {
++ u8 sum[esp->auth.icv_full_len];
++ u8 sum1[alen];
++
++ esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
++
++ if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
++ BUG();
++
++ if (unlikely(memcmp(sum, sum1, alen))) {
++ x->stats.integrity_failed++;
++ ret = -EINVAL;
++ goto out;
++ }
++ }
++
++ if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ esph = (struct ipv6_esp_hdr*)skb->data;
++ iph = skb->nh.ipv6h;
++
++ /* Get ivec. This can be wrong, check against another impls. */
++ if (esp->conf.ivlen)
++ crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
++
++ {
++ u8 nexthdr[2];
++ struct scatterlist *sg = &esp->sgbuf[0];
++ u8 padlen;
++
++ if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
++ sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
++ if (!sg) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ }
++ skb_to_sgvec(skb, sg, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen, elen);
++ crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
++ if (unlikely(sg != &esp->sgbuf[0]))
++ kfree(sg);
++
++ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
++ BUG();
++
++ padlen = nexthdr[0];
++ if (padlen+2 >= elen) {
++ if (net_ratelimit()) {
++ printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
++ }
++ ret = -EINVAL;
++ goto out;
++ }
++ /* ... check padding bits here. Silly. :-) */
++
++ pskb_trim(skb, skb->len - alen - padlen - 2);
++ skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen);
++ skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
++ memcpy(skb->nh.raw, tmp_hdr, hdr_len);
++ skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++ ret = nexthdr[1];
++ }
++
++out:
++ kfree(tmp_hdr);
++out_nofree:
++ return ret;
++}
++
++static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
++{
++ struct esp_data *esp = x->data;
++ u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
++
++ if (x->props.mode) {
++ mtu = (mtu + 2 + blksize-1)&~(blksize-1);
++ } else {
++ /* The worst case. */
++ mtu += 2 + blksize;
++ }
++ if (esp->conf.padlen)
++ mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
++
++ return mtu + x->props.header_len + esp->auth.icv_full_len;
++}
++
++static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info)
++{
++ struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
++ struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset);
++ struct xfrm_state *x;
++
++ if (type != ICMPV6_DEST_UNREACH &&
++ type != ICMPV6_PKT_TOOBIG)
++ return;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6);
++ if (!x)
++ return;
++ printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/"
++ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
++ ntohl(esph->spi), NIP6(iph->daddr));
++ xfrm_state_put(x);
++}
++
++static void esp6_destroy(struct xfrm_state *x)
++{
++ struct esp_data *esp = x->data;
++
++ if (!esp)
++ return;
++
++ if (esp->conf.tfm) {
++ crypto_free_tfm(esp->conf.tfm);
++ esp->conf.tfm = NULL;
++ }
++ if (esp->conf.ivec) {
++ kfree(esp->conf.ivec);
++ esp->conf.ivec = NULL;
++ }
++ if (esp->auth.tfm) {
++ crypto_free_tfm(esp->auth.tfm);
++ esp->auth.tfm = NULL;
++ }
++ if (esp->auth.work_icv) {
++ kfree(esp->auth.work_icv);
++ esp->auth.work_icv = NULL;
++ }
++ kfree(esp);
++}
++
++static int esp6_init_state(struct xfrm_state *x, void *args)
++{
++ struct esp_data *esp = NULL;
++
++ /* null auth and encryption can have zero length keys */
++ if (x->aalg) {
++ if (x->aalg->alg_key_len > 512)
++ goto error;
++ }
++ if (x->ealg == NULL)
++ goto error;
++
++ if (x->encap)
++ goto error;
++
++ esp = kmalloc(sizeof(*esp), GFP_KERNEL);
++ if (esp == NULL)
++ return -ENOMEM;
++
++ memset(esp, 0, sizeof(*esp));
++
++ if (x->aalg) {
++ struct xfrm_algo_desc *aalg_desc;
++
++ esp->auth.key = x->aalg->alg_key;
++ esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
++ esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
++ if (esp->auth.tfm == NULL)
++ goto error;
++ esp->auth.icv = esp_hmac_digest;
++
++ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name);
++ BUG_ON(!aalg_desc);
++
++ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
++ crypto_tfm_alg_digestsize(esp->auth.tfm)) {
++ printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
++ x->aalg->alg_name,
++ crypto_tfm_alg_digestsize(esp->auth.tfm),
++ aalg_desc->uinfo.auth.icv_fullbits/8);
++ goto error;
++ }
++
++ esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
++ esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
++
++ esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
++ if (!esp->auth.work_icv)
++ goto error;
++ }
++ esp->conf.key = x->ealg->alg_key;
++ esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
++ if (x->props.ealgo == SADB_EALG_NULL)
++ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
++ else
++ esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
++ if (esp->conf.tfm == NULL)
++ goto error;
++ esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
++ esp->conf.padlen = 0;
++ if (esp->conf.ivlen) {
++ esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
++ if (unlikely(esp->conf.ivec == NULL))
++ goto error;
++ get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
++ }
++ crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len);
++ x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct ipv6hdr);
++ x->data = esp;
++ return 0;
++
++error:
++ if (esp) {
++ if (esp->auth.tfm)
++ crypto_free_tfm(esp->auth.tfm);
++ if (esp->auth.work_icv)
++ kfree(esp->auth.work_icv);
++ if (esp->conf.tfm)
++ crypto_free_tfm(esp->conf.tfm);
++ kfree(esp);
++ }
++ return -EINVAL;
++}
++
++static struct xfrm_type esp6_type =
++{
++ .description = "ESP6",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_ESP,
++ .init_state = esp6_init_state,
++ .destructor = esp6_destroy,
++ .get_max_size = esp6_get_max_size,
++ .input = esp6_input,
++ .output = esp6_output
++};
++
++static struct inet6_protocol esp6_protocol = {
++ .handler = xfrm6_rcv,
++ .err_handler = esp6_err,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++static int __init esp6_init(void)
++{
++ if (xfrm_register_type(&esp6_type, AF_INET6) < 0) {
++ printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) {
++ printk(KERN_INFO "ipv6 esp init: can't add protocol\n");
++ xfrm_unregister_type(&esp6_type, AF_INET6);
++ return -EAGAIN;
++ }
++
++ return 0;
++}
++
++static void __exit esp6_fini(void)
++{
++ if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0)
++ printk(KERN_INFO "ipv6 esp close: can't remove protocol\n");
++ if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
++ printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n");
++}
++
++module_init(esp6_init);
++module_exit(esp6_fini);
++
++MODULE_LICENSE("GPL");
+diff -Nru a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
+--- a/net/ipv6/exthdrs.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/exthdrs.c 2005-02-13 21:25:09 +11:00
+@@ -18,6 +18,9 @@
+ /* Changes:
+ * yoshfuji : ensure not to overrun while parsing
+ * tlv options.
++ * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
++ * YOSHIFUJI Hideaki @USAGI Register inbound extention header
++ * handlers as inet6_protocol{}.
+ */
+
+ #include <linux/errno.h>
+@@ -44,20 +47,6 @@
+ #include <asm/uaccess.h>
+
+ /*
+- * Parsing inbound headers.
+- *
+- * Parsing function "func" returns offset wrt skb->nh of the place,
+- * where next nexthdr value is stored or NULL, if parsing
+- * failed. It should also update skb->h tp point at the next header.
+- */
+-
+-struct hdrtype_proc
+-{
+- int type;
+- int (*func) (struct sk_buff **, int offset);
+-};
+-
+-/*
+ * Parsing tlv encoded headers.
+ *
+ * Parsing function "func" returns 1, if parsing succeed
+@@ -164,9 +153,9 @@
+ {-1, NULL}
+ };
+
+-static int ipv6_dest_opt(struct sk_buff **skb_ptr, int nhoff)
++static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+ {
+- struct sk_buff *skb=*skb_ptr;
++ struct sk_buff *skb = *skbp;
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+
+ if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) ||
+@@ -179,29 +168,56 @@
+
+ if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
+ skb->h.raw += ((skb->h.raw[1]+1)<<3);
+- return opt->dst1;
++ *nhoffp = opt->dst1;
++ return 1;
+ }
+
+ return -1;
+ }
+
++static struct inet6_protocol destopt_protocol =
++{
++ .handler = ipv6_destopt_rcv,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++void __init ipv6_destopt_init(void)
++{
++ if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0)
++ printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n");
++}
++
+ /********************************
+ NONE header. No data in packet.
+ ********************************/
+
+-static int ipv6_nodata(struct sk_buff **skb_ptr, int nhoff)
++static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+ {
+- kfree_skb(*skb_ptr);
+- return -1;
++ struct sk_buff *skb = *skbp;
++
++ kfree_skb(skb);
++ return 0;
++}
++
++static struct inet6_protocol nodata_protocol =
++{
++ .handler = ipv6_nodata_rcv,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++void __init ipv6_nodata_init(void)
++{
++ if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0)
++ printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n");
+ }
+
+ /********************************
+ Routing header.
+ ********************************/
+
+-static int ipv6_routing_header(struct sk_buff **skb_ptr, int nhoff)
++static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+ {
+- struct sk_buff *skb = *skb_ptr;
++ struct sk_buff *skb = *skbp;
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ struct in6_addr *addr;
+ struct in6_addr daddr;
+@@ -232,7 +248,8 @@
+ skb->h.raw += (hdr->hdrlen + 1) << 3;
+ opt->dst0 = opt->dst1;
+ opt->dst1 = 0;
+- return (&hdr->nexthdr) - skb->nh.raw;
++ *nhoffp = (&hdr->nexthdr) - skb->nh.raw;
++ return 1;
+ }
+
+ if (hdr->type != IPV6_SRCRT_TYPE_0) {
+@@ -247,7 +264,7 @@
+
+ /*
+ * This is the routing header forwarding algorithm from
+- * RFC 1883, page 17.
++ * RFC 2460, page 16.
+ */
+
+ n = hdr->hdrlen >> 1;
+@@ -265,7 +282,7 @@
+ kfree_skb(skb);
+ if (skb2 == NULL)
+ return -1;
+- *skb_ptr = skb = skb2;
++ *skbp = skb = skb2;
+ opt = (struct inet6_skb_parm *)skb2->cb;
+ hdr = (struct ipv6_rt_hdr *) skb2->h.raw;
+ }
+@@ -294,7 +311,7 @@
+ ip6_route_input(skb);
+ if (skb->dst->error) {
+ skb_push(skb, skb->data - skb->nh.raw);
+- skb->dst->input(skb);
++ dst_input(skb);
+ return -1;
+ }
+
+@@ -310,10 +327,22 @@
+ }
+
+ skb_push(skb, skb->data - skb->nh.raw);
+- skb->dst->input(skb);
++ dst_input(skb);
+ return -1;
+ }
+
++static struct inet6_protocol rthdr_protocol =
++{
++ .handler = ipv6_rthdr_rcv,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++void __init ipv6_rthdr_init(void)
++{
++ if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0)
++ printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n");
++};
++
+ /*
+ This function inverts received rthdr.
+ NOTE: specs allow to make it automatically only if
+@@ -379,97 +408,6 @@
+ return opt;
+ }
+
+-/********************************
+- AUTH header.
+- ********************************/
+-
+-/*
+- rfc1826 said, that if a host does not implement AUTH header
+- it MAY ignore it. We use this hole 8)
+-
+- Actually, now we can implement OSPFv6 without kernel IPsec.
+- Authentication for poors may be done in user space with the same success.
+-
+- Yes, it means, that we allow application to send/receive
+- raw authentication header. Apparently, we suppose, that it knows
+- what it does and calculates authentication data correctly.
+- Certainly, it is possible only for udp and raw sockets, but not for tcp.
+-
+- AUTH header has 4byte granular length, which kills all the idea
+- behind AUTOMATIC 64bit alignment of IPv6. Now we will lose
+- cpu ticks, checking that sender did not something stupid
+- and opt->hdrlen is even. Shit! --ANK (980730)
+- */
+-
+-static int ipv6_auth_hdr(struct sk_buff **skb_ptr, int nhoff)
+-{
+- struct sk_buff *skb=*skb_ptr;
+- struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+- int len;
+-
+- if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8))
+- goto fail;
+-
+- /*
+- * RFC2402 2.2 Payload Length
+- * The 8-bit field specifies the length of AH in 32-bit words
+- * (4-byte units), minus "2".
+- * -- Noriaki Takamiya @USAGI Project
+- */
+- len = (skb->h.raw[1]+2)<<2;
+-
+- if (len&7)
+- goto fail;
+-
+- if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+len))
+- goto fail;
+-
+- opt->auth = skb->h.raw - skb->nh.raw;
+- skb->h.raw += len;
+- return opt->auth;
+-
+-fail:
+- kfree_skb(skb);
+- return -1;
+-}
+-
+-/* This list MUST NOT contain entry for NEXTHDR_HOP.
+- It is parsed immediately after packet received
+- and if it occurs somewhere in another place we must
+- generate error.
+- */
+-
+-struct hdrtype_proc hdrproc_lst[] = {
+- {NEXTHDR_FRAGMENT, ipv6_reassembly},
+- {NEXTHDR_ROUTING, ipv6_routing_header},
+- {NEXTHDR_DEST, ipv6_dest_opt},
+- {NEXTHDR_NONE, ipv6_nodata},
+- {NEXTHDR_AUTH, ipv6_auth_hdr},
+- /*
+- {NEXTHDR_ESP, ipv6_esp_hdr},
+- */
+- {-1, NULL}
+-};
+-
+-int ipv6_parse_exthdrs(struct sk_buff **skb_in, int nhoff)
+-{
+- struct hdrtype_proc *hdrt;
+- u8 nexthdr = (*skb_in)->nh.raw[nhoff];
+-
+-restart:
+- for (hdrt=hdrproc_lst; hdrt->type >= 0; hdrt++) {
+- if (hdrt->type == nexthdr) {
+- if ((nhoff = hdrt->func(skb_in, nhoff)) >= 0) {
+- nexthdr = (*skb_in)->nh.raw[nhoff];
+- goto restart;
+- }
+- return -1;
+- }
+- }
+- return nhoff;
+-}
+-
+-
+ /**********************************
+ Hop-by-hop options.
+ **********************************/
+@@ -501,7 +439,7 @@
+ }
+
+ pkt_len = ntohl(*(u32*)(skb->nh.raw+optoff+2));
+- if (pkt_len < 0x10000) {
++ if (pkt_len <= IPV6_MAXPLEN) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
+ return 0;
+ }
+diff -Nru a/net/ipv6/icmp.c b/net/ipv6/icmp.c
+--- a/net/ipv6/icmp.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/icmp.c 2005-02-13 21:25:09 +11:00
+@@ -26,6 +26,7 @@
+ * yoshfuji : ensure to sent parameter problem for
+ * fragments.
+ * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit.
++ * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data
+ */
+
+ #include <linux/module.h>
+@@ -74,17 +75,11 @@
+ #define icmpv6_socket __icmpv6_socket[smp_processor_id()]
+ #define icmpv6_socket_cpu(X) __icmpv6_socket[(X)]
+
+-int icmpv6_rcv(struct sk_buff *skb);
++static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+
+-static struct inet6_protocol icmpv6_protocol =
+-{
+- icmpv6_rcv, /* handler */
+- NULL, /* error control */
+- NULL, /* next */
+- IPPROTO_ICMPV6, /* protocol ID */
+- 0, /* copy */
+- NULL, /* data */
+- "ICMPv6" /* name */
++static struct inet6_protocol icmpv6_protocol = {
++ .handler = icmpv6_rcv,
++ .flags = INET6_PROTO_FINAL,
+ };
+
+ struct icmpv6_msg {
+@@ -116,40 +111,6 @@
+ spin_unlock_bh(&icmpv6_socket->sk->lock.slock);
+ }
+
+-/*
+- * getfrag callback
+- */
+-
+-static int icmpv6_getfrag(const void *data, struct in6_addr *saddr,
+- char *buff, unsigned int offset, unsigned int len)
+-{
+- struct icmpv6_msg *msg = (struct icmpv6_msg *) data;
+- struct icmp6hdr *icmph;
+- __u32 csum;
+-
+- if (offset) {
+- csum = skb_copy_and_csum_bits(msg->skb, msg->offset +
+- (offset - sizeof(struct icmp6hdr)),
+- buff, len, msg->csum);
+- msg->csum = csum;
+- return 0;
+- }
+-
+- csum = csum_partial_copy_nocheck((void *) &msg->icmph, buff,
+- sizeof(struct icmp6hdr), msg->csum);
+-
+- csum = skb_copy_and_csum_bits(msg->skb, msg->offset,
+- buff + sizeof(struct icmp6hdr),
+- len - sizeof(struct icmp6hdr), csum);
+-
+- icmph = (struct icmp6hdr *) buff;
+-
+- icmph->icmp6_cksum = csum_ipv6_magic(saddr, msg->daddr, msg->len,
+- IPPROTO_ICMPV6, csum);
+- return 0;
+-}
+-
+-
+ /*
+ * Slightly more convenient version of icmpv6_send.
+ */
+@@ -252,21 +213,74 @@
+ return (optval&0xC0) == 0x80;
+ }
+
++int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len)
++{
++ struct sk_buff *skb;
++ struct icmp6hdr *icmp6h;
++ int err = 0;
++
++ if ((skb = skb_peek(&sk->write_queue)) == NULL)
++ goto out;
++
++ icmp6h = (struct icmp6hdr*) skb->h.raw;
++ memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
++ icmp6h->icmp6_cksum = 0;
++
++ if (skb_queue_len(&sk->write_queue) == 1) {
++ skb->csum = csum_partial((char *)icmp6h,
++ sizeof(struct icmp6hdr), skb->csum);
++ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ len, fl->proto,
++ skb->csum);
++ } else {
++ u32 tmp_csum = 0;
++
++ skb_queue_walk(&sk->write_queue, skb) {
++ tmp_csum = csum_add(tmp_csum, skb->csum);
++ }
++
++ tmp_csum = csum_partial((char *)icmp6h,
++ sizeof(struct icmp6hdr), tmp_csum);
++ tmp_csum = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ len, fl->proto, tmp_csum);
++ icmp6h->icmp6_cksum = tmp_csum;
++ }
++ if (icmp6h->icmp6_cksum == 0)
++ icmp6h->icmp6_cksum = -1;
++ ip6_push_pending_frames(sk);
++out:
++ return err;
++}
++
++static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
++{
++ struct sk_buff *org_skb = (struct sk_buff *)from;
++ __u32 csum = 0;
++ csum = skb_copy_and_csum_bits(org_skb, offset, to, len, csum);
++ skb->csum = csum_block_add(skb->csum, csum, odd);
++ return 0;
++}
++
+ /*
+ * Send an ICMP message in response to a packet in error
+ */
+-
+ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
+ struct net_device *dev)
+ {
+ struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct sock *sk = icmpv6_socket->sk;
++ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *saddr = NULL;
+- int iif = 0;
+- struct icmpv6_msg msg;
++ struct dst_entry *dst;
++ struct icmp6hdr tmp_hdr;
+ struct flowi fl;
++ int iif = 0;
+ int addr_type = 0;
+- int len;
++ int len, plen;
++ int hlimit = -1;
++ int err = 0;
+
+ if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail)
+ return;
+@@ -324,13 +338,14 @@
+ return;
+ }
+
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_ICMPV6;
+- fl.nl_u.ip6_u.daddr = &hdr->saddr;
+- fl.nl_u.ip6_u.saddr = saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr);
++ if (saddr)
++ ipv6_addr_copy(&fl.fl6_src, saddr);
+ fl.oif = iif;
+- fl.fl6_flowlabel = 0;
+- fl.uli_u.icmpt.type = type;
+- fl.uli_u.icmpt.code = code;
++ fl.fl_icmp_type = type;
++ fl.fl_icmp_code = code;
+
+ if (icmpv6_xmit_lock())
+ return;
+@@ -338,37 +353,52 @@
+ if (!icmpv6_xrlim_allow(sk, type, &fl))
+ goto out;
+
+- /*
+- * ok. kick it. checksum will be provided by the
+- * getfrag_t callback.
+- */
++ tmp_hdr.icmp6_type = type;
++ tmp_hdr.icmp6_code = code;
++ tmp_hdr.icmp6_cksum = 0;
++ tmp_hdr.icmp6_pointer = htonl(info);
++
++ if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
++ fl.oif = np->mcast_oif;
+
+- msg.icmph.icmp6_type = type;
+- msg.icmph.icmp6_code = code;
+- msg.icmph.icmp6_cksum = 0;
+- msg.icmph.icmp6_pointer = htonl(info);
+-
+- msg.skb = skb;
+- msg.offset = skb->nh.raw - skb->data;
+- msg.csum = 0;
+- msg.daddr = &hdr->saddr;
++ err = ip6_dst_lookup(sk, &dst, &fl);
++ if (err)
++ goto out;
+
+- len = skb->len - msg.offset + sizeof(struct icmp6hdr);
+- len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr));
++ if (hlimit < 0) {
++ if (ipv6_addr_is_multicast(&fl.fl6_dst))
++ hlimit = np->mcast_hops;
++ else
++ hlimit = np->hop_limit;
++ if (hlimit < 0)
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
++ }
+
++ plen = skb->nh.raw - skb->data;
++ __skb_pull(skb, plen);
++ len = skb->len;
++ len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
+ if (len < 0) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp: len problem\n");
+- goto out;
++ __skb_push(skb, plen);
++ goto out_dst_release;
+ }
+
+- msg.len = len;
++ err = ip6_append_data(sk, icmpv6_getfrag, skb, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr),
++ hlimit, NULL, &fl, (struct rt6_info*)dst, MSG_DONTWAIT);
++ if (err) {
++ ip6_flush_pending_frames(sk);
++ goto out_dst_release;
++ }
++ err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr));
++ __skb_push(skb, plen);
+
+- ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1,
+- MSG_DONTWAIT);
+ if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
+ (&(icmpv6_statistics[smp_processor_id()*2].Icmp6OutDestUnreachs))[type-1]++;
+ ICMP6_INC_STATS_BH(Icmp6OutMsgs);
++out_dst_release:
++ dst_release(dst);
+ out:
+ icmpv6_xmit_unlock();
+ }
+@@ -376,45 +406,66 @@
+ static void icmpv6_echo_reply(struct sk_buff *skb)
+ {
+ struct sock *sk = icmpv6_socket->sk;
++ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct in6_addr *saddr = NULL;
+ struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
+- struct in6_addr *saddr;
+- struct icmpv6_msg msg;
++ struct icmp6hdr tmp_hdr;
+ struct flowi fl;
++ struct dst_entry *dst;
++ int err = 0;
++ int hlimit = -1;
+
+ saddr = &skb->nh.ipv6h->daddr;
+
+- if (ipv6_addr_type(saddr) & IPV6_ADDR_MULTICAST ||
+- ipv6_chk_acast_addr(0, saddr))
++ if (!ipv6_unicast_destination(skb))
+ saddr = NULL;
+
+- msg.icmph.icmp6_type = ICMPV6_ECHO_REPLY;
+- msg.icmph.icmp6_code = 0;
+- msg.icmph.icmp6_cksum = 0;
+- msg.icmph.icmp6_identifier = icmph->icmp6_identifier;
+- msg.icmph.icmp6_sequence = icmph->icmp6_sequence;
+-
+- msg.skb = skb;
+- msg.offset = 0;
+- msg.csum = 0;
+- msg.len = skb->len + sizeof(struct icmp6hdr);
+- msg.daddr = &skb->nh.ipv6h->saddr;
++ memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
++ tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
+
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_ICMPV6;
+- fl.nl_u.ip6_u.daddr = msg.daddr;
+- fl.nl_u.ip6_u.saddr = saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
++ if (saddr)
++ ipv6_addr_copy(&fl.fl6_src, saddr);
+ fl.oif = skb->dev->ifindex;
+- fl.fl6_flowlabel = 0;
+- fl.uli_u.icmpt.type = ICMPV6_ECHO_REPLY;
+- fl.uli_u.icmpt.code = 0;
++ fl.fl_icmp_type = ICMPV6_ECHO_REPLY;
+
+ if (icmpv6_xmit_lock())
+ return;
+
+- ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, msg.len, NULL, -1,
+- MSG_DONTWAIT);
+- ICMP6_INC_STATS_BH(Icmp6OutEchoReplies);
+- ICMP6_INC_STATS_BH(Icmp6OutMsgs);
++ if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
++ fl.oif = np->mcast_oif;
++
++ err = ip6_dst_lookup(sk, &dst, &fl);
++ if (err)
++ goto out;
+
++ if (hlimit < 0) {
++ if (ipv6_addr_is_multicast(&fl.fl6_dst))
++ hlimit = np->mcast_hops;
++ else
++ hlimit = np->hop_limit;
++ if (hlimit < 0)
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
++ }
++
++ err = ip6_append_data(sk, icmpv6_getfrag, skb, skb->len + sizeof(struct icmp6hdr),
++ sizeof(struct icmp6hdr), hlimit, NULL, &fl,
++ (struct rt6_info*)dst, MSG_DONTWAIT);
++
++ if (err) {
++ ip6_flush_pending_frames(sk);
++ goto out_dst_release;
++ }
++ err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
++
++ ICMP6_INC_STATS_BH(Icmp6OutEchoReplies);
++ ICMP6_INC_STATS_BH(Icmp6OutMsgs);
++
++out_dst_release:
++ dst_release(dst);
++out:
+ icmpv6_xmit_unlock();
+ }
+
+@@ -456,15 +507,9 @@
+
+ hash = nexthdr & (MAX_INET_PROTOS - 1);
+
+- for (ipprot = (struct inet6_protocol *) inet6_protos[hash];
+- ipprot != NULL;
+- ipprot=(struct inet6_protocol *)ipprot->next) {
+- if (ipprot->protocol != nexthdr)
+- continue;
+-
+- if (ipprot->err_handler)
+- ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
+- }
++ ipprot = inet6_protos[hash];
++ if (ipprot && ipprot->err_handler)
++ ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
+
+ read_lock(&raw_v6_lock);
+ if ((sk = raw_v6_htable[hash]) != NULL) {
+@@ -480,8 +525,9 @@
+ * Handle icmp messages
+ */
+
+-int icmpv6_rcv(struct sk_buff *skb)
++static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ {
++ struct sk_buff *skb = *pskb;
+ struct net_device *dev = skb->dev;
+ struct in6_addr *saddr, *daddr;
+ struct ipv6hdr *orig_hdr;
+@@ -508,22 +554,7 @@
+ skb_checksum(skb, 0, skb->len, 0))) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+- ntohs(saddr->s6_addr16[0]),
+- ntohs(saddr->s6_addr16[1]),
+- ntohs(saddr->s6_addr16[2]),
+- ntohs(saddr->s6_addr16[3]),
+- ntohs(saddr->s6_addr16[4]),
+- ntohs(saddr->s6_addr16[5]),
+- ntohs(saddr->s6_addr16[6]),
+- ntohs(saddr->s6_addr16[7]),
+- ntohs(daddr->s6_addr16[0]),
+- ntohs(daddr->s6_addr16[1]),
+- ntohs(daddr->s6_addr16[2]),
+- ntohs(daddr->s6_addr16[3]),
+- ntohs(daddr->s6_addr16[4]),
+- ntohs(daddr->s6_addr16[5]),
+- ntohs(daddr->s6_addr16[6]),
+- ntohs(daddr->s6_addr16[7]));
++ NIP6(*saddr), NIP6(*daddr));
+ goto discard_it;
+ }
+ }
+@@ -659,7 +690,12 @@
+ sk->prot->unhash(sk);
+ }
+
+- inet6_add_protocol(&icmpv6_protocol);
++ if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) {
++ printk(KERN_ERR "Failed to register ICMP6 protocol\n");
++ sock_release(icmpv6_socket);
++ icmpv6_socket = NULL;
++ return -EAGAIN;
++ }
+
+ return 0;
+ fail:
+@@ -678,7 +714,7 @@
+ sock_release(icmpv6_socket_cpu(i));
+ icmpv6_socket_cpu(i) = NULL;
+ }
+- inet6_del_protocol(&icmpv6_protocol);
++ inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
+ }
+
+ static struct icmp6_err {
+diff -Nru a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
+--- a/net/ipv6/ip6_fib.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ip6_fib.c 2005-02-13 21:25:09 +11:00
+@@ -40,7 +40,6 @@
+ #include <net/ip6_route.h>
+
+ #define RT6_DEBUG 2
+-#undef CONFIG_IPV6_SUBTREES
+
+ #if RT6_DEBUG >= 3
+ #define RT6_TRACE(x...) printk(KERN_DEBUG x)
+@@ -453,7 +452,6 @@
+ */
+
+ if ((iter->rt6i_dev == rt->rt6i_dev) &&
+- (iter->rt6i_flowr == rt->rt6i_flowr) &&
+ (ipv6_addr_cmp(&iter->rt6i_gateway,
+ &rt->rt6i_gateway) == 0)) {
+ if (!(iter->rt6i_flags&RTF_EXPIRES))
+@@ -500,13 +498,19 @@
+ mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
+ }
+
++void fib6_force_start_gc(void)
++{
++ if (ip6_fib_timer.expires == 0)
++ mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
++}
++
+ /*
+ * Add routing information to the routing tree.
+ * <destination addr>/<source addr>
+ * with source addr info in sub-trees
+ */
+
+-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh)
++int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ struct fib6_node *fn;
+ int err = -ENOMEM;
+@@ -597,8 +601,8 @@
+ is orphan. If it is, shoot it.
+ */
+ st_failure:
+- if (fn && !(fn->fn_flags&RTN_RTINFO|RTN_ROOT))
+- fib_repair_tree(fn);
++ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
++ fib6_repair_tree(fn);
+ dst_free(&rt->u.dst);
+ return err;
+ #endif
+@@ -888,7 +892,7 @@
+ }
+
+ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
+- struct nlmsghdr *nlh)
++ struct nlmsghdr *nlh, void *_rtattr)
+ {
+ struct fib6_walker_t *w;
+ struct rt6_info *rt = *rtp;
+@@ -947,7 +951,7 @@
+ rt6_release(rt);
+ }
+
+-int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh)
++int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ struct fib6_node *fn = rt->rt6i_node;
+ struct rt6_info **rtp;
+@@ -972,7 +976,7 @@
+
+ for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
+ if (*rtp == rt) {
+- fib6_del_route(fn, rtp, nlh);
++ fib6_del_route(fn, rtp, nlh, _rtattr);
+ return 0;
+ }
+ }
+@@ -1101,7 +1105,7 @@
+ res = c->func(rt, c->arg);
+ if (res < 0) {
+ w->leaf = rt;
+- res = fib6_del(rt, NULL);
++ res = fib6_del(rt, NULL, NULL);
+ if (res) {
+ #if RT6_DEBUG >= 2
+ printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
+@@ -1218,6 +1222,7 @@
+
+
+ write_lock_bh(&rt6_lock);
++ ndisc_dst_gc(&gc_args.more);
+ fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
+ write_unlock_bh(&rt6_lock);
+
+@@ -1232,17 +1237,17 @@
+
+ void __init fib6_init(void)
+ {
+- if (!fib6_node_kmem)
+- fib6_node_kmem = kmem_cache_create("fib6_nodes",
+- sizeof(struct fib6_node),
+- 0, SLAB_HWCACHE_ALIGN,
+- NULL, NULL);
++ fib6_node_kmem = kmem_cache_create("fib6_nodes",
++ sizeof(struct fib6_node),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
+ }
+
+ #ifdef MODULE
+ void fib6_gc_cleanup(void)
+ {
+ del_timer(&ip6_fib_timer);
++ kmem_cache_destroy(fib6_node_kmem);
+ }
+ #endif
+
+diff -Nru a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c
+--- a/net/ipv6/ip6_fw.c 2005-02-13 21:25:10 +11:00
++++ /dev/null Wed Dec 31 16:00:00 196900
+@@ -1,390 +0,0 @@
+-/*
+- * IPv6 Firewall
+- * Linux INET6 implementation
+- *
+- * Authors:
+- * Pedro Roque <roque at di.fc.ul.pt>
+- *
+- * $Id: ip6_fw.c,v 1.16 2001/10/31 08:17:58 davem Exp $
+- *
+- * This program is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU General Public License
+- * as published by the Free Software Foundation; either version
+- * 2 of the License, or (at your option) any later version.
+- */
+-
+-#include <linux/config.h>
+-#include <linux/errno.h>
+-#include <linux/types.h>
+-#include <linux/string.h>
+-#include <linux/socket.h>
+-#include <linux/sockios.h>
+-#include <linux/net.h>
+-#include <linux/route.h>
+-#include <linux/netdevice.h>
+-#include <linux/in6.h>
+-#include <linux/udp.h>
+-#include <linux/init.h>
+-
+-#include <net/ipv6.h>
+-#include <net/ip6_route.h>
+-#include <net/ip6_fw.h>
+-#include <net/netlink.h>
+-
+-static unsigned long ip6_fw_rule_cnt;
+-static struct ip6_fw_rule ip6_fw_rule_list = {
+- {0},
+- NULL, NULL,
+- {0},
+- IP6_FW_REJECT
+-};
+-
+-static int ip6_fw_accept(struct dst_entry *dst, struct fl_acc_args *args);
+-
+-struct flow_rule_ops ip6_fw_ops = {
+- ip6_fw_accept
+-};
+-
+-
+-static struct rt6_info ip6_fw_null_entry = {
+- {{NULL, 0, 0, NULL,
+- 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL,
+- ip6_pkt_discard, ip6_pkt_discard, NULL}},
+- NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0UL,
+- 0, &ip6_fw_rule_list, {{{{0}}}, 128}, {{{{0}}}, 128}
+-};
+-
+-static struct fib6_node ip6_fw_fib = {
+- NULL, NULL, NULL, NULL,
+- &ip6_fw_null_entry,
+- 0, RTN_ROOT|RTN_TL_ROOT, 0
+-};
+-
+-rwlock_t ip6_fw_lock = RW_LOCK_UNLOCKED;
+-
+-
+-static void ip6_rule_add(struct ip6_fw_rule *rl)
+-{
+- struct ip6_fw_rule *next;
+-
+- write_lock_bh(&ip6_fw_lock);
+- ip6_fw_rule_cnt++;
+- next = &ip6_fw_rule_list;
+- rl->next = next;
+- rl->prev = next->prev;
+- rl->prev->next = rl;
+- next->prev = rl;
+- write_unlock_bh(&ip6_fw_lock);
+-}
+-
+-static void ip6_rule_del(struct ip6_fw_rule *rl)
+-{
+- struct ip6_fw_rule *next, *prev;
+-
+- write_lock_bh(&ip6_fw_lock);
+- ip6_fw_rule_cnt--;
+- next = rl->next;
+- prev = rl->prev;
+- next->prev = prev;
+- prev->next = next;
+- write_unlock_bh(&ip6_fw_lock);
+-}
+-
+-static __inline__ struct ip6_fw_rule * ip6_fwrule_alloc(void)
+-{
+- struct ip6_fw_rule *rl;
+-
+- rl = kmalloc(sizeof(struct ip6_fw_rule), GFP_ATOMIC);
+- if (rl)
+- {
+- memset(rl, 0, sizeof(struct ip6_fw_rule));
+- rl->flowr.ops = &ip6_fw_ops;
+- }
+- return rl;
+-}
+-
+-static __inline__ void ip6_fwrule_free(struct ip6_fw_rule * rl)
+-{
+- kfree(rl);
+-}
+-
+-static __inline__ int port_match(int rl_port, int fl_port)
+-{
+- int res = 0;
+- if (rl_port == 0 || (rl_port == fl_port))
+- res = 1;
+- return res;
+-}
+-
+-static int ip6_fw_accept_trans(struct ip6_fw_rule *rl,
+- struct fl_acc_args *args)
+-{
+- int res = FLOWR_NODECISION;
+- int proto = 0;
+- int sport = 0;
+- int dport = 0;
+-
+- switch (args->type) {
+- case FL_ARG_FORWARD:
+- {
+- struct sk_buff *skb = args->fl_u.skb;
+- struct ipv6hdr *hdr = skb->nh.ipv6h;
+- int len;
+-
+- len = skb->len - sizeof(struct ipv6hdr);
+-
+- proto = hdr->nexthdr;
+-
+- switch (proto) {
+- case IPPROTO_TCP:
+- {
+- struct tcphdr *th;
+-
+- if (len < sizeof(struct tcphdr)) {
+- res = FLOWR_ERROR;
+- goto out;
+- }
+- th = (struct tcphdr *)(hdr + 1);
+- sport = th->source;
+- dport = th->dest;
+- break;
+- }
+- case IPPROTO_UDP:
+- {
+- struct udphdr *uh;
+-
+- if (len < sizeof(struct udphdr)) {
+- res = FLOWR_ERROR;
+- goto out;
+- }
+- uh = (struct udphdr *)(hdr + 1);
+- sport = uh->source;
+- dport = uh->dest;
+- break;
+- }
+- default:
+- goto out;
+- };
+- break;
+- }
+-
+- case FL_ARG_ORIGIN:
+- {
+- proto = args->fl_u.fl_o.flow->proto;
+-
+- if (proto == IPPROTO_ICMPV6) {
+- goto out;
+- } else {
+- sport = args->fl_u.fl_o.flow->uli_u.ports.sport;
+- dport = args->fl_u.fl_o.flow->uli_u.ports.dport;
+- }
+- break;
+- }
+-
+- if (proto == rl->info.proto &&
+- port_match(args->fl_u.fl_o.flow->uli_u.ports.sport, sport) &&
+- port_match(args->fl_u.fl_o.flow->uli_u.ports.dport, dport)) {
+- if (rl->policy & IP6_FW_REJECT)
+- res = FLOWR_SELECT;
+- else
+- res = FLOWR_CLEAR;
+- }
+-
+- default:
+-#if IP6_FW_DEBUG >= 1
+- printk(KERN_DEBUG "ip6_fw_accept: unknown arg type\n");
+-#endif
+- goto out;
+- };
+-
+-out:
+- return res;
+-}
+-
+-static int ip6_fw_accept(struct dst_entry *dst, struct fl_acc_args *args)
+-{
+- struct rt6_info *rt;
+- struct ip6_fw_rule *rl;
+- int proto;
+- int res = FLOWR_NODECISION;
+-
+- rt = (struct rt6_info *) dst;
+- rl = (struct ip6_fw_rule *) rt->rt6i_flowr;
+-
+- proto = rl->info.proto;
+-
+- switch (proto) {
+- case 0:
+- if (rl->policy & IP6_FW_REJECT)
+- res = FLOWR_SELECT;
+- else
+- res = FLOWR_CLEAR;
+- break;
+- case IPPROTO_TCP:
+- case IPPROTO_UDP:
+- res = ip6_fw_accept_trans(rl, args);
+- break;
+- case IPPROTO_ICMPV6:
+- };
+-
+- return res;
+-}
+-
+-static struct dst_entry * ip6_fw_dup(struct dst_entry *frule,
+- struct dst_entry *rt,
+- struct fl_acc_args *args)
+-{
+- struct ip6_fw_rule *rl;
+- struct rt6_info *nrt;
+- struct rt6_info *frt;
+-
+- frt = (struct rt6_info *) frule;
+-
+- rl = (struct ip6_fw_rule *) frt->rt6i_flowr;
+-
+- nrt = ip6_rt_copy((struct rt6_info *) rt);
+-
+- if (nrt) {
+- nrt->u.dst.input = frule->input;
+- nrt->u.dst.output = frule->output;
+-
+- nrt->rt6i_flowr = flow_clone(frt->rt6i_flowr);
+-
+- nrt->rt6i_flags |= RTF_CACHE;
+- nrt->rt6i_tstamp = jiffies;
+- }
+-
+- return (struct dst_entry *) nrt;
+-}
+-
+-int ip6_fw_reject(struct sk_buff *skb)
+-{
+-#if IP6_FW_DEBUG >= 1
+- printk(KERN_DEBUG "packet rejected: \n");
+-#endif
+-
+- icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADM_PROHIBITED, 0,
+- skb->dev);
+- /*
+- * send it via netlink, as (rule, skb)
+- */
+-
+- kfree_skb(skb);
+- return 0;
+-}
+-
+-int ip6_fw_discard(struct sk_buff *skb)
+-{
+- printk(KERN_DEBUG "ip6_fw: BUG fw_reject called\n");
+- kfree_skb(skb);
+- return 0;
+-}
+-
+-int ip6_fw_msg_add(struct ip6_fw_msg *msg)
+-{
+- struct in6_rtmsg rtmsg;
+- struct ip6_fw_rule *rl;
+- struct rt6_info *rt;
+- int err;
+-
+- ipv6_addr_copy(&rtmsg.rtmsg_dst, &msg->dst);
+- ipv6_addr_copy(&rtmsg.rtmsg_src, &msg->src);
+- rtmsg.rtmsg_dst_len = msg->dst_len;
+- rtmsg.rtmsg_src_len = msg->src_len;
+- rtmsg.rtmsg_metric = IP6_RT_PRIO_FW;
+-
+- rl = ip6_fwrule_alloc();
+-
+- if (rl == NULL)
+- return -ENOMEM;
+-
+- rl->policy = msg->policy;
+- rl->info.proto = msg->proto;
+- rl->info.uli_u.data = msg->u.data;
+-
+- rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY;
+- err = ip6_route_add(&rtmsg);
+-
+- if (err) {
+- ip6_fwrule_free(rl);
+- return err;
+- }
+-
+- /* The rest will not work for now. --ABK (989725) */
+-
+-#ifndef notdef
+- ip6_fwrule_free(rl);
+- return -EPERM;
+-#else
+- rt->u.dst.error = -EPERM;
+-
+- if (msg->policy == IP6_FW_ACCEPT) {
+- /*
+- * Accept rules are never selected
+- * (i.e. packets use normal forwarding)
+- */
+- rt->u.dst.input = ip6_fw_discard;
+- rt->u.dst.output = ip6_fw_discard;
+- } else {
+- rt->u.dst.input = ip6_fw_reject;
+- rt->u.dst.output = ip6_fw_reject;
+- }
+-
+- ip6_rule_add(rl);
+-
+- rt->rt6i_flowr = flow_clone((struct flow_rule *)rl);
+-
+- return 0;
+-#endif
+-}
+-
+-static int ip6_fw_msgrcv(int unit, struct sk_buff *skb)
+-{
+- int count = 0;
+-
+- while (skb->len) {
+- struct ip6_fw_msg *msg;
+-
+- if (skb->len < sizeof(struct ip6_fw_msg)) {
+- count = -EINVAL;
+- break;
+- }
+-
+- msg = (struct ip6_fw_msg *) skb->data;
+- skb_pull(skb, sizeof(struct ip6_fw_msg));
+- count += sizeof(struct ip6_fw_msg);
+-
+- switch (msg->action) {
+- case IP6_FW_MSG_ADD:
+- ip6_fw_msg_add(msg);
+- break;
+- case IP6_FW_MSG_DEL:
+- break;
+- default:
+- return -EINVAL;
+- };
+- }
+-
+- return count;
+-}
+-
+-static void ip6_fw_destroy(struct flow_rule *rl)
+-{
+- ip6_fwrule_free((struct ip6_fw_rule *)rl);
+-}
+-
+-#ifdef MODULE
+-#define ip6_fw_init module_init
+-#endif
+-
+-void __init ip6_fw_init(void)
+-{
+- netlink_attach(NETLINK_IP6_FW, ip6_fw_msgrcv);
+-}
+-
+-#ifdef MODULE
+-void cleanup_module(void)
+-{
+- netlink_detach(NETLINK_IP6_FW);
+-}
+-#endif
+diff -Nru a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
+--- a/net/ipv6/ip6_input.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ip6_input.c 2005-02-13 21:25:09 +11:00
+@@ -15,6 +15,11 @@
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
++/* Changes
++ *
++ * Mitsuru KANDA @USAGI and
++ * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
++ */
+
+ #include <linux/errno.h>
+ #include <linux/types.h>
+@@ -39,6 +44,7 @@
+ #include <net/ndisc.h>
+ #include <net/ip6_route.h>
+ #include <net/addrconf.h>
++#include <net/xfrm.h>
+
+
+
+@@ -47,7 +53,7 @@
+ if (skb->dst == NULL)
+ ip6_route_input(skb);
+
+- return skb->dst->input(skb);
++ return dst_input(skb);
+ }
+
+ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+@@ -121,13 +127,12 @@
+
+ static inline int ip6_input_finish(struct sk_buff *skb)
+ {
+- struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct inet6_protocol *ipprot;
+ struct sock *raw_sk;
+- int nhoff;
++ unsigned int nhoff;
+ int nexthdr;
+- int found = 0;
+ u8 hash;
++ int cksum_sub = 0;
+
+ skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
+
+@@ -135,7 +140,7 @@
+ * Parse extension headers
+ */
+
+- nexthdr = hdr->nexthdr;
++ nexthdr = skb->nh.ipv6h->nexthdr;
+ nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ /* Skip hop-by-hop options, they are already parsed. */
+@@ -145,58 +150,46 @@
+ skb->h.raw += (skb->h.raw[1]+1)<<3;
+ }
+
+- /* This check is sort of optimization.
+- It would be stupid to detect for optional headers,
+- which are missing with probability of 200%
+- */
+- if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) {
+- nhoff = ipv6_parse_exthdrs(&skb, nhoff);
+- if (nhoff < 0)
+- return 0;
+- nexthdr = skb->nh.raw[nhoff];
+- hdr = skb->nh.ipv6h;
+- }
+-
++resubmit:
+ if (!pskb_pull(skb, skb->h.raw - skb->data))
+ goto discard;
++ nexthdr = skb->nh.raw[nhoff];
+
+- if (skb->ip_summed == CHECKSUM_HW)
+- skb->csum = csum_sub(skb->csum,
+- csum_partial(skb->nh.raw, skb->h.raw-skb->nh.raw, 0));
+-
+- raw_sk = raw_v6_htable[nexthdr&(MAX_INET_PROTOS-1)];
++ raw_sk = raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)];
+ if (raw_sk)
+- raw_sk = ipv6_raw_deliver(skb, nexthdr);
++ ipv6_raw_deliver(skb, nexthdr);
+
+ hash = nexthdr & (MAX_INET_PROTOS - 1);
+- for (ipprot = (struct inet6_protocol *) inet6_protos[hash];
+- ipprot != NULL;
+- ipprot = (struct inet6_protocol *) ipprot->next) {
+- struct sk_buff *buff = skb;
+-
+- if (ipprot->protocol != nexthdr)
+- continue;
+-
+- if (ipprot->copy || raw_sk)
+- buff = skb_clone(skb, GFP_ATOMIC);
+-
+- if (buff)
+- ipprot->handler(buff);
+- found = 1;
+- }
+-
+- if (raw_sk) {
+- rawv6_rcv(raw_sk, skb);
+- sock_put(raw_sk);
+- found = 1;
+- }
+-
+- /*
+- * not found: send ICMP parameter problem back
+- */
+- if (!found) {
+- IP6_INC_STATS_BH(Ip6InUnknownProtos);
+- icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff);
++ if ((ipprot = inet6_protos[hash]) != NULL) {
++ int ret;
++
++ if (ipprot->flags & INET6_PROTO_FINAL) {
++ if (!cksum_sub && skb->ip_summed == CHECKSUM_HW) {
++ skb->csum = csum_sub(skb->csum,
++ csum_partial(skb->nh.raw, skb->h.raw-skb->nh.raw, 0));
++ cksum_sub++;
++ }
++ }
++ if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
++ !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return 0;
++ }
++
++ ret = ipprot->handler(&skb, &nhoff);
++ if (ret > 0)
++ goto resubmit;
++ else if (ret == 0)
++ IP6_INC_STATS_BH(Ip6InDelivers);
++ } else {
++ if (!raw_sk) {
++ if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ IP6_INC_STATS_BH(Ip6InUnknownProtos);
++ icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff);
++ }
++ } else {
++ kfree_skb(skb);
++ }
+ }
+
+ return 0;
+@@ -246,7 +239,7 @@
+ skb2 = skb;
+ }
+
+- dst->output(skb2);
++ dst_output(skb2);
+ }
+ }
+ #endif
+diff -Nru a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+--- a/net/ipv6/ip6_output.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ip6_output.c 2005-02-13 21:25:09 +11:00
+@@ -23,6 +23,9 @@
+ *
+ * H. von Brand : Added missing #include <linux/string.h>
+ * Imran Patel : frag id should be in NBO
++ * Kazunori MIYAZAWA @USAGI
++ * : add ip6_append_data and related functions
++ * for datagram xmit
+ */
+
+ #include <linux/config.h>
+@@ -49,6 +52,9 @@
+ #include <net/addrconf.h>
+ #include <net/rawv6.h>
+ #include <net/icmp.h>
++#include <net/xfrm.h>
++
++static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));
+
+ static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
+ {
+@@ -99,7 +105,7 @@
+ }
+
+
+-int ip6_output(struct sk_buff *skb)
++static int ip6_output2(struct sk_buff *skb)
+ {
+ struct dst_entry *dst = skb->dst;
+ struct net_device *dev = dst->dev;
+@@ -134,21 +140,27 @@
+ return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
+ }
+
++int ip6_output(struct sk_buff *skb)
++{
++ if ((skb->len > dst_pmtu(skb->dst) || skb_shinfo(skb)->frag_list))
++ return ip6_fragment(skb, ip6_output2);
++ else
++ return ip6_output2(skb);
++}
+
+ #ifdef CONFIG_NETFILTER
+ int ip6_route_me_harder(struct sk_buff *skb)
+ {
+ struct ipv6hdr *iph = skb->nh.ipv6h;
+ struct dst_entry *dst;
+- struct flowi fl;
+-
+- fl.proto = iph->nexthdr;
+- fl.fl6_dst = &iph->daddr;
+- fl.fl6_src = &iph->saddr;
+- fl.oif = skb->sk ? skb->sk->bound_dev_if : 0;
+- fl.fl6_flowlabel = 0;
+- fl.uli_u.ports.dport = 0;
+- fl.uli_u.ports.sport = 0;
++ struct flowi fl = {
++ .oif = skb->sk ? skb->sk->bound_dev_if : 0,
++ .nl_u =
++ { .ip6_u =
++ { .daddr = iph->daddr,
++ .saddr = iph->saddr, } },
++ .proto = iph->nexthdr,
++ };
+
+ dst = ip6_route_output(skb->sk, &fl);
+
+@@ -177,7 +189,7 @@
+ }
+ }
+ #endif /* CONFIG_NETFILTER */
+- return skb->dst->output(skb);
++ return dst_output(skb);
+ }
+
+ /*
+@@ -188,12 +200,13 @@
+ struct ipv6_txoptions *opt)
+ {
+ struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL;
+- struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr;
++ struct in6_addr *first_hop = &fl->fl6_dst;
+ struct dst_entry *dst = skb->dst;
+ struct ipv6hdr *hdr;
+ u8 proto = fl->proto;
+ int seg_len = skb->len;
+ int hlimit;
++ u32 mtu;
+
+ if (opt) {
+ int head_room;
+@@ -231,16 +244,17 @@
+ if (np)
+ hlimit = np->hop_limit;
+ if (hlimit < 0)
+- hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
+
+ hdr->payload_len = htons(seg_len);
+ hdr->nexthdr = proto;
+ hdr->hop_limit = hlimit;
+
+- ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
++ ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
+ ipv6_addr_copy(&hdr->daddr, first_hop);
+
+- if (skb->len <= dst->pmtu) {
++ mtu = dst_pmtu(dst);
++ if (skb->len <= mtu) {
+ IP6_INC_STATS(Ip6OutRequests);
+ return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
+ }
+@@ -248,7 +262,7 @@
+ if (net_ratelimit())
+ printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+ skb->dev = dst->dev;
+- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+@@ -302,8 +316,8 @@
+ hdr->hop_limit = hlimit;
+ hdr->nexthdr = fl->proto;
+
+- ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
+- ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
++ ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
++ ipv6_addr_copy(&hdr->daddr, &fl->fl6_dst);
+ return hdr;
+ }
+
+@@ -507,19 +521,19 @@
+ struct ipv6_txoptions *opt, int hlimit, int flags)
+ {
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+- struct in6_addr *final_dst = NULL;
++ struct in6_addr final_dst_buf, *final_dst = NULL;
+ struct dst_entry *dst;
+ int err = 0;
+ unsigned int pktlength, jumbolen, mtu;
+- struct in6_addr saddr;
+
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+- final_dst = fl->fl6_dst;
+- fl->fl6_dst = rt0->addr;
++ ipv6_addr_copy(&final_dst_buf, &fl->fl6_dst);
++ final_dst = &final_dst_buf;
++ ipv6_addr_copy(&fl->fl6_dst, rt0->addr);
+ }
+
+- if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr))
++ if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst))
+ fl->oif = np->mcast_oif;
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+@@ -545,9 +559,9 @@
+ */
+
+ if (((rt->rt6i_dst.plen != 128 ||
+- ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr))
++ ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
+ && (np->daddr_cache == NULL ||
+- ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache)))
++ ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
+ || (fl->oif && fl->oif != dst->dev->ifindex)) {
+ dst = NULL;
+ } else
+@@ -563,8 +577,8 @@
+ return -ENETUNREACH;
+ }
+
+- if (fl->fl6_src == NULL) {
+- err = ipv6_get_saddr(dst, fl->fl6_dst, &saddr);
++ if (ipv6_addr_any(&fl->fl6_src)) {
++ err = ipv6_get_saddr(dst, &fl->fl6_dst, &fl->fl6_src);
+
+ if (err) {
+ #if IP6_DEBUG >= 2
+@@ -573,17 +587,23 @@
+ #endif
+ goto out;
+ }
+- fl->fl6_src = &saddr;
+ }
+ pktlength = length;
+
++ if (dst) {
++ if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) {
++ dst_release(dst);
++ return -ENETUNREACH;
++ }
++ }
++
+ if (hlimit < 0) {
+- if (ipv6_addr_is_multicast(fl->fl6_dst))
++ if (ipv6_addr_is_multicast(&fl->fl6_dst))
+ hlimit = np->mcast_hops;
+ else
+ hlimit = np->hop_limit;
+ if (hlimit < 0)
+- hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
+ }
+
+ jumbolen = 0;
+@@ -593,7 +613,7 @@
+ if (opt)
+ pktlength += opt->opt_flen + opt->opt_nflen;
+
+- if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) {
++ if (pktlength > sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+ /* Jumbo datagram.
+ It is assumed, that in the case of hdrincl
+ jumbo option is supplied by user.
+@@ -603,7 +623,7 @@
+ }
+ }
+
+- mtu = dst->pmtu;
++ mtu = dst_pmtu(dst);
+ if (np->frag_size < mtu) {
+ if (np->frag_size)
+ mtu = np->frag_size;
+@@ -631,9 +651,8 @@
+ err = 0;
+ if (flags&MSG_PROBE)
+ goto out;
+-
+- skb = sock_alloc_send_skb(sk, pktlength + 15 +
+- dev->hard_header_len,
++ /* alloc skb with mtu as we do in the IPv4 stack for IPsec */
++ skb = sock_alloc_send_skb(sk, mtu + LL_RESERVED_SPACE(dev),
+ flags & MSG_DONTWAIT, &err);
+
+ if (skb == NULL) {
+@@ -664,6 +683,8 @@
+ err = getfrag(data, &hdr->saddr,
+ ((char *) hdr) + (pktlength - length),
+ 0, length);
++ if (!opt || !opt->dst1opt)
++ skb->h.raw = ((char *) hdr) + (pktlength - length);
+
+ if (!err) {
+ IP6_INC_STATS(Ip6OutRequests);
+@@ -688,7 +709,9 @@
+ * cleanup
+ */
+ out:
+- ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL);
++ ip6_dst_store(sk, dst,
++ !ipv6_addr_cmp(&fl->fl6_dst, &np->daddr) ?
++ &np->daddr : NULL);
+ if (err > 0)
+ err = np->recverr ? net_xmit_errno(err) : 0;
+ return err;
+@@ -723,7 +746,7 @@
+
+ static inline int ip6_forward_finish(struct sk_buff *skb)
+ {
+- return skb->dst->output(skb);
++ return dst_output(skb);
+ }
+
+ int ip6_forward(struct sk_buff *skb)
+@@ -735,6 +758,9 @@
+ if (ipv6_devconf.forwarding == 0)
+ goto error;
+
++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb))
++ goto drop;
++
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /*
+@@ -769,6 +795,9 @@
+ return -ETIMEDOUT;
+ }
+
++ if (!xfrm6_route_forward(skb))
++ goto drop;
++
+ /* IPv6 specs say nothing about it, but it is clear that we cannot
+ send redirects to source routed frames.
+ */
+@@ -799,10 +828,10 @@
+ goto error;
+ }
+
+- if (skb->len > dst->pmtu) {
++ if (skb->len > dst_pmtu(dst)) {
+ /* Again, force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_pmtu(dst), skb->dev);
+ IP6_INC_STATS_BH(Ip6InTooBigErrors);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+@@ -825,4 +854,657 @@
+ drop:
+ kfree_skb(skb);
+ return -EINVAL;
++}
++
++static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
++{
++ to->pkt_type = from->pkt_type;
++ to->priority = from->priority;
++ to->protocol = from->protocol;
++ to->security = from->security;
++ to->dst = dst_clone(from->dst);
++ to->dev = from->dev;
++
++#ifdef CONFIG_NET_SCHED
++ to->tc_index = from->tc_index;
++#endif
++#ifdef CONFIG_NETFILTER
++ to->nfmark = from->nfmark;
++ /* Connection association is same as pre-frag packet */
++ to->nfct = from->nfct;
++ nf_conntrack_get(to->nfct);
++#ifdef CONFIG_NETFILTER_DEBUG
++ to->nf_debug = from->nf_debug;
++#endif
++#endif
++}
++
++int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
++{
++ u16 offset = sizeof(struct ipv6hdr);
++ struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
++ unsigned int packet_len = skb->tail - skb->nh.raw;
++ int found_rhdr = 0;
++ *nexthdr = &skb->nh.ipv6h->nexthdr;
++
++ while (offset + 1 <= packet_len) {
++
++ switch (**nexthdr) {
++
++ case NEXTHDR_HOP:
++ case NEXTHDR_ROUTING:
++ case NEXTHDR_DEST:
++ if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
++ if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
++ offset += ipv6_optlen(exthdr);
++ *nexthdr = &exthdr->nexthdr;
++ exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
++ break;
++ default :
++ return offset;
++ }
++ }
++
++ return offset;
++}
++
++static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
++{
++ struct net_device *dev;
++ struct rt6_info *rt = (struct rt6_info*)skb->dst;
++ struct sk_buff *frag;
++ struct ipv6hdr *tmp_hdr;
++ struct frag_hdr *fh;
++ unsigned int mtu, hlen, left, len;
++ u32 frag_id = 0;
++ int ptr, offset = 0, err=0;
++ u8 *prevhdr, nexthdr = 0;
++
++ dev = rt->u.dst.dev;
++ hlen = ip6_find_1stfragopt(skb, &prevhdr);
++ nexthdr = *prevhdr;
++
++ mtu = dst_pmtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
++
++ if (skb_shinfo(skb)->frag_list) {
++ int first_len = skb_pagelen(skb);
++
++ if (first_len - hlen > mtu ||
++ ((first_len - hlen) & 7) ||
++ skb_cloned(skb))
++ goto slow_path;
++
++ for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
++ /* Correct geometry. */
++ if (frag->len > mtu ||
++ ((frag->len & 7) && frag->next) ||
++ skb_headroom(frag) < hlen)
++ goto slow_path;
++
++ /* Correct socket ownership. */
++ if (frag->sk == NULL)
++ goto slow_path;
++
++ /* Partially cloned skb? */
++ if (skb_shared(frag))
++ goto slow_path;
++ }
++
++ err = 0;
++ offset = 0;
++ frag = skb_shinfo(skb)->frag_list;
++ skb_shinfo(skb)->frag_list = 0;
++ /* BUILD HEADER */
++
++ tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
++ if (!tmp_hdr) {
++ IP6_INC_STATS(Ip6FragFails);
++ return -ENOMEM;
++ }
++
++ *prevhdr = NEXTHDR_FRAGMENT;
++ memcpy(tmp_hdr, skb->nh.raw, hlen);
++ __skb_pull(skb, hlen);
++ fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
++ skb->nh.raw = __skb_push(skb, hlen);
++ memcpy(skb->nh.raw, tmp_hdr, hlen);
++
++ ipv6_select_ident(skb, fh);
++ fh->nexthdr = nexthdr;
++ fh->reserved = 0;
++ fh->frag_off = htons(IP6_MF);
++ frag_id = fh->identification;
++
++ first_len = skb_pagelen(skb);
++ skb->data_len = first_len - skb_headlen(skb);
++ skb->len = first_len;
++ skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
++
++
++ for (;;) {
++ /* Prepare header of the next frame,
++ * before previous one went down. */
++ if (frag) {
++ frag->h.raw = frag->data;
++ fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
++ frag->nh.raw = __skb_push(frag, hlen);
++ memcpy(frag->nh.raw, tmp_hdr, hlen);
++ offset += skb->len - hlen - sizeof(struct frag_hdr);
++ fh->nexthdr = nexthdr;
++ fh->reserved = 0;
++ fh->frag_off = htons(offset);
++ if (frag->next != NULL)
++ fh->frag_off |= htons(IP6_MF);
++ fh->identification = frag_id;
++ frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
++ ip6_copy_metadata(frag, skb);
++ }
++ err = output(skb);
++
++ if (err || !frag)
++ break;
++
++ skb = frag;
++ frag = skb->next;
++ skb->next = NULL;
++ }
++
++ if (tmp_hdr)
++ kfree(tmp_hdr);
++
++ if (err == 0) {
++ IP6_INC_STATS(Ip6FragOKs);
++ return 0;
++ }
++
++ while (frag) {
++ skb = frag->next;
++ kfree_skb(frag);
++ frag = skb;
++ }
++
++ IP6_INC_STATS(Ip6FragFails);
++ return err;
++ }
++
++slow_path:
++ left = skb->len - hlen; /* Space per frame */
++ ptr = hlen; /* Where to start from */
++
++ /*
++ * Fragment the datagram.
++ */
++
++ *prevhdr = NEXTHDR_FRAGMENT;
++
++ /*
++ * Keep copying data until we run out.
++ */
++ while(left > 0) {
++ len = left;
++ /* IF: it doesn't fit, use 'mtu' - the data space left */
++ if (len > mtu)
++ len = mtu;
++ /* IF: we are not sending upto and including the packet end
++ then align the next start on an eight byte boundary */
++ if (len < left) {
++ len &= ~7;
++ }
++ /*
++ * Allocate buffer.
++ */
++
++ if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
++ NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
++ err = -ENOMEM;
++ goto fail;
++ }
++
++ /*
++ * Set up data on packet
++ */
++
++ ip6_copy_metadata(frag, skb);
++ skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
++ skb_put(frag, len + hlen + sizeof(struct frag_hdr));
++ frag->nh.raw = frag->data;
++ fh = (struct frag_hdr*)(frag->data + hlen);
++ frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
++
++ /*
++ * Charge the memory for the fragment to any owner
++ * it might possess
++ */
++ if (skb->sk)
++ skb_set_owner_w(frag, skb->sk);
++
++ /*
++ * Copy the packet header into the new buffer.
++ */
++ memcpy(frag->nh.raw, skb->data, hlen);
++
++ /*
++ * Build fragment header.
++ */
++ fh->nexthdr = nexthdr;
++ fh->reserved = 0;
++ if (frag_id) {
++ ipv6_select_ident(skb, fh);
++ frag_id = fh->identification;
++ } else
++ fh->identification = frag_id;
++
++ /*
++ * Copy a block of the IP datagram.
++ */
++ if (skb_copy_bits(skb, ptr, frag->h.raw, len))
++ BUG();
++ left -= len;
++
++ fh->frag_off = htons(offset);
++ if (left > 0)
++ fh->frag_off |= htons(IP6_MF);
++ frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
++
++ ptr += len;
++ offset += len;
++
++ /*
++ * Put this fragment into the sending queue.
++ */
++
++ IP6_INC_STATS(Ip6FragCreates);
++
++ err = output(frag);
++ if (err)
++ goto fail;
++ }
++ kfree_skb(skb);
++ IP6_INC_STATS(Ip6FragOKs);
++ return err;
++
++fail:
++ kfree_skb(skb);
++ IP6_INC_STATS(Ip6FragFails);
++ return err;
++}
++
++int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
++{
++ int err = 0;
++
++ if (sk) {
++ struct ipv6_pinfo *np = inet6_sk(sk);
++
++ *dst = __sk_dst_check(sk, np->dst_cookie);
++ if (*dst) {
++ struct rt6_info *rt = (struct rt6_info*)*dst;
++
++ /* Yes, checking route validity in not connected
++ case is not very simple. Take into account,
++ that we do not support routing by source, TOS,
++ and MSG_DONTROUTE --ANK (980726)
++
++ 1. If route was host route, check that
++ cached destination is current.
++ If it is network route, we still may
++ check its validity using saved pointer
++ to the last used address: daddr_cache.
++ We do not want to save whole address now,
++ (because main consumer of this service
++ is tcp, which has not this problem),
++ so that the last trick works only on connected
++ sockets.
++ 2. oif also should be the same.
++ */
++
++ if (((rt->rt6i_dst.plen != 128 ||
++ ipv6_addr_cmp(&fl->fl6_dst, &rt->rt6i_dst.addr))
++ && (np->daddr_cache == NULL ||
++ ipv6_addr_cmp(&fl->fl6_dst, np->daddr_cache)))
++ || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
++ *dst = NULL;
++ } else
++ dst_hold(*dst);
++ }
++ }
++
++ if (*dst == NULL)
++ *dst = ip6_route_output(sk, fl);
++
++ if ((err = (*dst)->error))
++ goto out_err_release;
++
++ if (ipv6_addr_any(&fl->fl6_src)) {
++ err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
++
++ if (err) {
++#if IP6_DEBUG >= 2
++ printk(KERN_DEBUG "ip6_dst_lookup: "
++ "no availiable source address\n");
++#endif
++ goto out_err_release;
++ }
++ }
++ if ((err = xfrm_lookup(dst, fl, sk, 0)) < 0) {
++ err = -ENETUNREACH;
++ goto out_err_release;
++ }
++
++ return 0;
++
++out_err_release:
++ dst_release(*dst);
++ *dst = NULL;
++ return err;
++}
++
++int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
++ void *from, int length, int transhdrlen,
++ int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
++ unsigned int flags)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct sk_buff *skb;
++ unsigned int maxfraglen, fragheaderlen;
++ int exthdrlen;
++ int hh_len;
++ int mtu;
++ int copy = 0;
++ int err;
++ int offset = 0;
++ int csummode = CHECKSUM_NONE;
++
++ if (flags&MSG_PROBE)
++ return 0;
++ if (skb_queue_empty(&sk->write_queue)) {
++ /*
++ * setup for corking
++ */
++ if (opt) {
++ if (np->cork.opt == NULL) {
++ np->cork.opt = kmalloc(opt->tot_len, sk->allocation);
++ if (unlikely(np->cork.opt == NULL))
++ return -ENOBUFS;
++ }
++ memcpy(np->cork.opt, opt, opt->tot_len);
++ inet->cork.flags |= IPCORK_OPT;
++ /* need source address above miyazawa*/
++ }
++ dst_hold(&rt->u.dst);
++ np->cork.rt = rt;
++ np->cork.fl = *fl;
++ np->cork.hop_limit = hlimit;
++ inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst);
++ inet->cork.length = 0;
++ inet->sndmsg_page = NULL;
++ inet->sndmsg_off = 0;
++ exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
++ length += exthdrlen;
++ transhdrlen += exthdrlen;
++ } else {
++ rt = np->cork.rt;
++ if (inet->cork.flags & IPCORK_OPT)
++ opt = np->cork.opt;
++ transhdrlen = 0;
++ exthdrlen = 0;
++ mtu = inet->cork.fragsize;
++ }
++
++ hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16;
++
++ fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
++ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
++
++ if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
++ if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
++ ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
++ return -EMSGSIZE;
++ }
++ }
++
++ inet->cork.length += length;
++
++ if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
++ goto alloc_new_skb;
++
++ while (length > 0) {
++ if ((copy = maxfraglen - skb->len) <= 0) {
++ char *data;
++ unsigned int datalen;
++ unsigned int fraglen;
++ unsigned int alloclen;
++ BUG_TRAP(copy == 0);
++alloc_new_skb:
++ datalen = maxfraglen - fragheaderlen;
++ if (datalen > length)
++ datalen = length;
++ fraglen = datalen + fragheaderlen;
++ if ((flags & MSG_MORE) &&
++ !(rt->u.dst.dev->features&NETIF_F_SG))
++ alloclen = maxfraglen;
++ else
++ alloclen = fraglen;
++ alloclen += sizeof(struct frag_hdr);
++ if (transhdrlen) {
++ skb = sock_alloc_send_skb(sk,
++ alloclen + hh_len + 15,
++ (flags & MSG_DONTWAIT), &err);
++ } else {
++ skb = NULL;
++ if (atomic_read(&sk->wmem_alloc) <= 2*sk->sndbuf)
++ skb = sock_wmalloc(sk,
++ alloclen + hh_len + 15, 1,
++ sk->allocation);
++ if (unlikely(skb == NULL))
++ err = -ENOBUFS;
++ }
++ if (skb == NULL)
++ goto error;
++ /*
++ * Fill in the control structures
++ */
++ skb->ip_summed = csummode;
++ skb->csum = 0;
++ /* reserve 8 byte for fragmentation */
++ skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
++
++ /*
++ * Find where to start putting bytes
++ */
++ data = skb_put(skb, fraglen);
++ skb->nh.raw = data + exthdrlen;
++ data += fragheaderlen;
++ skb->h.raw = data + exthdrlen;
++ copy = datalen - transhdrlen;
++ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
++ err = -EFAULT;
++ kfree_skb(skb);
++ goto error;
++ }
++
++ offset += copy;
++ length -= datalen;
++ transhdrlen = 0;
++ exthdrlen = 0;
++ csummode = CHECKSUM_NONE;
++
++ /*
++ * Put the packet on the pending queue
++ */
++ __skb_queue_tail(&sk->write_queue, skb);
++ continue;
++ }
++
++ if (copy > length)
++ copy = length;
++
++ if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
++ unsigned int off;
++
++ off = skb->len;
++ if (getfrag(from, skb_put(skb, copy),
++ offset, copy, off, skb) < 0) {
++ __skb_trim(skb, off);
++ err = -EFAULT;
++ goto error;
++ }
++ } else {
++ int i = skb_shinfo(skb)->nr_frags;
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
++ struct page *page = inet->sndmsg_page;
++ int off = inet->sndmsg_off;
++ unsigned int left;
++
++ if (page && (left = PAGE_SIZE - off) > 0) {
++ if (copy >= left)
++ copy = left;
++ if (page != frag->page) {
++ if (i == MAX_SKB_FRAGS) {
++ err = -EMSGSIZE;
++ goto error;
++ }
++ get_page(page);
++ skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
++ frag = &skb_shinfo(skb)->frags[i];
++ }
++ } else if(i < MAX_SKB_FRAGS) {
++ if (copy > PAGE_SIZE)
++ copy = PAGE_SIZE;
++ page = alloc_pages(sk->allocation, 0);
++ if (page == NULL) {
++ err = -ENOMEM;
++ goto error;
++ }
++ inet->sndmsg_page = page;
++ inet->sndmsg_off = 0;
++
++ skb_fill_page_desc(skb, i, page, 0, 0);
++ frag = &skb_shinfo(skb)->frags[i];
++ skb->truesize += PAGE_SIZE;
++ atomic_add(PAGE_SIZE, &sk->wmem_alloc);
++ } else {
++ err = -EMSGSIZE;
++ goto error;
++ }
++ if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
++ err = -EFAULT;
++ goto error;
++ }
++ inet->sndmsg_off += copy;
++ frag->size += copy;
++ skb->len += copy;
++ skb->data_len += copy;
++ }
++ offset += copy;
++ length -= copy;
++ }
++ return 0;
++error:
++ inet->cork.length -= length;
++ IP6_INC_STATS(Ip6OutDiscards);
++ return err;
++}
++
++int ip6_push_pending_frames(struct sock *sk)
++{
++ struct sk_buff *skb, *tmp_skb;
++ struct sk_buff **tail_skb;
++ struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
++ struct inet_opt *inet = inet_sk(sk);
++ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct ipv6hdr *hdr;
++ struct ipv6_txoptions *opt = np->cork.opt;
++ struct rt6_info *rt = np->cork.rt;
++ struct flowi *fl = &np->cork.fl;
++ unsigned char proto = fl->proto;
++ int err = 0;
++
++ if ((skb = __skb_dequeue(&sk->write_queue)) == NULL)
++ goto out;
++ tail_skb = &(skb_shinfo(skb)->frag_list);
++
++ /* move skb->data to ip header from ext header */
++ if (skb->data < skb->nh.raw)
++ __skb_pull(skb, skb->nh.raw - skb->data);
++ while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) {
++ __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
++ *tail_skb = tmp_skb;
++ tail_skb = &(tmp_skb->next);
++ skb->len += tmp_skb->len;
++ skb->data_len += tmp_skb->len;
++#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
++ skb->truesize += tmp_skb->truesize;
++ __sock_put(tmp_skb->sk);
++ tmp_skb->destructor = NULL;
++ tmp_skb->sk = NULL;
++#endif
++ }
++
++ ipv6_addr_copy(final_dst, &fl->fl6_dst);
++ __skb_pull(skb, skb->h.raw - skb->nh.raw);
++ if (opt && opt->opt_flen)
++ ipv6_push_frag_opts(skb, opt, &proto);
++ if (opt && opt->opt_nflen)
++ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
++
++ skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
++
++ *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
++
++ if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
++ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++ else
++ hdr->payload_len = 0;
++ hdr->hop_limit = np->cork.hop_limit;
++ hdr->nexthdr = proto;
++ ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
++ ipv6_addr_copy(&hdr->daddr, final_dst);
++
++ skb->dst = dst_clone(&rt->u.dst);
++ err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
++ if (err) {
++ if (err > 0)
++ err = inet->recverr ? net_xmit_errno(err) : 0;
++ if (err)
++ goto error;
++ }
++
++out:
++ inet->cork.flags &= ~IPCORK_OPT;
++ if (np->cork.opt) {
++ kfree(np->cork.opt);
++ np->cork.opt = NULL;
++ }
++ if (np->cork.rt) {
++ np->cork.rt = NULL;
++ }
++ memset(&np->cork.fl, 0, sizeof(np->cork.fl));
++ return err;
++error:
++ goto out;
++}
++
++void ip6_flush_pending_frames(struct sock *sk)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct sk_buff *skb;
++
++ while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL)
++ kfree_skb(skb);
++
++ inet->cork.flags &= ~IPCORK_OPT;
++
++ if (np->cork.opt) {
++ kfree(np->cork.opt);
++ np->cork.opt = NULL;
++ }
++ if (np->cork.rt) {
++ dst_release(&np->cork.rt->u.dst);
++ dst_release(&np->cork.rt->u.dst);
++ np->cork.rt = NULL;
++ }
++ memset(&np->cork.fl, 0, sizeof(np->cork.fl));
+ }
+diff -Nru a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/ip6_tunnel.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,1146 @@
++/*
++ * IPv6 over IPv6 tunnel device
++ * Linux INET6 implementation
++ *
++ * Authors:
++ * Ville Nuorvala <vnuorval at tcs.hut.fi>
++ *
++ * $Id$
++ *
++ * Based on:
++ * linux/net/ipv6/sit.c
++ *
++ * RFC 2473
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/errno.h>
++#include <linux/types.h>
++#include <linux/sockios.h>
++#include <linux/if.h>
++#include <linux/in.h>
++#include <linux/ip.h>
++#include <linux/if_tunnel.h>
++#include <linux/net.h>
++#include <linux/in6.h>
++#include <linux/netdevice.h>
++#include <linux/if_arp.h>
++#include <linux/icmpv6.h>
++#include <linux/init.h>
++#include <linux/route.h>
++#include <linux/rtnetlink.h>
++#include <linux/netfilter_ipv6.h>
++
++#include <asm/uaccess.h>
++#include <asm/atomic.h>
++
++#include <net/ip.h>
++#include <net/ipv6.h>
++#include <net/protocol.h>
++#include <net/ip6_route.h>
++#include <net/addrconf.h>
++#include <net/ip6_tunnel.h>
++#include <net/xfrm.h>
++
++MODULE_AUTHOR("Ville Nuorvala");
++MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel");
++MODULE_LICENSE("GPL");
++
++#define IPV6_TLV_TEL_DST_SIZE 8
++
++#ifdef IP6_TNL_DEBUG
++#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__)
++#else
++#define IP6_TNL_TRACE(x...) do {;} while(0)
++#endif
++
++#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
++
++#define HASH_SIZE 32
++
++#define HASH(addr) (((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
++ (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
++ (HASH_SIZE - 1))
++
++static int ip6ip6_fb_tnl_dev_init(struct net_device *dev);
++static int ip6ip6_tnl_dev_init(struct net_device *dev);
++static void ip6ip6_tnl_dev_setup(struct net_device *dev);
++
++/* the IPv6 tunnel fallback device */
++static struct net_device *ip6ip6_fb_tnl_dev;
++
++
++/* lists for storing tunnels in use */
++static struct ip6_tnl *tnls_r_l[HASH_SIZE];
++static struct ip6_tnl *tnls_wc[1];
++static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l };
++
++/* lock for the tunnel lists */
++static rwlock_t ip6ip6_lock = RW_LOCK_UNLOCKED;
++
++static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
++{
++ struct dst_entry *dst = t->dst_cache;
++
++ if (dst && dst->obsolete &&
++ dst->ops->check(dst, t->dst_cookie) == NULL) {
++ t->dst_cache = NULL;
++ return NULL;
++ }
++
++ return dst;
++}
++
++static inline void ip6_tnl_dst_reset(struct ip6_tnl *t)
++{
++ dst_release(t->dst_cache);
++ t->dst_cache = NULL;
++}
++
++static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
++{
++ struct rt6_info *rt = (struct rt6_info *) dst;
++ t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
++ dst_release(t->dst_cache);
++ t->dst_cache = dst;
++}
++
++/**
++ * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses
++ * @remote: the address of the tunnel exit-point
++ * @local: the address of the tunnel entry-point
++ *
++ * Return:
++ * tunnel matching given end-points if found,
++ * else fallback tunnel if its device is up,
++ * else %NULL
++ **/
++
++static struct ip6_tnl *
++ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local)
++{
++ unsigned h0 = HASH(remote);
++ unsigned h1 = HASH(local);
++ struct ip6_tnl *t;
++
++ for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) {
++ if (!ipv6_addr_cmp(local, &t->parms.laddr) &&
++ !ipv6_addr_cmp(remote, &t->parms.raddr) &&
++ (t->dev->flags & IFF_UP))
++ return t;
++ }
++ if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP))
++ return t;
++
++ return NULL;
++}
++
++/**
++ * ip6ip6_bucket - get head of list matching given tunnel parameters
++ * @p: parameters containing tunnel end-points
++ *
++ * Description:
++ * ip6ip6_bucket() returns the head of the list matching the
++ * &struct in6_addr entries laddr and raddr in @p.
++ *
++ * Return: head of IPv6 tunnel list
++ **/
++
++static struct ip6_tnl **
++ip6ip6_bucket(struct ip6_tnl_parm *p)
++{
++ struct in6_addr *remote = &p->raddr;
++ struct in6_addr *local = &p->laddr;
++ unsigned h = 0;
++ int prio = 0;
++
++ if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
++ prio = 1;
++ h = HASH(remote) ^ HASH(local);
++ }
++ return &tnls[prio][h];
++}
++
++/**
++ * ip6ip6_tnl_link - add tunnel to hash table
++ * @t: tunnel to be added
++ **/
++
++static void
++ip6ip6_tnl_link(struct ip6_tnl *t)
++{
++ struct ip6_tnl **tp = ip6ip6_bucket(&t->parms);
++
++ write_lock_bh(&ip6ip6_lock);
++ t->next = *tp;
++ write_unlock_bh(&ip6ip6_lock);
++ *tp = t;
++}
++
++/**
++ * ip6ip6_tnl_unlink - remove tunnel from hash table
++ * @t: tunnel to be removed
++ **/
++
++static void
++ip6ip6_tnl_unlink(struct ip6_tnl *t)
++{
++ struct ip6_tnl **tp;
++
++ for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) {
++ if (t == *tp) {
++ write_lock_bh(&ip6ip6_lock);
++ *tp = t->next;
++ write_unlock_bh(&ip6ip6_lock);
++ break;
++ }
++ }
++}
++
++/**
++ * ip6_tnl_create() - create a new tunnel
++ * @p: tunnel parameters
++ * @pt: pointer to new tunnel
++ *
++ * Description:
++ * Create tunnel matching given parameters.
++ *
++ * Return:
++ * 0 on success
++ **/
++
++static int
++ip6_tnl_create(struct ip6_tnl_parm *p, struct ip6_tnl **pt)
++{
++ struct net_device *dev;
++ struct ip6_tnl *t;
++ char name[IFNAMSIZ];
++ int err;
++
++ if (p->name[0]) {
++ memcpy(name, p->name, IFNAMSIZ - 1);
++ name[IFNAMSIZ - 1] = 0;
++ } else {
++ int i;
++ for (i = 1; i < IP6_TNL_MAX; i++) {
++ sprintf(name, "ip6tnl%d", i);
++ if (__dev_get_by_name(name) == NULL)
++ break;
++ }
++ if (i == IP6_TNL_MAX)
++ return -ENOBUFS;
++ }
++ dev = alloc_netdev(sizeof (*t), name, ip6ip6_tnl_dev_setup);
++ if (dev == NULL)
++ return -ENOMEM;
++
++ t = dev->priv;
++ dev->init = ip6ip6_tnl_dev_init;
++ t->parms = *p;
++
++ if ((err = register_netdevice(dev)) < 0) {
++ kfree(dev);
++ return err;
++ }
++ dev_hold(dev);
++
++ ip6ip6_tnl_link(t);
++ *pt = t;
++ return 0;
++}
++
++/**
++ * ip6ip6_tnl_locate - find or create tunnel matching given parameters
++ * @p: tunnel parameters
++ * @create: != 0 if allowed to create new tunnel if no match found
++ *
++ * Description:
++ * ip6ip6_tnl_locate() first tries to locate an existing tunnel
++ * based on @p. If this is unsuccessful but @create is set, a new
++ * tunnel device is created and registered for use.
++ *
++ * Return:
++ * 0 if tunnel located or created,
++ * -EINVAL if parameters incorrect,
++ * -ENODEV if no matching tunnel available
++ **/
++
++static int
++ip6ip6_tnl_locate(struct ip6_tnl_parm *p, struct ip6_tnl **pt, int create)
++{
++ struct in6_addr *remote = &p->raddr;
++ struct in6_addr *local = &p->laddr;
++ struct ip6_tnl *t;
++
++ if (p->proto != IPPROTO_IPV6)
++ return -EINVAL;
++
++ for (t = *ip6ip6_bucket(p); t; t = t->next) {
++ if (!ipv6_addr_cmp(local, &t->parms.laddr) &&
++ !ipv6_addr_cmp(remote, &t->parms.raddr)) {
++ *pt = t;
++ return (create ? -EEXIST : 0);
++ }
++ }
++ if (!create)
++ return -ENODEV;
++
++ return ip6_tnl_create(p, pt);
++}
++
++/**
++ * ip6ip6_tnl_dev_uninit - tunnel device uninitializer
++ * @dev: the device to be destroyed
++ *
++ * Description:
++ * ip6ip6_tnl_dev_uninit() removes tunnel from its list
++ **/
++
++static void
++ip6ip6_tnl_dev_uninit(struct net_device *dev)
++{
++ struct ip6_tnl *t = dev->priv;
++
++ if (dev == ip6ip6_fb_tnl_dev) {
++ write_lock_bh(&ip6ip6_lock);
++ tnls_wc[0] = NULL;
++ write_unlock_bh(&ip6ip6_lock);
++ } else {
++ ip6ip6_tnl_unlink(t);
++ }
++ ip6_tnl_dst_reset(t);
++ dev_put(dev);
++}
++
++/**
++ * parse_tlv_tnl_enc_lim - handle encapsulation limit option
++ * @skb: received socket buffer
++ *
++ * Return:
++ * 0 if none was found,
++ * else index to encapsulation limit
++ **/
++
++static __u16
++parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
++{
++ struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw;
++ __u8 nexthdr = ipv6h->nexthdr;
++ __u16 off = sizeof (*ipv6h);
++
++ while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
++ __u16 optlen = 0;
++ struct ipv6_opt_hdr *hdr;
++ if (raw + off + sizeof (*hdr) > skb->data &&
++ !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr)))
++ break;
++
++ hdr = (struct ipv6_opt_hdr *) (raw + off);
++ if (nexthdr == NEXTHDR_FRAGMENT) {
++ struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
++ if (frag_hdr->frag_off)
++ break;
++ optlen = 8;
++ } else if (nexthdr == NEXTHDR_AUTH) {
++ optlen = (hdr->hdrlen + 2) << 2;
++ } else {
++ optlen = ipv6_optlen(hdr);
++ }
++ if (nexthdr == NEXTHDR_DEST) {
++ __u16 i = off + 2;
++ while (1) {
++ struct ipv6_tlv_tnl_enc_lim *tel;
++
++ /* No more room for encapsulation limit */
++ if (i + sizeof (*tel) > off + optlen)
++ break;
++
++ tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i];
++ /* return index of option if found and valid */
++ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
++ tel->length == 1)
++ return i;
++ /* else jump to next option */
++ if (tel->type)
++ i += tel->length + 2;
++ else
++ i++;
++ }
++ }
++ nexthdr = hdr->nexthdr;
++ off += optlen;
++ }
++ return 0;
++}
++
++/**
++ * ip6ip6_err - tunnel error handler
++ *
++ * Description:
++ * ip6ip6_err() should handle errors in the tunnel according
++ * to the specifications in RFC 2473.
++ **/
++
++static void
++ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info)
++{
++ struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data;
++ struct ip6_tnl *t;
++ int rel_msg = 0;
++ int rel_type = ICMPV6_DEST_UNREACH;
++ int rel_code = ICMPV6_ADDR_UNREACH;
++ __u32 rel_info = 0;
++ __u16 len;
++
++ /* If the packet doesn't contain the original IPv6 header we are
++ in trouble since we might need the source address for further
++ processing of the error. */
++
++ read_lock(&ip6ip6_lock);
++ if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL)
++ goto out;
++
++ switch (type) {
++ __u32 teli;
++ struct ipv6_tlv_tnl_enc_lim *tel;
++ __u32 mtu;
++ case ICMPV6_DEST_UNREACH:
++ if (net_ratelimit())
++ printk(KERN_WARNING
++ "%s: Path to destination invalid "
++ "or inactive!\n", t->parms.name);
++ rel_msg = 1;
++ break;
++ case ICMPV6_TIME_EXCEED:
++ if (code == ICMPV6_EXC_HOPLIMIT) {
++ if (net_ratelimit())
++ printk(KERN_WARNING
++ "%s: Too small hop limit or "
++ "routing loop in tunnel!\n",
++ t->parms.name);
++ rel_msg = 1;
++ }
++ break;
++ case ICMPV6_PARAMPROB:
++ /* ignore if parameter problem not caused by a tunnel
++ encapsulation limit sub-option */
++ if (code != ICMPV6_HDR_FIELD) {
++ break;
++ }
++ teli = parse_tlv_tnl_enc_lim(skb, skb->data);
++
++ if (teli && teli == ntohl(info) - 2) {
++ tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
++ if (tel->encap_limit == 0) {
++ if (net_ratelimit())
++ printk(KERN_WARNING
++ "%s: Too small encapsulation "
++ "limit or routing loop in "
++ "tunnel!\n", t->parms.name);
++ rel_msg = 1;
++ }
++ }
++ break;
++ case ICMPV6_PKT_TOOBIG:
++ mtu = ntohl(info) - offset;
++ if (mtu < IPV6_MIN_MTU)
++ mtu = IPV6_MIN_MTU;
++ t->dev->mtu = mtu;
++
++ if ((len = sizeof (*ipv6h) + ipv6h->payload_len) > mtu) {
++ rel_type = ICMPV6_PKT_TOOBIG;
++ rel_code = 0;
++ rel_info = mtu;
++ rel_msg = 1;
++ }
++ break;
++ }
++ if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) {
++ struct rt6_info *rt;
++ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
++ if (!skb2)
++ goto out;
++
++ dst_release(skb2->dst);
++ skb2->dst = NULL;
++ skb_pull(skb2, offset);
++ skb2->nh.raw = skb2->data;
++
++ /* Try to guess incoming interface */
++ rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0);
++
++ if (rt && rt->rt6i_dev)
++ skb2->dev = rt->rt6i_dev;
++
++ icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev);
++
++ if (rt)
++ dst_release(&rt->u.dst);
++
++ kfree_skb(skb2);
++ }
++out:
++ read_unlock(&ip6ip6_lock);
++}
++
++/**
++ * ip6ip6_rcv - decapsulate IPv6 packet and retransmit it locally
++ * @skb: received socket buffer
++ *
++ * Return: 0
++ **/
++
++static int
++ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
++{
++ struct sk_buff *skb = *pskb;
++ struct ipv6hdr *ipv6h;
++ struct ip6_tnl *t;
++
++ if (!pskb_may_pull(skb, sizeof (*ipv6h)))
++ goto discard;
++
++ ipv6h = skb->nh.ipv6h;
++
++ read_lock(&ip6ip6_lock);
++
++ if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) {
++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return 0;
++ }
++
++ if (!(t->parms.flags & IP6_TNL_F_CAP_RCV)) {
++ t->stat.rx_dropped++;
++ read_unlock(&ip6ip6_lock);
++ goto discard;
++ }
++ secpath_reset(skb);
++ skb->mac.raw = skb->nh.raw;
++ skb->nh.raw = skb->data;
++ skb->protocol = htons(ETH_P_IPV6);
++ skb->pkt_type = PACKET_HOST;
++ memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
++ skb->dev = t->dev;
++ dst_release(skb->dst);
++ skb->dst = NULL;
++ t->stat.rx_packets++;
++ t->stat.rx_bytes += skb->len;
++ netif_rx(skb);
++ read_unlock(&ip6ip6_lock);
++ return 0;
++ }
++ read_unlock(&ip6ip6_lock);
++ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
++discard:
++ return 1;
++}
++
++static inline struct ipv6_txoptions *create_tel(__u8 encap_limit)
++{
++ struct ipv6_tlv_tnl_enc_lim *tel;
++ struct ipv6_txoptions *opt;
++ __u8 *raw;
++
++ int opt_len = sizeof(*opt) + 8;
++
++ if (!(opt = kmalloc(opt_len, GFP_ATOMIC))) {
++ return NULL;
++ }
++ memset(opt, 0, opt_len);
++ opt->tot_len = opt_len;
++ opt->dst0opt = (struct ipv6_opt_hdr *) (opt + 1);
++ opt->opt_nflen = 8;
++
++ tel = (struct ipv6_tlv_tnl_enc_lim *) (opt->dst0opt + 1);
++ tel->type = IPV6_TLV_TNL_ENCAP_LIMIT;
++ tel->length = 1;
++ tel->encap_limit = encap_limit;
++
++ raw = (__u8 *) opt->dst0opt;
++ raw[5] = IPV6_TLV_PADN;
++ raw[6] = 1;
++
++ return opt;
++}
++
++/**
++ * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
++ * @t: the outgoing tunnel device
++ * @hdr: IPv6 header from the incoming packet
++ *
++ * Description:
++ * Avoid trivial tunneling loop by checking that tunnel exit-point
++ * doesn't match source of incoming packet.
++ *
++ * Return:
++ * 1 if conflict,
++ * 0 else
++ **/
++
++static inline int
++ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr)
++{
++ return !ipv6_addr_cmp(&t->parms.raddr, &hdr->saddr);
++}
++
++/**
++ * ip6ip6_tnl_xmit - encapsulate packet and send
++ * @skb: the outgoing socket buffer
++ * @dev: the outgoing tunnel device
++ *
++ * Description:
++ * Build new header and do some sanity checks on the packet before sending
++ * it.
++ *
++ * Return:
++ * 0
++ **/
++
++static int
++ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct ip6_tnl *t = (struct ip6_tnl *) dev->priv;
++ struct net_device_stats *stats = &t->stat;
++ struct ipv6hdr *ipv6h = skb->nh.ipv6h;
++ struct ipv6_txoptions *opt = NULL;
++ int encap_limit = -1;
++ __u16 offset;
++ struct flowi fl;
++ struct dst_entry *dst;
++ struct net_device *tdev;
++ int mtu;
++ int max_headroom = sizeof(struct ipv6hdr);
++ u8 proto;
++ int err;
++ int pkt_len;
++
++ if (t->recursion++) {
++ stats->collisions++;
++ goto tx_err;
++ }
++ if (skb->protocol != htons(ETH_P_IPV6) ||
++ !(t->parms.flags & IP6_TNL_F_CAP_XMIT) ||
++ ip6ip6_tnl_addr_conflict(t, ipv6h)) {
++ goto tx_err;
++ }
++ if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) {
++ struct ipv6_tlv_tnl_enc_lim *tel;
++ tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset];
++ if (tel->encap_limit == 0) {
++ icmpv6_send(skb, ICMPV6_PARAMPROB,
++ ICMPV6_HDR_FIELD, offset + 2, skb->dev);
++ goto tx_err;
++ }
++ encap_limit = tel->encap_limit - 1;
++ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
++ encap_limit = t->parms.encap_limit;
++ }
++ memcpy(&fl, &t->fl, sizeof (fl));
++ proto = fl.proto;
++
++ if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
++ fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_TCLASS_MASK);
++ if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL))
++ fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_FLOWLABEL_MASK);
++
++ if (encap_limit >= 0 && (opt = create_tel(encap_limit)) == NULL)
++ goto tx_err;
++
++ if ((dst = ip6_tnl_dst_check(t)) != NULL)
++ dst_hold(dst);
++ else
++ dst = ip6_route_output(NULL, &fl);
++
++ if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0)
++ goto tx_err_link_failure;
++
++ tdev = dst->dev;
++
++ if (tdev == dev) {
++ stats->collisions++;
++ if (net_ratelimit())
++ printk(KERN_WARNING
++ "%s: Local routing loop detected!\n",
++ t->parms.name);
++ goto tx_err_dst_release;
++ }
++ mtu = dst_pmtu(dst) - sizeof (*ipv6h);
++ if (opt) {
++ max_headroom += 8;
++ mtu -= 8;
++ }
++ if (mtu < IPV6_MIN_MTU)
++ mtu = IPV6_MIN_MTU;
++ if (skb->dst && mtu < dst_pmtu(skb->dst)) {
++ struct rt6_info *rt = (struct rt6_info *) skb->dst;
++ rt->rt6i_flags |= RTF_MODIFIED;
++ rt->u.dst.metrics[RTAX_MTU-1] = mtu;
++ }
++ if (skb->len > mtu) {
++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
++ goto tx_err_dst_release;
++ }
++
++ /*
++ * Okay, now see if we can stuff it in the buffer as-is.
++ */
++ max_headroom += LL_RESERVED_SPACE(tdev);
++
++ if (skb_headroom(skb) < max_headroom ||
++ skb_cloned(skb) || skb_shared(skb)) {
++ struct sk_buff *new_skb;
++
++ if (!(new_skb = skb_realloc_headroom(skb, max_headroom)))
++ goto tx_err_dst_release;
++
++ if (skb->sk)
++ skb_set_owner_w(new_skb, skb->sk);
++ kfree_skb(skb);
++ skb = new_skb;
++ }
++ dst_release(skb->dst);
++ skb->dst = dst_clone(dst);
++
++ skb->h.raw = skb->nh.raw;
++
++ if (opt)
++ ipv6_push_nfrag_opts(skb, opt, &proto, NULL);
++
++ skb->nh.raw = skb_push(skb, sizeof(struct ipv6hdr));
++ ipv6h = skb->nh.ipv6h;
++ *(u32*)ipv6h = fl.fl6_flowlabel | htonl(0x60000000);
++ ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++ ipv6h->hop_limit = t->parms.hop_limit;
++ ipv6h->nexthdr = proto;
++ ipv6_addr_copy(&ipv6h->saddr, &fl.fl6_src);
++ ipv6_addr_copy(&ipv6h->daddr, &fl.fl6_dst);
++ nf_reset(skb);
++ pkt_len = skb->len;
++ err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL,
++ skb->dst->dev, dst_output);
++
++ if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) {
++ stats->tx_bytes += pkt_len;
++ stats->tx_packets++;
++ } else {
++ stats->tx_errors++;
++ stats->tx_aborted_errors++;
++ }
++ ip6_tnl_dst_store(t, dst);
++
++ if (opt)
++ kfree(opt);
++
++ t->recursion--;
++ return 0;
++tx_err_link_failure:
++ stats->tx_carrier_errors++;
++ dst_link_failure(skb);
++tx_err_dst_release:
++ dst_release(dst);
++ if (opt)
++ kfree(opt);
++tx_err:
++ stats->tx_errors++;
++ stats->tx_dropped++;
++ kfree_skb(skb);
++ t->recursion--;
++ return 0;
++}
++
++static void ip6_tnl_set_cap(struct ip6_tnl *t)
++{
++ struct ip6_tnl_parm *p = &t->parms;
++ struct in6_addr *laddr = &p->laddr;
++ struct in6_addr *raddr = &p->raddr;
++ int ltype = ipv6_addr_type(laddr);
++ int rtype = ipv6_addr_type(raddr);
++
++ p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV);
++
++ if (ltype != IPV6_ADDR_ANY && rtype != IPV6_ADDR_ANY &&
++ ((ltype|rtype) &
++ (IPV6_ADDR_UNICAST|
++ IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL|
++ IPV6_ADDR_MAPPED|IPV6_ADDR_RESERVED)) == IPV6_ADDR_UNICAST) {
++ struct net_device *ldev = NULL;
++ int l_ok = 1;
++ int r_ok = 1;
++
++ if (p->link)
++ ldev = dev_get_by_index(p->link);
++
++ if ((ltype&IPV6_ADDR_UNICAST) && !ipv6_chk_addr(laddr, ldev))
++ l_ok = 0;
++
++ if ((rtype&IPV6_ADDR_UNICAST) && ipv6_chk_addr(raddr, NULL))
++ r_ok = 0;
++
++ if (l_ok && r_ok) {
++ if (ltype&IPV6_ADDR_UNICAST)
++ p->flags |= IP6_TNL_F_CAP_XMIT;
++ if (rtype&IPV6_ADDR_UNICAST)
++ p->flags |= IP6_TNL_F_CAP_RCV;
++ }
++ if (ldev)
++ dev_put(ldev);
++ }
++}
++
++static void ip6ip6_tnl_link_config(struct ip6_tnl *t)
++{
++ struct net_device *dev = t->dev;
++ struct ip6_tnl_parm *p = &t->parms;
++ struct flowi *fl = &t->fl;
++
++ ipv6_addr_copy(&fl->fl6_src, &p->laddr);
++ ipv6_addr_copy(&fl->fl6_dst, &p->raddr);
++ fl->oif = p->link;
++ fl->fl6_flowlabel = 0;
++
++ if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
++ fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
++ if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
++ fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
++
++ ip6_tnl_set_cap(t);
++
++ if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
++ dev->flags |= IFF_POINTOPOINT;
++ else
++ dev->flags &= ~IFF_POINTOPOINT;
++
++ dev->iflink = p->link;
++
++ if (p->flags & IP6_TNL_F_CAP_XMIT) {
++ struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr,
++ p->link, 0);
++
++ if (rt == NULL)
++ return;
++
++ if (rt->rt6i_dev) {
++ dev->hard_header_len = rt->rt6i_dev->hard_header_len +
++ sizeof (struct ipv6hdr);
++
++ dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr);
++
++ if (dev->mtu < IPV6_MIN_MTU)
++ dev->mtu = IPV6_MIN_MTU;
++ }
++ dst_release(&rt->u.dst);
++ }
++}
++
++/**
++ * ip6ip6_tnl_change - update the tunnel parameters
++ * @t: tunnel to be changed
++ * @p: tunnel configuration parameters supplied by userspace;
++ * copied into @t before the link is reconfigured
++ *
++ * Description:
++ * ip6ip6_tnl_change() updates the tunnel parameters
++ **/
++
++static int
++ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
++{
++ ipv6_addr_copy(&t->parms.laddr, &p->laddr);
++ ipv6_addr_copy(&t->parms.raddr, &p->raddr);
++ t->parms.flags = p->flags;
++ t->parms.hop_limit = p->hop_limit;
++ t->parms.encap_limit = p->encap_limit;
++ t->parms.flowinfo = p->flowinfo;
++ ip6ip6_tnl_link_config(t);
++ return 0;
++}
++
++/**
++ * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace
++ * @dev: virtual device associated with tunnel
++ * @ifr: parameters passed from userspace
++ * @cmd: command to be performed
++ *
++ * Description:
++ * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels
++ * from userspace.
++ *
++ * The possible commands are the following:
++ * %SIOCGETTUNNEL: get tunnel parameters for device
++ * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
++ * %SIOCCHGTUNNEL: change tunnel parameters to those given
++ * %SIOCDELTUNNEL: delete tunnel
++ *
++ * The fallback device "ip6tnl0", created during module
++ * initialization, can be used for creating other tunnel devices.
++ *
++ * Return:
++ * 0 on success,
++ * %-EFAULT if unable to copy data to or from userspace,
++ * %-EPERM if current process hasn't %CAP_NET_ADMIN set
++ * %-EINVAL if passed tunnel parameters are invalid,
++ * %-EEXIST if changing a tunnel's parameters would cause a conflict
++ * %-ENODEV if attempting to change or delete a nonexisting device
++ **/
++
++static int
++ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
++{
++ int err = 0;
++ int create;
++ struct ip6_tnl_parm p;
++ struct ip6_tnl *t = NULL;
++
++ switch (cmd) {
++ case SIOCGETTUNNEL:
++ if (dev == ip6ip6_fb_tnl_dev) {
++ if (copy_from_user(&p,
++ ifr->ifr_ifru.ifru_data,
++ sizeof (p))) {
++ err = -EFAULT;
++ break;
++ }
++ if ((err = ip6ip6_tnl_locate(&p, &t, 0)) == -ENODEV)
++ t = (struct ip6_tnl *) dev->priv;
++ else if (err)
++ break;
++ } else
++ t = (struct ip6_tnl *) dev->priv;
++
++ memcpy(&p, &t->parms, sizeof (p));
++ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) {
++ err = -EFAULT;
++ }
++ break;
++ case SIOCADDTUNNEL:
++ case SIOCCHGTUNNEL:
++ err = -EPERM;
++ create = (cmd == SIOCADDTUNNEL);
++ if (!capable(CAP_NET_ADMIN))
++ break;
++ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) {
++ err = -EFAULT;
++ break;
++ }
++ if (!create && dev != ip6ip6_fb_tnl_dev) {
++ t = (struct ip6_tnl *) dev->priv;
++ }
++ if (!t && (err = ip6ip6_tnl_locate(&p, &t, create))) {
++ break;
++ }
++ if (cmd == SIOCCHGTUNNEL) {
++ if (t->dev != dev) {
++ err = -EEXIST;
++ break;
++ }
++ ip6ip6_tnl_unlink(t);
++ err = ip6ip6_tnl_change(t, &p);
++ ip6ip6_tnl_link(t);
++ netdev_state_change(dev);
++ }
++ if (copy_to_user(ifr->ifr_ifru.ifru_data,
++ &t->parms, sizeof (p))) {
++ err = -EFAULT;
++ } else {
++ err = 0;
++ }
++ break;
++ case SIOCDELTUNNEL:
++ err = -EPERM;
++ if (!capable(CAP_NET_ADMIN))
++ break;
++
++ if (dev == ip6ip6_fb_tnl_dev) {
++ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
++ sizeof (p))) {
++ err = -EFAULT;
++ break;
++ }
++ err = ip6ip6_tnl_locate(&p, &t, 0);
++ if (err)
++ break;
++ if (t == ip6ip6_fb_tnl_dev->priv) {
++ err = -EPERM;
++ break;
++ }
++ } else {
++ t = (struct ip6_tnl *) dev->priv;
++ }
++ err = unregister_netdevice(t->dev);
++ break;
++ default:
++ err = -EINVAL;
++ }
++ return err;
++}
++
++/**
++ * ip6ip6_tnl_get_stats - return the stats for tunnel device
++ * @dev: virtual device associated with tunnel
++ *
++ * Return: stats for device
++ **/
++
++static struct net_device_stats *
++ip6ip6_tnl_get_stats(struct net_device *dev)
++{
++ return &(((struct ip6_tnl *) dev->priv)->stat);
++}
++
++/**
++ * ip6ip6_tnl_change_mtu - change mtu manually for tunnel device
++ * @dev: virtual device associated with tunnel
++ * @new_mtu: the new mtu
++ *
++ * Return:
++ * 0 on success,
++ * %-EINVAL if mtu too small
++ **/
++
++static int
++ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
++{
++ if (new_mtu < IPV6_MIN_MTU) {
++ return -EINVAL;
++ }
++ dev->mtu = new_mtu;
++ return 0;
++}
++
++/**
++ * ip6ip6_tnl_dev_setup - setup virtual tunnel device
++ * @dev: virtual device associated with tunnel
++ *
++ * Description:
++ * Initialize function pointers and device parameters
++ **/
++
++static void ip6ip6_tnl_dev_setup(struct net_device *dev)
++{
++ SET_MODULE_OWNER(dev);
++ dev->uninit = ip6ip6_tnl_dev_uninit;
++ dev->destructor = (void (*)(struct net_device *))kfree;
++ dev->hard_start_xmit = ip6ip6_tnl_xmit;
++ dev->get_stats = ip6ip6_tnl_get_stats;
++ dev->do_ioctl = ip6ip6_tnl_ioctl;
++ dev->change_mtu = ip6ip6_tnl_change_mtu;
++
++ dev->type = ARPHRD_TUNNEL6;
++ dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
++ dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
++ dev->flags |= IFF_NOARP;
++ /* Hmm... MAX_ADDR_LEN is 8, so the ipv6 addresses can't be
++ copied to dev->dev_addr and dev->broadcast, like the ipv4
++ addresses were in ipip.c, ip_gre.c and sit.c. */
++ dev->addr_len = 0;
++}
++
++
++/**
++ * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices
++ * @dev: virtual device associated with tunnel
++ **/
++
++static inline void
++ip6ip6_tnl_dev_init_gen(struct net_device *dev)
++{
++ struct ip6_tnl *t = (struct ip6_tnl *) dev->priv;
++ t->fl.proto = IPPROTO_IPV6;
++ t->dev = dev;
++ strcpy(t->parms.name, dev->name);
++}
++
++/**
++ * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices
++ * @dev: virtual device associated with tunnel
++ **/
++
++static int
++ip6ip6_tnl_dev_init(struct net_device *dev)
++{
++ struct ip6_tnl *t = (struct ip6_tnl *) dev->priv;
++ ip6ip6_tnl_dev_init_gen(dev);
++ ip6ip6_tnl_link_config(t);
++ return 0;
++}
++
++/**
++ * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device
++ * @dev: fallback device
++ *
++ * Return: 0
++ **/
++
++static int
++ip6ip6_fb_tnl_dev_init(struct net_device *dev)
++{
++ struct ip6_tnl *t = dev->priv;
++ ip6ip6_tnl_dev_init_gen(dev);
++ dev_hold(dev);
++ tnls_wc[0] = t;
++ return 0;
++}
++
++static struct xfrm6_tunnel ip6ip6_handler = {
++ .handler = ip6ip6_rcv,
++ .err_handler = ip6ip6_err,
++};
++
++/**
++ * ip6_tunnel_init - register protocol and reserve needed resources
++ *
++ * Return: 0 on success
++ **/
++
++int __init ip6_tunnel_init(void)
++{
++ int err;
++
++ if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) {
++ printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
++ return -EAGAIN;
++ }
++ ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
++ ip6ip6_tnl_dev_setup);
++
++ if (!ip6ip6_fb_tnl_dev) {
++ err = -ENOMEM;
++ goto fail;
++ }
++ ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init;
++
++ if ((err = register_netdev(ip6ip6_fb_tnl_dev))) {
++ kfree(ip6ip6_fb_tnl_dev);
++ goto fail;
++ }
++ return 0;
++fail:
++ xfrm6_tunnel_deregister(&ip6ip6_handler);
++ return err;
++}
++
++/**
++ * ip6_tunnel_cleanup - free resources and unregister protocol
++ **/
++
++void ip6_tunnel_cleanup(void)
++{
++ if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0)
++ printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
++
++ unregister_netdev(ip6ip6_fb_tnl_dev);
++}
++
++#ifdef MODULE
++module_init(ip6_tunnel_init);
++module_exit(ip6_tunnel_cleanup);
++#endif
+diff -Nru a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/ipcomp6.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,376 @@
++/*
++ * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
++ *
++ * Copyright (C)2003 USAGI/WIDE Project
++ *
++ * Author Mitsuru KANDA <mk at linux-ipv6.org>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++/*
++ * [Memo]
++ *
++ * Outbound:
++ * The compression of IP datagram MUST be done before AH/ESP processing,
++ * fragmentation, and the addition of Hop-by-Hop/Routing header.
++ *
++ * Inbound:
++ * The decompression of IP datagram MUST be done after the reassembly,
++ * AH/ESP processing.
++ */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/ipcomp.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++#include <linux/pfkeyv2.h>
++#include <linux/random.h>
++#include <net/icmp.h>
++#include <net/ipv6.h>
++#include <linux/ipv6.h>
++#include <linux/icmpv6.h>
++
++static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ int err = 0;
++ u8 nexthdr = 0;
++ int hdr_len = skb->h.raw - skb->nh.raw;
++ unsigned char *tmp_hdr = NULL;
++ struct ipv6hdr *iph;
++ int plen, dlen;
++ struct ipcomp_data *ipcd = x->data;
++ u8 *start, *scratch = ipcd->scratch;
++
++ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
++ skb_linearize(skb, GFP_ATOMIC) != 0) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ /* Remove ipcomp header and decompress original payload */
++ iph = skb->nh.ipv6h;
++ tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC);
++ if (!tmp_hdr)
++ goto out;
++ memcpy(tmp_hdr, iph, hdr_len);
++ nexthdr = *(u8 *)skb->data;
++ skb_pull(skb, sizeof(struct ipv6_comp_hdr));
++ skb->nh.raw += sizeof(struct ipv6_comp_hdr);
++ memcpy(skb->nh.raw, tmp_hdr, hdr_len);
++ iph = skb->nh.ipv6h;
++ iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr));
++ skb->h.raw = skb->data;
++
++ /* decompression */
++ plen = skb->len;
++ dlen = IPCOMP_SCRATCH_SIZE;
++ start = skb->data;
++
++ err = crypto_comp_decompress(ipcd->tfm, start, plen, scratch, &dlen);
++ if (err) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (dlen < (plen + sizeof(struct ipv6_comp_hdr))) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
++ if (err) {
++ goto out;
++ }
++
++ skb_put(skb, dlen - plen);
++ memcpy(skb->data, scratch, dlen);
++
++ iph = skb->nh.ipv6h;
++ iph->payload_len = htons(skb->len);
++
++out:
++ if (tmp_hdr)
++ kfree(tmp_hdr);
++ if (err)
++ goto error_out;
++ return nexthdr;
++error_out:
++ return err;
++}
++
++static int ipcomp6_output(struct sk_buff *skb)
++{
++ int err;
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct ipv6hdr *top_iph;
++ int hdr_len;
++ struct ipv6_comp_hdr *ipch;
++ struct ipcomp_data *ipcd = x->data;
++ int plen, dlen;
++ u8 *start, *scratch = ipcd->scratch;
++
++ hdr_len = skb->h.raw - skb->data;
++
++ /* check whether datagram len is larger than threshold */
++ if ((skb->len - hdr_len) < ipcd->threshold) {
++ goto out_ok;
++ }
++
++ if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
++ skb_linearize(skb, GFP_ATOMIC) != 0) {
++ err = -ENOMEM;
++ goto error;
++ }
++
++ /* compression */
++ plen = skb->len - hdr_len;
++ dlen = IPCOMP_SCRATCH_SIZE;
++ start = skb->h.raw;
++
++ err = crypto_comp_compress(ipcd->tfm, start, plen, scratch, &dlen);
++ if (err) {
++ goto error;
++ }
++ if ((dlen + sizeof(struct ipv6_comp_hdr)) >= plen) {
++ goto out_ok;
++ }
++ memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
++ pskb_trim(skb, hdr_len + dlen + sizeof(struct ip_comp_hdr));
++
++ /* insert ipcomp header and replace datagram */
++ top_iph = (struct ipv6hdr *)skb->data;
++
++ top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++
++ ipch = (struct ipv6_comp_hdr *)start;
++ ipch->nexthdr = *skb->nh.raw;
++ ipch->flags = 0;
++ ipch->cpi = htons((u16 )ntohl(x->id.spi));
++ *skb->nh.raw = IPPROTO_COMP;
++
++out_ok:
++ err = 0;
++
++error:
++ return err;
++}
++
++static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info)
++{
++ u32 spi;
++ struct ipv6hdr *iph = (struct ipv6hdr*)skb->data;
++ struct ipv6_comp_hdr *ipcomph = (struct ipv6_comp_hdr*)(skb->data+offset);
++ struct xfrm_state *x;
++
++ if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG)
++ return;
++
++ spi = ntohl(ntohs(ipcomph->cpi));
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6);
++ if (!x)
++ return;
++
++ printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/"
++ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
++ spi, NIP6(iph->daddr));
++ xfrm_state_put(x);
++}
++
++static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
++{
++ struct xfrm_state *t = NULL;
++
++ t = xfrm_state_alloc();
++ if (!t)
++ goto out;
++
++ t->id.proto = IPPROTO_IPV6;
++ t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr);
++ memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
++ memcpy(&t->sel, &x->sel, sizeof(t->sel));
++ t->props.family = AF_INET6;
++ t->props.mode = 1;
++ memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
++
++ t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family);
++ if (t->type == NULL)
++ goto error;
++
++ if (t->type->init_state(t, NULL))
++ goto error;
++
++ t->km.state = XFRM_STATE_VALID;
++ atomic_set(&t->tunnel_users, 1);
++
++out:
++ return t;
++
++error:
++ xfrm_state_put(t);
++ goto out;
++}
++
++static int ipcomp6_tunnel_attach(struct xfrm_state *x)
++{
++ int err = 0;
++ struct xfrm_state *t = NULL;
++ u32 spi;
++
++ spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr);
++ if (spi)
++ t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr,
++ spi, IPPROTO_IPV6, AF_INET6);
++ if (!t) {
++ t = ipcomp6_tunnel_create(x);
++ if (!t) {
++ err = -EINVAL;
++ goto out;
++ }
++ xfrm_state_insert(t);
++ xfrm_state_hold(t);
++ }
++ x->tunnel = t;
++ atomic_inc(&t->tunnel_users);
++
++out:
++ return err;
++}
++
++static void ipcomp6_free_data(struct ipcomp_data *ipcd)
++{
++ if (ipcd->tfm)
++ crypto_free_tfm(ipcd->tfm);
++ if (ipcd->scratch)
++ kfree(ipcd->scratch);
++}
++
++static void ipcomp6_destroy(struct xfrm_state *x)
++{
++ struct ipcomp_data *ipcd = x->data;
++ if (!ipcd)
++ return;
++ xfrm_state_delete_tunnel(x);
++ ipcomp6_free_data(ipcd);
++ kfree(ipcd);
++
++ xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
++}
++
++static int ipcomp6_init_state(struct xfrm_state *x, void *args)
++{
++ int err;
++ struct ipcomp_data *ipcd;
++ struct xfrm_algo_desc *calg_desc;
++
++ err = -EINVAL;
++ if (!x->calg)
++ goto out;
++
++ if (x->encap)
++ goto out;
++
++ err = -ENOMEM;
++ ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
++ if (!ipcd)
++ goto error;
++
++ memset(ipcd, 0, sizeof(*ipcd));
++ x->props.header_len = 0;
++ if (x->props.mode)
++ x->props.header_len += sizeof(struct ipv6hdr);
++
++ ipcd->scratch = kmalloc(IPCOMP_SCRATCH_SIZE, GFP_KERNEL);
++ if (!ipcd->scratch)
++ goto error;
++
++ ipcd->tfm = crypto_alloc_tfm(x->calg->alg_name, 0);
++ if (!ipcd->tfm)
++ goto error;
++
++ if (x->props.mode) {
++ err = ipcomp6_tunnel_attach(x);
++ if (err)
++ goto error;
++ }
++
++ calg_desc = xfrm_calg_get_byname(x->calg->alg_name);
++ BUG_ON(!calg_desc);
++ ipcd->threshold = calg_desc->uinfo.comp.threshold;
++ x->data = ipcd;
++ err = 0;
++out:
++ return err;
++error:
++ if (ipcd) {
++ ipcomp6_free_data(ipcd);
++ kfree(ipcd);
++ }
++
++ goto out;
++}
++
++static struct xfrm_type ipcomp6_type =
++{
++ .description = "IPCOMP6",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_COMP,
++ .init_state = ipcomp6_init_state,
++ .destructor = ipcomp6_destroy,
++ .input = ipcomp6_input,
++ .output = ipcomp6_output,
++};
++
++static struct inet6_protocol ipcomp6_protocol =
++{
++ .handler = xfrm6_rcv,
++ .err_handler = ipcomp6_err,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
++static int __init ipcomp6_init(void)
++{
++ if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) {
++ printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
++ printk(KERN_INFO "ipcomp6 init: can't add protocol\n");
++ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++static void __exit ipcomp6_fini(void)
++{
++ if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0)
++ printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n");
++ if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
++ printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n");
++}
++
++module_init(ipcomp6_init);
++module_exit(ipcomp6_fini);
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173");
++MODULE_AUTHOR("Mitsuru KANDA <mk at linux-ipv6.org>");
++
++
+diff -Nru a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
+--- a/net/ipv6/ipv6_sockglue.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ipv6_sockglue.c 2005-02-13 21:25:09 +11:00
+@@ -51,6 +51,7 @@
+ #include <net/inet_common.h>
+ #include <net/tcp.h>
+ #include <net/udp.h>
++#include <net/xfrm.h>
+
+ #include <asm/uaccess.h>
+
+@@ -517,6 +518,10 @@
+ case IPV6_FLOWLABEL_MGR:
+ retv = ipv6_flowlabel_opt(sk, optval, optlen);
+ break;
++ case IPV6_IPSEC_POLICY:
++ case IPV6_XFRM_POLICY:
++ retv = xfrm_user_policy(sk, optname, optval, optlen);
++ break;
+
+ #ifdef CONFIG_NETFILTER
+ default:
+@@ -550,6 +555,15 @@
+ if (get_user(len, optlen))
+ return -EFAULT;
+ switch (optname) {
++ case IPV6_ADDRFORM:
++ if (sk->protocol != IPPROTO_UDP &&
++ sk->protocol != IPPROTO_TCP)
++ return -EINVAL;
++ if (sk->state != TCP_ESTABLISHED)
++ return -ENOTCONN;
++ val = sk->family;
++ break;
++
+ case IPV6_PKTOPTIONS:
+ {
+ struct msghdr msg;
+@@ -595,7 +609,7 @@
+ lock_sock(sk);
+ dst = sk_dst_get(sk);
+ if (dst) {
+- val = dst->pmtu;
++ val = dst_pmtu(dst) - dst->header_len;
+ dst_release(dst);
+ }
+ release_sock(sk);
+diff -Nru a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
+--- a/net/ipv6/ipv6_syms.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ipv6_syms.c 2005-02-13 21:25:09 +11:00
+@@ -6,6 +6,7 @@
+ #include <net/ipv6.h>
+ #include <net/addrconf.h>
+ #include <net/ip6_route.h>
++#include <net/xfrm.h>
+
+ EXPORT_SYMBOL(ipv6_addr_type);
+ EXPORT_SYMBOL(icmpv6_send);
+@@ -33,5 +34,16 @@
+ EXPORT_SYMBOL(ipv6_get_saddr);
+ EXPORT_SYMBOL(ipv6_chk_addr);
+ EXPORT_SYMBOL(in6_dev_finish_destroy);
++EXPORT_SYMBOL(ip6_find_1stfragopt);
++#ifdef CONFIG_XFRM
++EXPORT_SYMBOL(xfrm6_rcv);
++#endif
++EXPORT_SYMBOL(rt6_lookup);
++EXPORT_SYMBOL(fl6_sock_lookup);
++EXPORT_SYMBOL(ipv6_ext_hdr);
++EXPORT_SYMBOL(ip6_append_data);
++EXPORT_SYMBOL(ip6_flush_pending_frames);
++EXPORT_SYMBOL(ip6_push_pending_frames);
++EXPORT_SYMBOL(ipv6_push_nfrag_opts);
+ EXPORT_SYMBOL(ipv6_skip_exthdr);
+
+diff -Nru a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
+--- a/net/ipv6/ndisc.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/ndisc.c 2005-02-13 21:25:09 +11:00
+@@ -72,6 +72,7 @@
+ #include <net/addrconf.h>
+ #include <net/icmp.h>
+
++#include <net/flow.h>
+ #include <net/checksum.h>
+ #include <linux/proc_fs.h>
+
+@@ -139,6 +140,19 @@
+ 30*HZ, 128, 512, 1024,
+ };
+
++/* ND options */
++struct ndisc_options {
++ struct nd_opt_hdr *nd_opt_array[7];
++ struct nd_opt_hdr *nd_opt_piend;
++};
++
++#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
++#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR]
++#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO]
++#define nd_opts_pi_end nd_opt_piend
++#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR]
++#define nd_opts_mtu nd_opt_array[ND_OPT_MTU]
++
+ #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
+
+ static u8 *ndisc_fill_option(u8 *opt, int type, void *data, int data_len)
+@@ -155,8 +169,8 @@
+ return opt + space;
+ }
+
+-struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
+- struct nd_opt_hdr *end)
++static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
++ struct nd_opt_hdr *end)
+ {
+ int type;
+ if (!cur || !end || cur >= end)
+@@ -168,8 +182,8 @@
+ return (cur <= end && cur->nd_opt_type == type ? cur : NULL);
+ }
+
+-struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+- struct ndisc_options *ndopts)
++static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
++ struct ndisc_options *ndopts)
+ {
+ struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
+
+@@ -333,8 +347,6 @@
+ unsigned char ha[MAX_ADDR_LEN];
+ unsigned char *h_dest = NULL;
+
+- skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
+-
+ if (dev->hard_header) {
+ if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) {
+ ndisc_mc_map(daddr, ha, dev, 1);
+@@ -371,11 +383,38 @@
+ * Send a Neighbour Advertisement
+ */
+
++static int ndisc_output(struct sk_buff *skb)
++{
++ if (skb) {
++ struct neighbour *neigh = (skb->dst ? skb->dst->neighbour : NULL);
++ if (ndisc_build_ll_hdr(skb, skb->dev, &skb->nh.ipv6h->daddr, neigh, skb->len) == 0) {
++ kfree_skb(skb);
++ return -EINVAL;
++ }
++ dev_queue_xmit(skb);
++ return 0;
++ }
++ return -EINVAL;
++}
++
++static inline void ndisc_flow_init(struct flowi *fl, u8 type,
++ struct in6_addr *saddr, struct in6_addr *daddr)
++{
++ memset(fl, 0, sizeof(*fl));
++ ipv6_addr_copy(&fl->fl6_src, saddr);
++ ipv6_addr_copy(&fl->fl6_dst, daddr);
++ fl->proto = IPPROTO_ICMPV6;
++ fl->fl_icmp_type = type;
++ fl->fl_icmp_code = 0;
++}
++
+ void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+ struct in6_addr *daddr, struct in6_addr *solicited_addr,
+- int router, int solicited, int override, int inc_opt)
++ int router, int solicited, int override, int inc_opt)
+ {
+- static struct in6_addr tmpaddr;
++ struct flowi fl;
++ struct dst_entry* dst;
++ struct in6_addr tmpaddr;
+ struct inet6_ifaddr *ifp;
+ struct sock *sk = ndisc_socket->sk;
+ struct in6_addr *src_addr;
+@@ -386,6 +425,29 @@
+
+ len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+
++ /* for anycast or proxy, solicited_addr != src_addr */
++ ifp = ipv6_get_ifaddr(solicited_addr, dev);
++ if (ifp) {
++ src_addr = solicited_addr;
++ in6_ifa_put(ifp);
++ } else {
++ if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr, 0))
++ return;
++ src_addr = &tmpaddr;
++ }
++
++ ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr);
++
++ dst = ndisc_dst_alloc(dev, neigh, ndisc_output);
++ if (!dst)
++ return;
++
++ err = xfrm_lookup(&dst, &fl, NULL, 0);
++ if (err < 0) {
++ dst_release(dst);
++ return;
++ }
++
+ if (inc_opt) {
+ if (dev->addr_len)
+ len += NDISC_OPT_SPACE(dev->addr_len);
+@@ -398,27 +460,14 @@
+
+ if (skb == NULL) {
+ ND_PRINTK1("send_na: alloc skb failed\n");
+- return;
+- }
+- /* for anycast or proxy, solicited_addr != src_addr */
+- ifp = ipv6_get_ifaddr(solicited_addr, dev);
+- if (ifp) {
+- src_addr = solicited_addr;
+- in6_ifa_put(ifp);
+- } else {
+- if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr, 0))
+- return;
+- src_addr = &tmpaddr;
+- }
+-
+- if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) {
+- kfree_skb(skb);
++ dst_release(dst);
+ return;
+ }
+
++ skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
+ ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len);
+
+- msg = (struct nd_msg *) skb_put(skb, len);
++ skb->h.raw = (unsigned char*) msg = (struct nd_msg *) skb_put(skb, len);
+
+ msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
+ msg->icmph.icmp6_code = 0;
+@@ -441,7 +490,8 @@
+ csum_partial((__u8 *) msg,
+ len, 0));
+
+- dev_queue_xmit(skb);
++ skb->dst = dst;
++ dst_output(skb);
+
+ ICMP6_INC_STATS(Icmp6OutNeighborAdvertisements);
+ ICMP6_INC_STATS(Icmp6OutMsgs);
+@@ -451,6 +501,8 @@
+ struct in6_addr *solicit,
+ struct in6_addr *daddr, struct in6_addr *saddr)
+ {
++ struct flowi fl;
++ struct dst_entry* dst;
+ struct sock *sk = ndisc_socket->sk;
+ struct sk_buff *skb;
+ struct nd_msg *msg;
+@@ -465,6 +517,18 @@
+ saddr = &addr_buf;
+ }
+
++ ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr);
++
++ dst = ndisc_dst_alloc(dev, neigh, ndisc_output);
++ if (!dst)
++ return;
++
++ err = xfrm_lookup(&dst, &fl, NULL, 0);
++ if (err < 0) {
++ dst_release(dst);
++ return;
++ }
++
+ len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+ send_llinfo = dev->addr_len && ipv6_addr_type(saddr) != IPV6_ADDR_ANY;
+ if (send_llinfo)
+@@ -474,17 +538,14 @@
+ 1, &err);
+ if (skb == NULL) {
+ ND_PRINTK1("send_ns: alloc skb failed\n");
++ dst_release(dst);
+ return;
+ }
+
+- if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) {
+- kfree_skb(skb);
+- return;
+- }
+-
++ skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
+ ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len);
+
+- msg = (struct nd_msg *)skb_put(skb, len);
++ skb->h.raw = (unsigned char*) msg = (struct nd_msg *)skb_put(skb, len);
+ msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION;
+ msg->icmph.icmp6_code = 0;
+ msg->icmph.icmp6_cksum = 0;
+@@ -503,7 +564,8 @@
+ csum_partial((__u8 *) msg,
+ len, 0));
+ /* send it! */
+- dev_queue_xmit(skb);
++ skb->dst = dst;
++ dst_output(skb);
+
+ ICMP6_INC_STATS(Icmp6OutNeighborSolicits);
+ ICMP6_INC_STATS(Icmp6OutMsgs);
+@@ -512,6 +574,8 @@
+ void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
+ struct in6_addr *daddr)
+ {
++ struct flowi fl;
++ struct dst_entry* dst;
+ struct sock *sk = ndisc_socket->sk;
+ struct sk_buff *skb;
+ struct icmp6hdr *hdr;
+@@ -519,6 +583,18 @@
+ int len;
+ int err;
+
++ ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr);
++
++ dst = ndisc_dst_alloc(dev, NULL, ndisc_output);
++ if (!dst)
++ return;
++
++ err = xfrm_lookup(&dst, &fl, NULL, 0);
++ if (err < 0) {
++ dst_release(dst);
++ return;
++ }
++
+ len = sizeof(struct icmp6hdr);
+ if (dev->addr_len)
+ len += NDISC_OPT_SPACE(dev->addr_len);
+@@ -530,14 +606,10 @@
+ return;
+ }
+
+- if (ndisc_build_ll_hdr(skb, dev, daddr, NULL, len) == 0) {
+- kfree_skb(skb);
+- return;
+- }
+-
++ skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
+ ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len);
+
+- hdr = (struct icmp6hdr *) skb_put(skb, len);
++ skb->h.raw = (unsigned char*) hdr = (struct icmp6hdr *) skb_put(skb, len);
+ hdr->icmp6_type = NDISC_ROUTER_SOLICITATION;
+ hdr->icmp6_code = 0;
+ hdr->icmp6_cksum = 0;
+@@ -554,7 +626,8 @@
+ csum_partial((__u8 *) hdr, len, 0));
+
+ /* send it! */
+- dev_queue_xmit(skb);
++ skb->dst = dst;
++ dst_output(skb);
+
+ ICMP6_INC_STATS(Icmp6OutRouterSolicits);
+ ICMP6_INC_STATS(Icmp6OutMsgs);
+@@ -598,7 +671,7 @@
+ }
+ }
+
+-void ndisc_recv_ns(struct sk_buff *skb)
++static void ndisc_recv_ns(struct sk_buff *skb)
+ {
+ struct nd_msg *msg = (struct nd_msg *)skb->h.raw;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+@@ -610,6 +683,7 @@
+ struct net_device *dev = skb->dev;
+ struct inet6_ifaddr *ifp;
+ struct neighbour *neigh;
++ int addr_type = ipv6_addr_type(saddr);
+
+ if (skb->len < sizeof(struct nd_msg)) {
+ if (net_ratelimit())
+@@ -623,6 +697,20 @@
+ return;
+ }
+
++ /*
++ * RFC2461 7.1.1:
++ * DAD has to be destined for solicited node multicast address.
++ */
++ if (addr_type == IPV6_ADDR_ANY &&
++ !(daddr->s6_addr32[0] == htonl(0xff020000) &&
++ daddr->s6_addr32[1] == htonl(0x00000000) &&
++ daddr->s6_addr32[2] == htonl(0x00000001) &&
++ daddr->s6_addr [12] == 0xff )) {
++ if (net_ratelimit())
++ printk(KERN_DEBUG "ICMP6 NS: bad DAD packet (wrong destination\n");
++ return;
++ }
++
+ if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "ICMP NS: invalid ND option, ignored.\n");
+@@ -637,23 +725,20 @@
+ printk(KERN_WARNING "ICMP NS: bad lladdr length.\n");
+ return;
+ }
+- }
+
+- /* XXX: RFC2461 7.1.1:
+- * If the IP source address is the unspecified address, there
+- * MUST NOT be source link-layer address option in the message.
+- *
+- * NOTE! Linux kernel < 2.4.4 broke this rule.
+- */
+-
+- /* XXX: RFC2461 7.1.1:
+- * If the IP source address is the unspecified address, the IP
+- * destination address MUST be a solicited-node multicast address.
+- */
++ /* XXX: RFC2461 7.1.1:
++ * If the IP source address is the unspecified address,
++ * there MUST NOT be source link-layer address option
++ * in the message.
++ */
++ if (addr_type == IPV6_ADDR_ANY) {
++ if (net_ratelimit())
++ printk(KERN_WARNING "ICMP6 NS: bad DAD packet (link-layer address option)\n");
++ return;
++ }
++ }
+
+ if ((ifp = ipv6_get_ifaddr(&msg->target, dev)) != NULL) {
+- int addr_type = ipv6_addr_type(saddr);
+-
+ if (ifp->flags & IFA_F_TENTATIVE) {
+ /* Address is tentative. If the source
+ is unspecified address, it is someone
+@@ -686,8 +771,7 @@
+ ipv6_addr_all_nodes(&maddr);
+ ndisc_send_na(dev, NULL, &maddr, &ifp->addr,
+ ifp->idev->cnf.forwarding, 0,
+- ipv6_addr_type(&ifp->addr)&IPV6_ADDR_ANYCAST ? 0 : 1,
+- 1);
++ 1, 1);
+ in6_ifa_put(ifp);
+ return;
+ }
+@@ -710,8 +794,7 @@
+ if (neigh || !dev->hard_header) {
+ ndisc_send_na(dev, neigh, saddr, &ifp->addr,
+ ifp->idev->cnf.forwarding, 1,
+- ipv6_addr_type(&ifp->addr)&IPV6_ADDR_ANYCAST ? 0 : 1,
+- 1);
++ 1, 1);
+ if (neigh)
+ neigh_release(neigh);
+ }
+@@ -719,7 +802,6 @@
+ in6_ifa_put(ifp);
+ } else if (ipv6_chk_acast_addr(dev, &msg->target)) {
+ struct inet6_dev *idev = in6_dev_get(dev);
+- int addr_type = ipv6_addr_type(saddr);
+
+ /* anycast */
+
+@@ -763,10 +845,10 @@
+ in6_dev_put(idev);
+ } else {
+ struct inet6_dev *in6_dev = in6_dev_get(dev);
+- int addr_type = ipv6_addr_type(saddr);
+
+ if (in6_dev && in6_dev->cnf.forwarding &&
+- (addr_type & IPV6_ADDR_UNICAST) &&
++ (addr_type & IPV6_ADDR_UNICAST ||
++ addr_type == IPV6_ADDR_ANY) &&
+ pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) {
+ int inc = ipv6_addr_type(daddr)&IPV6_ADDR_MULTICAST;
+
+@@ -781,12 +863,20 @@
+ NEIGH_CACHE_STAT_INC(&nd_tbl,
+ rcv_probes_ucast);
+
+- neigh = neigh_event_ns(&nd_tbl, lladdr, saddr, dev);
++ if (addr_type & IPV6_ADDR_UNICAST) {
++ neigh = neigh_event_ns(&nd_tbl, lladdr, saddr, dev);
+
+- if (neigh) {
+- ndisc_send_na(dev, neigh, saddr, &msg->target,
+- 0, 1, 0, 1);
+- neigh_release(neigh);
++ if (neigh) {
++ ndisc_send_na(dev, neigh, saddr, &msg->target,
++ 0, 1, 0, 1);
++ neigh_release(neigh);
++ }
++ } else {
++ /* proxy should also protect against DAD */
++ struct in6_addr maddr;
++ ipv6_addr_all_nodes(&maddr);
++ ndisc_send_na(dev, NULL, &maddr, &msg->target,
++ 0, 0, 0, 1);
+ }
+ } else {
+ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
+@@ -802,7 +892,7 @@
+ return;
+ }
+
+-void ndisc_recv_na(struct sk_buff *skb)
++static void ndisc_recv_na(struct sk_buff *skb)
+ {
+ struct nd_msg *msg = (struct nd_msg *)skb->h.raw;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+@@ -872,12 +962,8 @@
+ */
+ struct rt6_info *rt;
+ rt = rt6_get_dflt_router(saddr, dev);
+- if (rt) {
+- /* It is safe only because
+- we aer in BH */
+- dst_release(&rt->u.dst);
+- ip6_del_rt(rt, NULL);
+- }
++ if (rt)
++ ip6_del_rt(rt, NULL, NULL);
+ }
+ } else {
+ if (msg->icmph.icmp6_router)
+@@ -962,7 +1048,7 @@
+ rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
+
+ if (rt && lifetime == 0) {
+- ip6_del_rt(rt, NULL);
++ ip6_del_rt(rt, NULL, NULL);
+ rt = NULL;
+ }
+
+@@ -1074,7 +1160,7 @@
+ in6_dev->cnf.mtu6 = mtu;
+
+ if (rt)
+- rt->u.dst.pmtu = mtu;
++ rt->u.dst.metrics[RTAX_MTU-1] = mtu;
+
+ rt6_mtu_change(skb->dev, mtu);
+ }
+@@ -1197,27 +1283,44 @@
+ struct in6_addr *addrp;
+ struct net_device *dev;
+ struct rt6_info *rt;
++ struct dst_entry *dst;
++ struct flowi fl;
+ u8 *opt;
+ int rd_len;
+ int err;
+ int hlen;
+
+ dev = skb->dev;
+- rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1);
+
++ if (ipv6_get_lladdr(dev, &saddr_buf)) {
++ ND_PRINTK1("redirect: no link_local addr for dev\n");
++ return;
++ }
++
++ ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr);
++
++ rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1);
+ if (rt == NULL)
+ return;
++ dst = &rt->u.dst;
++
++ err = xfrm_lookup(&dst, &fl, NULL, 0);
++ if (err) {
++ dst_release(dst);
++ return;
++ }
++
++ rt = (struct rt6_info *) dst;
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ ND_PRINTK1("ndisc_send_redirect: not a neighbour\n");
+- dst_release(&rt->u.dst);
++ dst_release(dst);
+ return;
+ }
+- if (!xrlim_allow(&rt->u.dst, 1*HZ)) {
+- dst_release(&rt->u.dst);
++ if (!xrlim_allow(dst, 1*HZ)) {
++ dst_release(dst);
+ return;
+ }
+- dst_release(&rt->u.dst);
+
+ if (dev->addr_len) {
+ if (neigh->nud_state&NUD_VALID) {
+@@ -1227,6 +1330,7 @@
+ We will make it later, when will be sure,
+ that it is alive.
+ */
++ dst_release(dst);
+ return;
+ }
+ }
+@@ -1236,11 +1340,6 @@
+ rd_len &= ~0x7;
+ len += rd_len;
+
+- if (ipv6_get_lladdr(dev, &saddr_buf)) {
+- ND_PRINTK1("redirect: no link_local addr for dev\n");
+- return;
+- }
+-
+ buff = sock_alloc_send_skb(sk, MAX_HEADER + len + dev->hard_header_len + 15,
+ 1, &err);
+ if (buff == NULL) {
+@@ -1250,15 +1349,11 @@
+
+ hlen = 0;
+
+- if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) {
+- kfree_skb(buff);
+- return;
+- }
+-
++ skb_reserve(buff, (dev->hard_header_len + 15) & ~15);
+ ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr,
+ IPPROTO_ICMPV6, len);
+
+- icmph = (struct icmp6hdr *) skb_put(buff, len);
++ buff->h.raw = (unsigned char*) icmph = (struct icmp6hdr *) skb_put(buff, len);
+
+ memset(icmph, 0, sizeof(struct icmp6hdr));
+ icmph->icmp6_type = NDISC_REDIRECT;
+@@ -1296,7 +1391,8 @@
+ len, IPPROTO_ICMPV6,
+ csum_partial((u8 *) icmph, len, 0));
+
+- dev_queue_xmit(buff);
++ buff->dst = dst;
++ dst_output(buff);
+
+ ICMP6_INC_STATS(Icmp6OutRedirects);
+ ICMP6_INC_STATS(Icmp6OutMsgs);
+@@ -1416,6 +1512,9 @@
+
+ void ndisc_cleanup(void)
+ {
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_unregister(&nd_tbl.parms);
++#endif
+ neigh_table_clear(&nd_tbl);
+ sock_release(ndisc_socket);
+ ndisc_socket = NULL; /* For safety. */
+diff -Nru a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
+--- a/net/ipv6/netfilter/ip6t_LOG.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/netfilter/ip6t_LOG.c 2005-02-13 21:25:09 +11:00
+@@ -25,16 +25,6 @@
+ #define DEBUGP(format, args...)
+ #endif
+
+-#define NIP6(addr) \
+- ntohs((addr).s6_addr16[0]), \
+- ntohs((addr).s6_addr16[1]), \
+- ntohs((addr).s6_addr16[2]), \
+- ntohs((addr).s6_addr16[3]), \
+- ntohs((addr).s6_addr16[4]), \
+- ntohs((addr).s6_addr16[5]), \
+- ntohs((addr).s6_addr16[6]), \
+- ntohs((addr).s6_addr16[7])
+-
+ /* FIXME evil kludge */
+
+ struct ahhdr {
+diff -Nru a/net/ipv6/netfilter/ip6t_multiport.c b/net/ipv6/netfilter/ip6t_multiport.c
+--- a/net/ipv6/netfilter/ip6t_multiport.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv6/netfilter/ip6t_multiport.c 2005-02-13 21:25:10 +11:00
+@@ -5,6 +5,7 @@
+ #include <linux/udp.h>
+ #include <linux/skbuff.h>
+ #include <linux/in.h>
++#include <linux/socket.h>
+
+ #include <linux/netfilter_ipv6/ip6t_multiport.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+diff -Nru a/net/ipv6/protocol.c b/net/ipv6/protocol.c
+--- a/net/ipv6/protocol.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/protocol.c 2005-02-13 21:25:09 +11:00
+@@ -42,77 +42,42 @@
+
+ struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
+
+-void inet6_add_protocol(struct inet6_protocol *prot)
++int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol)
+ {
+- unsigned char hash;
+- struct inet6_protocol *p2;
++ int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+
+- hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+- prot->next = inet6_protos[hash];
+- inet6_protos[hash] = prot;
+- prot->copy = 0;
+-
+- /*
+- * Set the copy bit if we need to.
+- */
+-
+- p2 = (struct inet6_protocol *) prot->next;
+- while(p2 != NULL) {
+- if (p2->protocol == prot->protocol) {
+- prot->copy = 1;
+- break;
+- }
+- p2 = (struct inet6_protocol *) p2->next;
++
++ if (inet6_protos[hash]) {
++ ret = -1;
++ } else {
++ inet6_protos[hash] = prot;
++ ret = 0;
+ }
++
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
++
++ return ret;
+ }
+
+ /*
+ * Remove a protocol from the hash tables.
+ */
+
+-int inet6_del_protocol(struct inet6_protocol *prot)
++int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol)
+ {
+- struct inet6_protocol *p;
+- struct inet6_protocol *lp = NULL;
+- unsigned char hash;
++ int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+
+- hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+- if (prot == inet6_protos[hash]) {
+- inet6_protos[hash] = (struct inet6_protocol *) inet6_protos[hash]->next;
+- br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return(0);
+- }
+-
+- p = (struct inet6_protocol *) inet6_protos[hash];
+
+- if (p != NULL && p->protocol == prot->protocol)
+- lp = p;
+-
+- while(p != NULL) {
+- /*
+- * We have to worry if the protocol being deleted is
+- * the last one on the list, then we may need to reset
+- * someone's copied bit.
+- */
+- if (p->next != NULL && p->next == prot) {
+- /*
+- * if we are the last one with this protocol and
+- * there is a previous one, reset its copy bit.
+- */
+- if (prot->copy == 0 && lp != NULL)
+- lp->copy = 0;
+- p->next = prot->next;
+- br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return(0);
+- }
+- if (p->next != NULL && p->next->protocol == prot->protocol)
+- lp = p->next;
+-
+- p = (struct inet6_protocol *) p->next;
++ if (inet6_protos[hash] != prot) {
++ ret = -1;
++ } else {
++ inet6_protos[hash] = NULL;
++ ret = 0;
+ }
++
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+- return(-1);
++
++ return ret;
+ }
+diff -Nru a/net/ipv6/raw.c b/net/ipv6/raw.c
+--- a/net/ipv6/raw.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/raw.c 2005-02-13 21:25:09 +11:00
+@@ -12,6 +12,7 @@
+ * Fixes:
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * YOSHIFUJI,H. at USAGI : raw checksum (RFC2292(bis) compliance)
++ * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+@@ -29,6 +30,8 @@
+ #include <linux/netdevice.h>
+ #include <linux/if_arp.h>
+ #include <linux/icmpv6.h>
++#include <linux/netfilter.h>
++#include <linux/netfilter_ipv6.h>
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+
+@@ -45,6 +48,7 @@
+ #include <net/inet_common.h>
+
+ #include <net/rawv6.h>
++#include <net/xfrm.h>
+
+ struct sock *raw_v6_htable[RAWV6_HTABLE_SIZE];
+ rwlock_t raw_v6_lock = RW_LOCK_UNLOCKED;
+@@ -133,12 +137,14 @@
+ * demultiplex raw sockets.
+ * (should consider queueing the skb in the sock receive_queue
+ * without calling rawv6.c)
++ *
++ * Caller owns SKB so we must make clones.
+ */
+-struct sock * ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
++void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+ {
+ struct in6_addr *saddr;
+ struct in6_addr *daddr;
+- struct sock *sk, *sk2;
++ struct sock *sk;
+ __u8 hash;
+
+ saddr = &skb->nh.ipv6h->saddr;
+@@ -159,30 +165,18 @@
+
+ sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr);
+
+- if (sk) {
+- sk2 = sk;
+-
+- while ((sk2 = __raw_v6_lookup(sk2->next, nexthdr, daddr, saddr))) {
+- struct sk_buff *buff;
+-
+- if (nexthdr == IPPROTO_ICMPV6 &&
+- icmpv6_filter(sk2, skb))
+- continue;
+-
+- buff = skb_clone(skb, GFP_ATOMIC);
+- if (buff)
+- rawv6_rcv(sk2, buff);
++ while (sk) {
++ if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
++ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
++
++ /* Not releasing hash table! */
++ if (clone)
++ rawv6_rcv(sk, clone);
+ }
++ sk = __raw_v6_lookup(sk->next, nexthdr, daddr, saddr);
+ }
+-
+- if (sk && nexthdr == IPPROTO_ICMPV6 && icmpv6_filter(sk, skb))
+- sk = NULL;
+-
+ out:
+- if (sk)
+- sock_hold(sk);
+ read_unlock(&raw_v6_lock);
+- return sk;
+ }
+
+ /* This cleans up af_inet6 a bit. -DaveM */
+@@ -311,6 +305,11 @@
+ */
+ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
+ {
++ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return NET_RX_DROP;
++ }
++
+ if (!sk->tp_pinfo.tp_raw.checksum)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+@@ -439,86 +438,160 @@
+ goto out_free;
+ }
+
+-/*
+- * Sending...
+- */
++static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct raw6_opt *opt, int len)
++{
++ struct sk_buff *skb;
++ int err = 0;
++ u16 *csum;
+
+-struct rawv6_fakehdr {
+- struct iovec *iov;
+- struct sock *sk;
+- __u32 len;
+- __u32 cksum;
+- __u32 proto;
+- struct in6_addr *daddr;
+-};
++ if ((skb = skb_peek(&sk->write_queue)) == NULL)
++ goto out;
+
+-static int rawv6_getfrag(const void *data, struct in6_addr *saddr,
+- char *buff, unsigned int offset, unsigned int len)
+-{
+- struct iovec *iov = (struct iovec *) data;
++ if (opt->offset + 1 < len)
++ csum = (u16 *)(skb->h.raw + opt->offset);
++ else {
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (skb_queue_len(&sk->write_queue) == 1) {
++ /*
++ * Only one fragment on the socket.
++ */
++ /* should be check HW csum miyazawa */
++ *csum = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ len, fl->proto, skb->csum);
++ } else {
++ u32 tmp_csum = 0;
++
++ skb_queue_walk(&sk->write_queue, skb) {
++ tmp_csum = csum_add(tmp_csum, skb->csum);
++ }
+
+- return memcpy_fromiovecend(buff, iov, offset, len);
++ tmp_csum = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ len, fl->proto, tmp_csum);
++ *csum = tmp_csum;
++ }
++ if (*csum == 0)
++ *csum = -1;
++ ip6_push_pending_frames(sk);
++out:
++ return err;
+ }
+
+-static int rawv6_frag_cksum(const void *data, struct in6_addr *addr,
+- char *buff, unsigned int offset,
+- unsigned int len)
+-{
+- struct rawv6_fakehdr *hdr = (struct rawv6_fakehdr *) data;
+-
+- if (csum_partial_copy_fromiovecend(buff, hdr->iov, offset,
+- len, &hdr->cksum))
+- return -EFAULT;
+-
+- if (offset == 0) {
+- struct sock *sk;
+- struct raw6_opt *opt;
+- struct in6_addr *daddr;
+-
+- sk = hdr->sk;
+- opt = &sk->tp_pinfo.tp_raw;
++static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
++ struct flowi *fl, struct rt6_info *rt,
++ unsigned int flags)
++{
++ struct inet_opt *inet = inet_sk(sk);
++ struct ipv6hdr *iph;
++ struct sk_buff *skb;
++ unsigned int hh_len;
++ int err;
+
+- if (hdr->daddr)
+- daddr = hdr->daddr;
+- else
+- daddr = addr + 1;
+-
+- hdr->cksum = csum_ipv6_magic(addr, daddr, hdr->len,
+- hdr->proto, hdr->cksum);
+-
+- if (opt->offset + 1 < len) {
+- __u16 *csum;
++ if (length > rt->u.dst.dev->mtu) {
++ ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
++ return -EMSGSIZE;
++ }
++ if (flags&MSG_PROBE)
++ goto out;
+
+- csum = (__u16 *) (buff + opt->offset);
+- if (*csum) {
+- /* in case cksum was not initialized */
+- __u32 sum = hdr->cksum;
+- sum += *csum;
+- *csum = hdr->cksum = (sum + (sum>>16));
+- } else {
+- *csum = hdr->cksum;
++ hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
++
++ skb = sock_alloc_send_skb(sk, length+hh_len+15,
++ flags&MSG_DONTWAIT, &err);
++ if (skb == NULL)
++ goto error;
++ skb_reserve(skb, hh_len);
++
++ skb->priority = sk->priority;
++ skb->dst = dst_clone(&rt->u.dst);
++
++ skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length);
++
++ skb->ip_summed = CHECKSUM_NONE;
++
++ skb->h.raw = skb->nh.raw;
++ err = memcpy_fromiovecend((void *)iph, from, 0, length);
++ if (err)
++ goto error_fault;
++
++ err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
++ dst_output);
++ if (err > 0)
++ err = inet->recverr ? net_xmit_errno(err) : 0;
++ if (err)
++ goto error;
++out:
++ return 0;
++
++error_fault:
++ err = -EFAULT;
++ kfree_skb(skb);
++error:
++ IP6_INC_STATS(Ip6OutDiscards);
++ return err;
++}
++
++static void rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
++{
++ struct iovec *iov;
++ u8 __user *type = NULL;
++ u8 __user *code = NULL;
++ int probed = 0;
++ int i;
++
++ if (!msg->msg_iov)
++ return;
++
++ for (i = 0; i < msg->msg_iovlen; i++) {
++ iov = &msg->msg_iov[i];
++ if (!iov)
++ continue;
++
++ switch (fl->proto) {
++ case IPPROTO_ICMPV6:
++ /* check if one-byte field is readable or not. */
++ if (iov->iov_base && iov->iov_len < 1)
++ break;
++
++ if (!type) {
++ type = iov->iov_base;
++ /* check if code field is readable or not. */
++ if (iov->iov_len > 1)
++ code = type + 1;
++ } else if (!code)
++ code = iov->iov_base;
++
++ if (type && code) {
++ get_user(fl->fl_icmp_type, type);
++ __get_user(fl->fl_icmp_code, code);
++ probed = 1;
+ }
+- } else {
+- if (net_ratelimit())
+- printk(KERN_DEBUG "icmp: cksum offset too big\n");
+- return -EINVAL;
++ break;
++ default:
++ probed = 1;
++ break;
+ }
+- }
+- return 0;
++ if (probed)
++ break;
++ }
+ }
+
+-
+ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+ {
+ struct ipv6_txoptions opt_space;
+ struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
++ struct in6_addr *daddr;
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
++ struct raw6_opt *raw_opt = raw6_sk(sk);
+ struct ipv6_txoptions *opt = NULL;
+ struct ip6_flowlabel *flowlabel = NULL;
++ struct dst_entry *dst = NULL;
+ struct flowi fl;
+ int addr_len = msg->msg_namelen;
+- struct in6_addr *daddr;
+- struct raw6_opt *raw_opt;
+ int hlimit = -1;
+ u16 proto;
+ int err;
+@@ -536,9 +609,7 @@
+ /*
+ * Get and verify the address.
+ */
+-
+- fl.fl6_flowlabel = 0;
+- fl.oif = 0;
++ memset(&fl, 0, sizeof(fl));
+
+ if (sin6) {
+ if (addr_len < SIN6_LEN_RFC2133)
+@@ -552,6 +623,8 @@
+
+ if (!proto)
+ proto = sk->num;
++ else if (proto != sk->num)
++ return(-EINVAL);
+
+ if (proto > 255)
+ return(-EINVAL);
+@@ -590,16 +663,17 @@
+ * unspecfied destination address
+ * treated as error... is this correct ?
+ */
++ fl6_sock_release(flowlabel);
+ return(-EINVAL);
+ }
+
+ if (fl.oif == 0)
+ fl.oif = sk->bound_dev_if;
+- fl.fl6_src = NULL;
+
+ if (msg->msg_controllen) {
+ opt = &opt_space;
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
++ opt->tot_len = sizeof(struct ipv6_txoptions);
+
+ err = datagram_send_ctl(msg, &fl, opt, &hlimit);
+ if (err < 0) {
+@@ -619,39 +693,73 @@
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+
+- raw_opt = &sk->tp_pinfo.tp_raw;
+-
+ fl.proto = proto;
+- fl.fl6_dst = daddr;
+- if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr))
+- fl.fl6_src = &np->saddr;
+- fl.uli_u.icmpt.type = 0;
+- fl.uli_u.icmpt.code = 0;
+-
+- if (raw_opt->checksum) {
+- struct rawv6_fakehdr hdr;
+-
+- hdr.iov = msg->msg_iov;
+- hdr.sk = sk;
+- hdr.len = len;
+- hdr.cksum = 0;
+- hdr.proto = proto;
++ rawv6_probe_proto_opt(&fl, msg);
++
++ ipv6_addr_copy(&fl.fl6_dst, daddr);
++ if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
++ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
++
++ /* merge ip6_build_xmit from ip6_output */
++ if (opt && opt->srcrt) {
++ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
++ }
++
++ if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
++ fl.oif = np->mcast_oif;
++
++ err = ip6_dst_lookup(sk, &dst, &fl);
++ if (err)
++ goto out;
+
+- if (opt && opt->srcrt)
+- hdr.daddr = daddr;
++ if (hlimit < 0) {
++ if (ipv6_addr_is_multicast(&fl.fl6_dst))
++ hlimit = np->mcast_hops;
+ else
+- hdr.daddr = NULL;
++ hlimit = np->hop_limit;
++ if (hlimit < 0)
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
++ }
++
++ if (msg->msg_flags&MSG_CONFIRM)
++ goto do_confirm;
+
+- err = ip6_build_xmit(sk, rawv6_frag_cksum, &hdr, &fl, len,
+- opt, hlimit, msg->msg_flags);
++back_from_confirm:
++ if (sk->protinfo.af_inet.hdrincl) {
++ err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags);
+ } else {
+- err = ip6_build_xmit(sk, rawv6_getfrag, msg->msg_iov, &fl, len,
+- opt, hlimit, msg->msg_flags);
++ lock_sock(sk);
++ err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
++ hlimit, opt, &fl, (struct rt6_info*)dst, msg->msg_flags);
++
++ if (err)
++ ip6_flush_pending_frames(sk);
++ else if (!(msg->msg_flags & MSG_MORE)) {
++ if (raw_opt->checksum) {
++ err = rawv6_push_pending_frames(sk, &fl, raw_opt, len);
++ } else {
++ err = ip6_push_pending_frames(sk);
++ }
++ }
+ }
++done:
++ ip6_dst_store(sk, dst,
++ !ipv6_addr_cmp(&fl.fl6_dst, &np->daddr) ?
++ &np->daddr : NULL);
++ if (err > 0)
++ err = np->recverr ? net_xmit_errno(err) : 0;
+
++ release_sock(sk);
++out:
+ fl6_sock_release(flowlabel);
+-
+ return err<0?err:len;
++do_confirm:
++ dst_confirm(dst);
++ if (!(msg->msg_flags & MSG_PROBE) || len)
++ goto back_from_confirm;
++ err = 0;
++ goto done;
+ }
+
+ static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
+diff -Nru a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
+--- a/net/ipv6/reassembly.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/reassembly.c 2005-02-13 21:25:09 +11:00
+@@ -23,6 +23,10 @@
+ * Horst von Brand Add missing #include <linux/string.h>
+ * Alexey Kuznetsov SMP races, threading, cleanup.
+ * Patrick McHardy LRU queue of frag heads for evictor.
++ * Mitsuru KANDA @USAGI Register inet6_protocol{}.
++ * David Stevens and
++ * YOSHIFUJI,H. @USAGI Always remove fragment header to
++ * calculate ICV correctly.
+ */
+ #include <linux/config.h>
+ #include <linux/errno.h>
+@@ -428,7 +432,7 @@
+ end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
+ ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
+
+- if ((unsigned int)end >= 65536) {
++ if ((unsigned int)end > IPV6_MAXPLEN) {
+ icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw);
+ return;
+ }
+@@ -438,7 +442,7 @@
+ csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0));
+
+ /* Is this the final fragment? */
+- if (!(fhdr->frag_off & htons(0x0001))) {
++ if (!(fhdr->frag_off & htons(IP6_MF))) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+@@ -586,12 +590,12 @@
+ * the last and the first frames arrived and all the bits are here.
+ */
+ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
++ unsigned int *nhoffp,
+ struct net_device *dev)
+ {
+ struct sk_buff *fp, *head = fq->fragments;
+- int remove_fraghdr = 0;
+ int payload_len;
+- int nhoff;
++ unsigned int nhoff;
+
+ fq_kill(fq);
+
+@@ -599,15 +603,9 @@
+ BUG_TRAP(FRAG6_CB(head)->offset == 0);
+
+ /* Unfragmented part is taken from the first segment. */
+- payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len;
+- nhoff = head->h.raw - head->nh.raw;
+-
+- if (payload_len > 65535) {
+- payload_len -= 8;
+- if (payload_len > 65535)
+- goto out_oversize;
+- remove_fraghdr = 1;
+- }
++ payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr);
++ if (payload_len > IPV6_MAXPLEN)
++ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+@@ -636,18 +634,14 @@
+ atomic_add(clone->truesize, &ip6_frag_mem);
+ }
+
+- /* Normally we do not remove frag header from datagram, but
+- * we have to do this and to relocate header, when payload
+- * is > 65535-8. */
+- if (remove_fraghdr) {
+- nhoff = fq->nhoffset;
+- head->nh.raw[nhoff] = head->h.raw[0];
+- memmove(head->head+8, head->head, (head->data-head->head)-8);
+- head->mac.raw += 8;
+- head->nh.raw += 8;
+- } else {
+- ((struct frag_hdr*)head->h.raw)->frag_off = 0;
+- }
++ /* We have to remove fragment header from datagram and to relocate
++ * header in order to calculate ICV correctly. */
++ nhoff = fq->nhoffset;
++ head->nh.raw[nhoff] = head->h.raw[0];
++ memmove(head->head + sizeof(struct frag_hdr), head->head,
++ (head->data - head->head) - sizeof(struct frag_hdr));
++ head->mac.raw += sizeof(struct frag_hdr);
++ head->nh.raw += sizeof(struct frag_hdr);
+
+ skb_shinfo(head)->frag_list = head->next;
+ head->h.raw = head->data;
+@@ -678,7 +672,8 @@
+
+ IP6_INC_STATS_BH(Ip6ReasmOKs);
+ fq->fragments = NULL;
+- return nhoff;
++ *nhoffp = nhoff;
++ return 1;
+
+ out_oversize:
+ if (net_ratelimit())
+@@ -692,7 +687,7 @@
+ return -1;
+ }
+
+-int ipv6_reassembly(struct sk_buff **skbp, int nhoff)
++static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+ {
+ struct sk_buff *skb = *skbp;
+ struct net_device *dev = skb->dev;
+@@ -722,7 +717,8 @@
+ skb->h.raw += sizeof(struct frag_hdr);
+ IP6_INC_STATS_BH(Ip6ReasmOKs);
+
+- return (u8*)fhdr - skb->nh.raw;
++ *nhoffp = (u8*)fhdr - skb->nh.raw;
++ return 1;
+ }
+
+ if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
+@@ -733,11 +729,11 @@
+
+ spin_lock(&fq->lock);
+
+- ip6_frag_queue(fq, skb, fhdr, nhoff);
++ ip6_frag_queue(fq, skb, fhdr, *nhoffp);
+
+ if (fq->last_in == (FIRST_IN|LAST_IN) &&
+ fq->meat == fq->len)
+- ret = ip6_frag_reasm(fq, skbp, dev);
++ ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
+
+ spin_unlock(&fq->lock);
+ fq_put(fq, NULL);
+@@ -749,8 +745,17 @@
+ return -1;
+ }
+
++static struct inet6_protocol frag_protocol =
++{
++ .handler = ipv6_frag_rcv,
++ .flags = INET6_PROTO_NOPOLICY,
++};
++
+ void __init ipv6_frag_init(void)
+ {
++ if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0)
++ printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n");
++
+ ip6_frag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+ (jiffies ^ (jiffies >> 6)));
+
+diff -Nru a/net/ipv6/route.c b/net/ipv6/route.c
+--- a/net/ipv6/route.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/route.c 2005-02-13 21:25:09 +11:00
+@@ -49,6 +49,8 @@
+ #include <net/addrconf.h>
+ #include <net/tcp.h>
+ #include <linux/rtnetlink.h>
++#include <net/dst.h>
++#include <net/xfrm.h>
+
+ #include <asm/uaccess.h>
+
+@@ -56,8 +58,6 @@
+ #include <linux/sysctl.h>
+ #endif
+
+-#undef CONFIG_RT6_POLICY
+-
+ /* Set to 3 to get tracing. */
+ #define RT6_DEBUG 2
+
+@@ -80,39 +80,43 @@
+
+ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
+ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
+-static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst,
+- struct sk_buff *skb);
+ static struct dst_entry *ip6_negative_advice(struct dst_entry *);
+ static int ip6_dst_gc(void);
+
+ static int ip6_pkt_discard(struct sk_buff *skb);
+ static void ip6_link_failure(struct sk_buff *skb);
++static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+
+ struct dst_ops ip6_dst_ops = {
+- AF_INET6,
+- __constant_htons(ETH_P_IPV6),
+- 1024,
+-
+- ip6_dst_gc,
+- ip6_dst_check,
+- ip6_dst_reroute,
+- NULL,
+- ip6_negative_advice,
+- ip6_link_failure,
+- sizeof(struct rt6_info),
++ .family = AF_INET6,
++ .protocol = __constant_htons(ETH_P_IPV6),
++ .gc = ip6_dst_gc,
++ .gc_thresh = 1024,
++ .check = ip6_dst_check,
++ .negative_advice = ip6_negative_advice,
++ .link_failure = ip6_link_failure,
++ .update_pmtu = ip6_rt_update_pmtu,
++ .entry_size = sizeof(struct rt6_info),
+ };
+
+ struct rt6_info ip6_null_entry = {
+- {{NULL, ATOMIC_INIT(1), 1, &loopback_dev,
+- -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+- -ENETUNREACH, NULL, NULL,
+- ip6_pkt_discard, ip6_pkt_discard,
+-#ifdef CONFIG_NET_CLS_ROUTE
+- 0,
+-#endif
+- &ip6_dst_ops}},
+- NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
+- 255, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
++ .u = {
++ .dst = {
++ .__refcnt = ATOMIC_INIT(1),
++ .__use = 1,
++ .dev = &loopback_dev,
++ .obsolete = -1,
++ .error = -ENETUNREACH,
++ .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
++ .input = ip6_pkt_discard,
++ .output = ip6_pkt_discard,
++ .ops = &ip6_dst_ops,
++ .path = (struct dst_entry*)&ip6_null_entry,
++ }
++ },
++ .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
++ .rt6i_metric = ~(u32) 0,
++ .rt6i_ref = ATOMIC_INIT(1),
+ };
+
+ struct fib6_node ip6_routing_table = {
+@@ -121,29 +125,17 @@
+ 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0
+ };
+
+-#ifdef CONFIG_RT6_POLICY
+-int ip6_rt_policy = 0;
+-
+-struct pol_chain *rt6_pol_list = NULL;
+-
+-
+-static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb);
+-static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk);
+-
+-static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
+- struct in6_addr *daddr,
+- struct in6_addr *saddr,
+- struct fl_acc_args *args);
+-
+-#else
+-#define ip6_rt_policy (0)
+-#endif
+-
+ /* Protects all the ip6 fib */
+
+ rwlock_t rt6_lock = RW_LOCK_UNLOCKED;
+
+
++/* allocate dst with ip6_dst_ops */
++static __inline__ struct rt6_info *ip6_dst_alloc(void)
++{
++ return dst_alloc(&ip6_dst_ops);
++}
++
+ /*
+ * Route lookup. Any rt6_lock is implied.
+ */
+@@ -269,9 +261,12 @@
+ }
+ }
+
+- if (match)
++ if (match) {
++ if (rt6_dflt_pointer != match)
++ RT6_TRACE("changed default router: %p->%p\n",
++ rt6_dflt_pointer, match);
+ rt6_dflt_pointer = match;
+-
++ }
+ spin_unlock(&rt6_dflt_lock);
+
+ if (!match) {
+@@ -325,12 +320,12 @@
+ be destroyed.
+ */
+
+-static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh)
++static int rt6_ins(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ int err;
+
+ write_lock_bh(&rt6_lock);
+- err = fib6_add(&ip6_routing_table, rt, nlh);
++ err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr);
+ write_unlock_bh(&rt6_lock);
+
+ return err;
+@@ -373,7 +368,7 @@
+
+ dst_hold(&rt->u.dst);
+
+- err = rt6_ins(rt, NULL);
++ err = rt6_ins(rt, NULL, NULL);
+ if (err == 0)
+ return rt;
+
+@@ -385,38 +380,6 @@
+ return &ip6_null_entry;
+ }
+
+-#ifdef CONFIG_RT6_POLICY
+-static __inline__ struct rt6_info *rt6_flow_lookup_in(struct rt6_info *rt,
+- struct sk_buff *skb)
+-{
+- struct in6_addr *daddr, *saddr;
+- struct fl_acc_args arg;
+-
+- arg.type = FL_ARG_FORWARD;
+- arg.fl_u.skb = skb;
+-
+- saddr = &skb->nh.ipv6h->saddr;
+- daddr = &skb->nh.ipv6h->daddr;
+-
+- return rt6_flow_lookup(rt, daddr, saddr, &arg);
+-}
+-
+-static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt,
+- struct sock *sk,
+- struct flowi *fl)
+-{
+- struct fl_acc_args arg;
+-
+- arg.type = FL_ARG_ORIGIN;
+- arg.fl_u.fl_o.sk = sk;
+- arg.fl_u.fl_o.flow = fl;
+-
+- return rt6_flow_lookup(rt, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr,
+- &arg);
+-}
+-
+-#endif
+-
+ #define BACKTRACK() \
+ if (rt == &ip6_null_entry && strict) { \
+ while ((fn = fn->parent) != NULL) { \
+@@ -449,53 +412,30 @@
+ rt = fn->leaf;
+
+ if ((rt->rt6i_flags & RTF_CACHE)) {
+- if (ip6_rt_policy == 0) {
+- rt = rt6_device_match(rt, skb->dev->ifindex, strict);
+- BACKTRACK();
+- dst_hold(&rt->u.dst);
+- goto out;
+- }
+-
+-#ifdef CONFIG_RT6_POLICY
+- if ((rt->rt6i_flags & RTF_FLOW)) {
+- struct rt6_info *sprt;
+-
+- for (sprt = rt; sprt; sprt = sprt->u.next) {
+- if (rt6_flow_match_in(sprt, skb)) {
+- rt = sprt;
+- dst_hold(&rt->u.dst);
+- goto out;
+- }
+- }
+- }
+-#endif
++ rt = rt6_device_match(rt, skb->dev->ifindex, strict);
++ BACKTRACK();
++ dst_hold(&rt->u.dst);
++ goto out;
+ }
+
+ rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+ BACKTRACK();
+
+- if (ip6_rt_policy == 0) {
+- if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
+- read_unlock_bh(&rt6_lock);
++ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
++ read_unlock_bh(&rt6_lock);
+
+- rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
+- &skb->nh.ipv6h->saddr);
++ rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
++ &skb->nh.ipv6h->saddr);
+
+- if (rt->u.dst.error != -EEXIST || --attempts <= 0)
+- goto out2;
+- /* Race condition! In the gap, when rt6_lock was
+- released someone could insert this route. Relookup.
+- */
+- goto relookup;
+- }
+- dst_hold(&rt->u.dst);
+- } else {
+-#ifdef CONFIG_RT6_POLICY
+- rt = rt6_flow_lookup_in(rt, skb);
+-#else
+- /* NEVER REACHED */
+-#endif
++ if (rt->u.dst.error != -EEXIST || --attempts <= 0)
++ goto out2;
++ /* Race condition! In the gap, when rt6_lock was
++ released someone could insert this route. Relookup.
++ */
++ dst_release(&rt->u.dst);
++ goto relookup;
+ }
++ dst_hold(&rt->u.dst);
+
+ out:
+ read_unlock_bh(&rt6_lock);
+@@ -512,38 +452,21 @@
+ int strict;
+ int attempts = 3;
+
+- strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
++ strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
+
+ relookup:
+ read_lock_bh(&rt6_lock);
+
+- fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr,
+- fl->nl_u.ip6_u.saddr);
++ fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
+
+ restart:
+ rt = fn->leaf;
+
+ if ((rt->rt6i_flags & RTF_CACHE)) {
+- if (ip6_rt_policy == 0) {
+- rt = rt6_device_match(rt, fl->oif, strict);
+- BACKTRACK();
+- dst_hold(&rt->u.dst);
+- goto out;
+- }
+-
+-#ifdef CONFIG_RT6_POLICY
+- if ((rt->rt6i_flags & RTF_FLOW)) {
+- struct rt6_info *sprt;
+-
+- for (sprt = rt; sprt; sprt = sprt->u.next) {
+- if (rt6_flow_match_out(sprt, sk)) {
+- rt = sprt;
+- dst_hold(&rt->u.dst);
+- goto out;
+- }
+- }
+- }
+-#endif
++ rt = rt6_device_match(rt, fl->oif, strict);
++ BACKTRACK();
++ dst_hold(&rt->u.dst);
++ goto out;
+ }
+ if (rt->rt6i_flags & RTF_DEFAULT) {
+ if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
+@@ -553,29 +476,21 @@
+ BACKTRACK();
+ }
+
+- if (ip6_rt_policy == 0) {
+- if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
+- read_unlock_bh(&rt6_lock);
++ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
++ read_unlock_bh(&rt6_lock);
+
+- rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr,
+- fl->nl_u.ip6_u.saddr);
+-
+- if (rt->u.dst.error != -EEXIST || --attempts <= 0)
+- goto out2;
++ rt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src);
+
+- /* Race condition! In the gap, when rt6_lock was
+- released someone could insert this route. Relookup.
+- */
+- goto relookup;
+- }
+- dst_hold(&rt->u.dst);
+- } else {
+-#ifdef CONFIG_RT6_POLICY
+- rt = rt6_flow_lookup_out(rt, sk, fl);
+-#else
+- /* NEVER REACHED */
+-#endif
++ if (rt->u.dst.error != -EEXIST || --attempts <= 0)
++ goto out2;
++
++ /* Race condition! In the gap, when rt6_lock was
++ released someone could insert this route. Relookup.
++ */
++ dst_release(&rt->u.dst);
++ goto relookup;
+ }
++ dst_hold(&rt->u.dst);
+
+ out:
+ read_unlock_bh(&rt6_lock);
+@@ -603,23 +518,13 @@
+ return NULL;
+ }
+
+-static struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb)
+-{
+- /*
+- * FIXME
+- */
+- RDBG(("ip6_dst_reroute(%p,%p)[%p] (AIEEE)\n", dst, skb,
+- __builtin_return_address(0)));
+- return NULL;
+-}
+-
+ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
+ {
+ struct rt6_info *rt = (struct rt6_info *) dst;
+
+ if (rt) {
+ if (rt->rt6i_flags & RTF_CACHE)
+- ip6_del_rt(rt, NULL);
++ ip6_del_rt(rt, NULL, NULL);
+ else
+ dst_release(dst);
+ }
+@@ -642,7 +547,80 @@
+ }
+ }
+
+-static int ip6_dst_gc()
++static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
++{
++ struct rt6_info *rt6 = (struct rt6_info*)dst;
++
++ if (mtu < dst_pmtu(dst) && rt6->rt6i_dst.plen == 128) {
++ rt6->rt6i_flags |= RTF_MODIFIED;
++ dst->metrics[RTAX_MTU-1] = mtu;
++ }
++}
++
++/* Protected by rt6_lock. */
++static struct dst_entry *ndisc_dst_gc_list;
++static int ipv6_get_mtu(struct net_device *dev);
++static inline unsigned int ipv6_advmss(unsigned int mtu);
++
++struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
++ struct neighbour *neigh,
++ int (*output)(struct sk_buff *))
++{
++ struct rt6_info *rt = ip6_dst_alloc();
++
++ if (unlikely(rt == NULL))
++ goto out;
++
++ if (dev)
++ dev_hold(dev);
++ if (neigh)
++ neigh_hold(neigh);
++
++ rt->rt6i_dev = dev;
++ rt->rt6i_nexthop = neigh;
++ rt->rt6i_expires = 0;
++ rt->rt6i_flags = RTF_LOCAL | RTF_NDISC;
++ rt->rt6i_metric = 0;
++ atomic_set(&rt->u.dst.__refcnt, 1);
++ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
++ rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
++ rt->u.dst.output = output;
++
++ write_lock_bh(&rt6_lock);
++ rt->u.dst.next = ndisc_dst_gc_list;
++ ndisc_dst_gc_list = &rt->u.dst;
++ write_unlock_bh(&rt6_lock);
++
++ fib6_force_start_gc();
++
++out:
++ return (struct dst_entry *)rt;
++}
++
++int ndisc_dst_gc(int *more)
++{
++ struct dst_entry *dst, *next, **pprev;
++ int freed;
++
++ next = NULL;
++ pprev = &ndisc_dst_gc_list;
++ freed = 0;
++ while ((dst = *pprev) != NULL) {
++ if (!atomic_read(&dst->__refcnt)) {
++ *pprev = dst->next;
++ dst_free(dst);
++ freed++;
++ } else {
++ pprev = &dst->next;
++ (*more)++;
++ }
++ }
++
++ return freed;
++}
++
++static int ip6_dst_gc(void)
+ {
+ static unsigned expire = 30*HZ;
+ static unsigned long last_gc;
+@@ -669,19 +647,6 @@
+ Remove it only when all the things will work!
+ */
+
+-static void ipv6_addr_prefix(struct in6_addr *pfx,
+- const struct in6_addr *addr, int plen)
+-{
+- int b = plen&0x7;
+- int o = plen>>3;
+-
+- memcpy(pfx->s6_addr, addr, o);
+- if (o < 16)
+- memset(pfx->s6_addr + o, 0, 16 - o);
+- if (b != 0)
+- pfx->s6_addr[o] = addr->s6_addr[o]&(0xff00 >> b);
+-}
+-
+ static int ipv6_get_mtu(struct net_device *dev)
+ {
+ int mtu = IPV6_MIN_MTU;
+@@ -695,6 +660,24 @@
+ return mtu;
+ }
+
++static inline unsigned int ipv6_advmss(unsigned int mtu)
++{
++ mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
++
++ if (mtu < ip6_rt_min_advmss)
++ mtu = ip6_rt_min_advmss;
++
++ /*
++ * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
++ * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
++ * IPV6_MAXPLEN is also valid and means: "any MSS,
++ * rely only on pmtu discovery"
++ */
++ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
++ mtu = IPV6_MAXPLEN;
++ return mtu;
++}
++
+ static int ipv6_get_hoplimit(struct net_device *dev)
+ {
+ int hoplimit = ipv6_devconf.hop_limit;
+@@ -712,14 +695,17 @@
+ *
+ */
+
+-int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh)
++int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ int err;
+ struct rtmsg *r;
++ struct rtattr **rta;
+ struct rt6_info *rt;
+ struct net_device *dev = NULL;
+ int addr_type;
+
++ rta = (struct rtattr **) _rtattr;
++
+ if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
+ return -EINVAL;
+ #ifndef CONFIG_IPV6_SUBTREES
+@@ -729,7 +715,7 @@
+ if (rtmsg->rtmsg_metric == 0)
+ rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
+
+- rt = dst_alloc(&ip6_dst_ops);
++ rt = ip6_dst_alloc();
+
+ if (rt == NULL)
+ return -ENOMEM;
+@@ -849,23 +835,42 @@
+ }
+ }
+
+- if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
+- rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS;
+- else
+- rt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
+- rt->rt6i_flags = rtmsg->rtmsg_flags;
++ rt->rt6i_flags = rtmsg->rtmsg_flags & ~RTF_NDISC;
+
+ install_route:
+- rt->u.dst.pmtu = ipv6_get_mtu(dev);
+- rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.pmtu - 60, ip6_rt_min_advmss);
+- /* Maximal non-jumbo IPv6 payload is 65535 and corresponding
+- MSS is 65535 - tcp_header_size. 65535 is also valid and
+- means: "any MSS, rely only on pmtu discovery"
+- */
+- if (rt->u.dst.advmss > 65535-20)
+- rt->u.dst.advmss = 65535;
++ if (rta && rta[RTA_METRICS-1]) {
++ int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
++ struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
++
++ while (RTA_OK(attr, attrlen)) {
++ unsigned flavor = attr->rta_type;
++ if (flavor) {
++ if (flavor > RTAX_MAX) {
++ err = -EINVAL;
++ goto out;
++ }
++ rt->u.dst.metrics[flavor-1] =
++ *(u32 *)RTA_DATA(attr);
++ }
++ attr = RTA_NEXT(attr, attrlen);
++ }
++ }
++
++ if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) {
++ if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
++ rt->u.dst.metrics[RTAX_HOPLIMIT-1] =
++ IPV6_DEFAULT_MCASTHOPS;
++ else
++ rt->u.dst.metrics[RTAX_HOPLIMIT-1] =
++ ipv6_get_hoplimit(dev);
++ }
++
++ if (!rt->u.dst.metrics[RTAX_MTU-1])
++ rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
++ if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
+ rt->u.dst.dev = dev;
+- return rt6_ins(rt, nlh);
++ return rt6_ins(rt, nlh, _rtattr);
+
+ out:
+ if (dev)
+@@ -874,7 +879,7 @@
+ return err;
+ }
+
+-int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh)
++int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ int err;
+
+@@ -886,13 +891,13 @@
+
+ dst_release(&rt->u.dst);
+
+- err = fib6_del(rt, nlh);
++ err = fib6_del(rt, nlh, _rtattr);
+ write_unlock_bh(&rt6_lock);
+
+ return err;
+ }
+
+-int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh)
++static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+ {
+ struct fib6_node *fn;
+ struct rt6_info *rt;
+@@ -919,7 +924,7 @@
+ dst_hold(&rt->u.dst);
+ read_unlock_bh(&rt6_lock);
+
+- return ip6_del_rt(rt, nlh);
++ return ip6_del_rt(rt, nlh, _rtattr);
+ }
+ }
+ read_unlock_bh(&rt6_lock);
+@@ -1015,17 +1020,14 @@
+ ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
+ nrt->rt6i_nexthop = neigh_clone(neigh);
+ /* Reset pmtu, it may be better */
+- nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev);
+- nrt->u.dst.advmss = max_t(unsigned int, nrt->u.dst.pmtu - 60, ip6_rt_min_advmss);
+- if (rt->u.dst.advmss > 65535-20)
+- rt->u.dst.advmss = 65535;
+- nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev);
++ nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
++ nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&nrt->u.dst));
+
+- if (rt6_ins(nrt, NULL))
++ if (rt6_ins(nrt, NULL, NULL))
+ goto out;
+
+ if (rt->rt6i_flags&RTF_CACHE) {
+- ip6_del_rt(rt, NULL);
++ ip6_del_rt(rt, NULL, NULL);
+ return;
+ }
+
+@@ -1060,7 +1062,7 @@
+ if (rt == NULL)
+ return;
+
+- if (pmtu >= rt->u.dst.pmtu)
++ if (pmtu >= dst_pmtu(&rt->u.dst))
+ goto out;
+
+ /* New mtu received -> path was valid.
+@@ -1075,7 +1077,7 @@
+ would return automatically.
+ */
+ if (rt->rt6i_flags & RTF_CACHE) {
+- rt->u.dst.pmtu = pmtu;
++ rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
+ dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
+ rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
+ goto out;
+@@ -1089,7 +1091,7 @@
+ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
+ nrt = rt6_cow(rt, daddr, saddr);
+ if (!nrt->u.dst.error) {
+- nrt->u.dst.pmtu = pmtu;
++ nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
+ /* According to RFC 1981, detecting PMTU increase shouldn't be
+ happened within 5 mins, the recommended timer is 10 mins.
+ Here this route expiration time is set to ip6_rt_mtu_expires
+@@ -1098,8 +1100,8 @@
+ */
+ dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
+ nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
+- dst_release(&nrt->u.dst);
+ }
++ dst_release(&nrt->u.dst);
+ } else {
+ nrt = ip6_rt_copy(rt);
+ if (nrt == NULL)
+@@ -1110,8 +1112,8 @@
+ nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
+ dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
+ nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES;
+- nrt->u.dst.pmtu = pmtu;
+- rt6_ins(nrt, NULL);
++ nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
++ rt6_ins(nrt, NULL, NULL);
+ }
+
+ out:
+@@ -1124,20 +1126,19 @@
+
+ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
+ {
+- struct rt6_info *rt;
++ struct rt6_info *rt = ip6_dst_alloc();
+
+- rt = dst_alloc(&ip6_dst_ops);
++ BUG_ON(ort->rt6i_flags & RTF_NDISC);
+
+ if (rt) {
+ rt->u.dst.input = ort->u.dst.input;
+ rt->u.dst.output = ort->u.dst.output;
+
+- memcpy(&rt->u.dst.mxlock, &ort->u.dst.mxlock, RTAX_MAX*sizeof(unsigned));
++ memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+ rt->u.dst.dev = ort->u.dst.dev;
+ if (rt->u.dst.dev)
+ dev_hold(rt->u.dst.dev);
+ rt->u.dst.lastuse = jiffies;
+- rt->rt6i_hoplimit = ort->rt6i_hoplimit;
+ rt->rt6i_expires = 0;
+
+ ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
+@@ -1184,7 +1185,7 @@
+
+ rtmsg.rtmsg_ifindex = dev->ifindex;
+
+- ip6_route_add(&rtmsg, NULL);
++ ip6_route_add(&rtmsg, NULL, NULL);
+ return rt6_get_dflt_router(gwaddr, dev);
+ }
+
+@@ -1210,7 +1211,7 @@
+
+ read_unlock_bh(&rt6_lock);
+
+- ip6_del_rt(rt, NULL);
++ ip6_del_rt(rt, NULL, NULL);
+
+ goto restart;
+ }
+@@ -1236,10 +1237,10 @@
+ rtnl_lock();
+ switch (cmd) {
+ case SIOCADDRT:
+- err = ip6_route_add(&rtmsg, NULL);
++ err = ip6_route_add(&rtmsg, NULL, NULL);
+ break;
+ case SIOCDELRT:
+- err = ip6_route_del(&rtmsg, NULL);
++ err = ip6_route_del(&rtmsg, NULL, NULL);
+ break;
+ default:
+ err = -EINVAL;
+@@ -1268,11 +1269,10 @@
+ * Add address
+ */
+
+-int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev)
++int ip6_rt_addr_add(struct in6_addr *addr, struct net_device *dev, int anycast)
+ {
+- struct rt6_info *rt;
++ struct rt6_info *rt = ip6_dst_alloc();
+
+- rt = dst_alloc(&ip6_dst_ops);
+ if (rt == NULL)
+ return -ENOMEM;
+
+@@ -1280,14 +1280,14 @@
+ rt->u.dst.input = ip6_input;
+ rt->u.dst.output = ip6_output;
+ rt->rt6i_dev = dev_get_by_name("lo");
+- rt->u.dst.pmtu = ipv6_get_mtu(rt->rt6i_dev);
+- rt->u.dst.advmss = max_t(unsigned int, rt->u.dst.pmtu - 60, ip6_rt_min_advmss);
+- if (rt->u.dst.advmss > 65535-20)
+- rt->u.dst.advmss = 65535;
+- rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev);
++ rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_pmtu(&rt->u.dst));
++ rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ipv6_get_hoplimit(rt->rt6i_dev);
+ rt->u.dst.obsolete = -1;
+
+ rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
++ if (!anycast)
++ rt->rt6i_flags |= RTF_LOCAL;
+ rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+ if (rt->rt6i_nexthop == NULL) {
+ dst_free((struct dst_entry *) rt);
+@@ -1296,7 +1296,7 @@
+
+ ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
+ rt->rt6i_dst.plen = 128;
+- rt6_ins(rt, NULL);
++ rt6_ins(rt, NULL, NULL);
+
+ return 0;
+ }
+@@ -1313,129 +1313,13 @@
+ rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
+ if (rt) {
+ if (rt->rt6i_dst.plen == 128)
+- err = ip6_del_rt(rt, NULL);
++ err = ip6_del_rt(rt, NULL, NULL);
+ else
+ dst_release(&rt->u.dst);
+ }
+
+ return err;
+ }
+-
+-#ifdef CONFIG_RT6_POLICY
+-
+-static int rt6_flow_match_in(struct rt6_info *rt, struct sk_buff *skb)
+-{
+- struct flow_filter *frule;
+- struct pkt_filter *filter;
+- int res = 1;
+-
+- if ((frule = rt->rt6i_filter) == NULL)
+- goto out;
+-
+- if (frule->type != FLR_INPUT) {
+- res = 0;
+- goto out;
+- }
+-
+- for (filter = frule->u.filter; filter; filter = filter->next) {
+- __u32 *word;
+-
+- word = (__u32 *) skb->h.raw;
+- word += filter->offset;
+-
+- if ((*word ^ filter->value) & filter->mask) {
+- res = 0;
+- break;
+- }
+- }
+-
+-out:
+- return res;
+-}
+-
+-static int rt6_flow_match_out(struct rt6_info *rt, struct sock *sk)
+-{
+- struct flow_filter *frule;
+- int res = 1;
+-
+- if ((frule = rt->rt6i_filter) == NULL)
+- goto out;
+-
+- if (frule->type != FLR_INPUT) {
+- res = 0;
+- goto out;
+- }
+-
+- if (frule->u.sk != sk)
+- res = 0;
+-out:
+- return res;
+-}
+-
+-static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
+- struct in6_addr *daddr,
+- struct in6_addr *saddr,
+- struct fl_acc_args *args)
+-{
+- struct flow_rule *frule;
+- struct rt6_info *nrt = NULL;
+- struct pol_chain *pol;
+-
+- for (pol = rt6_pol_list; pol; pol = pol->next) {
+- struct fib6_node *fn;
+- struct rt6_info *sprt;
+-
+- fn = fib6_lookup(pol->rules, daddr, saddr);
+-
+- do {
+- for (sprt = fn->leaf; sprt; sprt=sprt->u.next) {
+- int res;
+-
+- frule = sprt->rt6i_flowr;
+-#if RT6_DEBUG >= 2
+- if (frule == NULL) {
+- printk(KERN_DEBUG "NULL flowr\n");
+- goto error;
+- }
+-#endif
+- res = frule->ops->accept(rt, sprt, args, &nrt);
+-
+- switch (res) {
+- case FLOWR_SELECT:
+- goto found;
+- case FLOWR_CLEAR:
+- goto next_policy;
+- case FLOWR_NODECISION:
+- break;
+- default:
+- goto error;
+- };
+- }
+-
+- fn = fn->parent;
+-
+- } while ((fn->fn_flags & RTN_TL_ROOT) == 0);
+-
+- next_policy:
+- }
+-
+-error:
+- dst_hold(&ip6_null_entry.u.dst);
+- return &ip6_null_entry;
+-
+-found:
+- if (nrt == NULL)
+- goto error;
+-
+- nrt->rt6i_flags |= RTF_CACHE;
+- dst_hold(&nrt->u.dst);
+- err = rt6_ins(nrt, NULL);
+- if (err)
+- nrt->u.dst.error = err;
+- return nrt;
+-}
+-#endif
+-
+ static int fib6_ifdown(struct rt6_info *rt, void *arg)
+ {
+ if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
+@@ -1487,14 +1371,12 @@
+ PMTU discouvery.
+ */
+ if (rt->rt6i_dev == arg->dev &&
+- !(rt->u.dst.mxlock&(1<<RTAX_MTU)) &&
+- (rt->u.dst.pmtu > arg->mtu ||
+- (rt->u.dst.pmtu < arg->mtu &&
+- rt->u.dst.pmtu == idev->cnf.mtu6)))
+- rt->u.dst.pmtu = arg->mtu;
+- rt->u.dst.advmss = max_t(unsigned int, arg->mtu - 60, ip6_rt_min_advmss);
+- if (rt->u.dst.advmss > 65535-20)
+- rt->u.dst.advmss = 65535;
++ !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
++ (dst_pmtu(&rt->u.dst) > arg->mtu ||
++ (dst_pmtu(&rt->u.dst) < arg->mtu &&
++ dst_pmtu(&rt->u.dst) == idev->cnf.mtu6)))
++ rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
++ rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
+ return 0;
+ }
+
+@@ -1556,7 +1438,7 @@
+
+ if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
+ return -EINVAL;
+- return ip6_route_del(&rtmsg, nlh);
++ return ip6_route_del(&rtmsg, nlh, arg);
+ }
+
+ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+@@ -1566,7 +1448,7 @@
+
+ if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
+ return -EINVAL;
+- return ip6_route_add(&rtmsg, nlh);
++ return ip6_route_add(&rtmsg, nlh, arg);
+ }
+
+ struct rt6_rtnl_dump_arg
+@@ -1642,7 +1524,7 @@
+ if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
+ RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
+ }
+- if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
++ if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ goto rtattr_failure;
+ if (rt->u.dst.neighbour)
+ RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
+@@ -1798,15 +1680,13 @@
+ skb->mac.raw = skb->data;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
+
+- fl.proto = 0;
+- fl.nl_u.ip6_u.daddr = NULL;
+- fl.nl_u.ip6_u.saddr = NULL;
+- fl.uli_u.icmpt.type = 0;
+- fl.uli_u.icmpt.code = 0;
++ memset(&fl, 0, sizeof(fl));
+ if (rta[RTA_SRC-1])
+- fl.nl_u.ip6_u.saddr = (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]);
++ ipv6_addr_copy(&fl.fl6_src,
++ (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
+ if (rta[RTA_DST-1])
+- fl.nl_u.ip6_u.daddr = (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]);
++ ipv6_addr_copy(&fl.fl6_dst,
++ (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
+
+ if (rta[RTA_IIF-1])
+ memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
+@@ -1830,8 +1710,7 @@
+
+ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+ err = rt6_fill_node(skb, rt,
+- fl.nl_u.ip6_u.daddr,
+- fl.nl_u.ip6_u.saddr,
++ &fl.fl6_dst, &fl.fl6_src,
+ iif,
+ RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq, nlh, 0);
+@@ -2043,7 +1922,6 @@
+
+ #endif
+
+-
+ void __init ip6_route_init(void)
+ {
+ ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
+@@ -2055,6 +1933,9 @@
+ proc_net_create("ipv6_route", 0, rt6_proc_info);
+ proc_net_create("rt6_stats", 0, rt6_proc_stats);
+ #endif
++#ifdef CONFIG_XFRM
++ xfrm6_init();
++#endif
+ }
+
+ #ifdef MODULE
+@@ -2064,8 +1945,11 @@
+ proc_net_remove("ipv6_route");
+ proc_net_remove("rt6_stats");
+ #endif
+-
++#ifdef CONFIG_XFRM
++ xfrm6_fini();
++#endif
+ rt6_ifdown(NULL);
+ fib6_gc_cleanup();
++ kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+ }
+ #endif /* MODULE */
+diff -Nru a/net/ipv6/sit.c b/net/ipv6/sit.c
+--- a/net/ipv6/sit.c 2005-02-13 21:25:10 +11:00
++++ b/net/ipv6/sit.c 2005-02-13 21:25:10 +11:00
+@@ -49,6 +49,7 @@
+ #include <net/icmp.h>
+ #include <net/ipip.h>
+ #include <net/inet_ecn.h>
++#include <net/xfrm.h>
+
+ /*
+ This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
+@@ -392,6 +393,7 @@
+
+ read_lock(&ipip6_lock);
+ if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
++ secpath_reset(skb);
+ skb->mac.raw = skb->nh.raw;
+ skb->nh.raw = skb->data;
+ memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+@@ -416,13 +418,6 @@
+ return 0;
+ }
+
+-/* Need this wrapper because NF_HOOK takes the function address */
+-static inline int do_ip_send(struct sk_buff *skb)
+-{
+- return ip_send(skb);
+-}
+-
+-
+ /* Returns the embedded IPv4 address if the IPv6 address
+ comes from 6to4 (draft-ietf-ngtrans-6to4-04) addr space */
+
+@@ -495,9 +490,17 @@
+ dst = addr6->s6_addr32[3];
+ }
+
+- if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) {
+- tunnel->stat.tx_carrier_errors++;
+- goto tx_error_icmp;
++ {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = dst,
++ .saddr = tiph->saddr,
++ .tos = RT_TOS(tos) } },
++ .oif = tunnel->parms.link,
++ .proto = IPPROTO_IPV6 };
++ if (ip_route_output_key(&rt, &fl)) {
++ tunnel->stat.tx_carrier_errors++;
++ goto tx_error_icmp;
++ }
+ }
+ if (rt->rt_type != RTN_UNICAST) {
+ ip_rt_put(rt);
+@@ -513,9 +516,9 @@
+ }
+
+ if (tiph->frag_off)
+- mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
++ mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+ else
+- mtu = skb->dst ? skb->dst->pmtu : dev->mtu;
++ mtu = skb->dst ? dst_pmtu(skb->dst) : dev->mtu;
+
+ if (mtu < 68) {
+ tunnel->stat.collisions++;
+@@ -524,15 +527,9 @@
+ }
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+- if (skb->dst && mtu < skb->dst->pmtu) {
+- struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+- if (mtu < rt6->u.dst.pmtu) {
+- if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) {
+- rt6->rt6i_flags |= RTF_MODIFIED;
+- rt6->u.dst.pmtu = mtu;
+- }
+- }
+- }
++ if (tunnel->parms.iph.daddr && skb->dst)
++ skb->dst->ops->update_pmtu(skb->dst, mtu);
++
+ if (skb->len > mtu) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ ip_rt_put(rt);
+@@ -550,7 +547,7 @@
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+- max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
++ max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr);
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+@@ -765,8 +762,14 @@
+ ipip6_tunnel_init_gen(dev);
+
+ if (iph->daddr) {
++ struct flowi fl = { .nl_u = { .ip4_u =
++ { .daddr = iph->daddr,
++ .saddr = iph->saddr,
++ .tos = RT_TOS(iph->tos) } },
++ .oif = tunnel->parms.link,
++ .proto = IPPROTO_IPV6 };
+ struct rtable *rt;
+- if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) {
++ if (!ip_route_output_key(&rt, &fl)) {
+ tdev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ }
+@@ -823,19 +826,14 @@
+ }
+
+ static struct inet_protocol sit_protocol = {
+- ipip6_rcv,
+- ipip6_err,
+- 0,
+- IPPROTO_IPV6,
+- 0,
+- NULL,
+- "IPv6"
++ .handler = ipip6_rcv,
++ .err_handler = ipip6_err,
+ };
+
+ #ifdef MODULE
+ void sit_cleanup(void)
+ {
+- inet_del_protocol(&sit_protocol);
++ inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
+ unregister_netdev(&ipip6_fb_tunnel_dev);
+ }
+ #endif
+@@ -844,9 +842,13 @@
+ {
+ printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n");
+
++ if (inet_add_protocol(&sit_protocol, IPPROTO_IPV6) < 0) {
++ printk(KERN_INFO "sit init: Can't add protocol\n");
++ return -EAGAIN;
++ }
++
+ ipip6_fb_tunnel_dev.priv = (void*)&ipip6_fb_tunnel;
+ strcpy(ipip6_fb_tunnel_dev.name, ipip6_fb_tunnel.parms.name);
+ register_netdev(&ipip6_fb_tunnel_dev);
+- inet_add_protocol(&sit_protocol);
+ return 0;
+ }
+diff -Nru a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+--- a/net/ipv6/tcp_ipv6.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/tcp_ipv6.c 2005-02-13 21:25:09 +11:00
+@@ -38,6 +38,7 @@
+ #include <linux/init.h>
+ #include <linux/jhash.h>
+ #include <linux/ipsec.h>
++#include <net/xfrm.h>
+
+ #include <linux/ipv6.h>
+ #include <linux/icmpv6.h>
+@@ -553,7 +554,6 @@
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct in6_addr *saddr = NULL;
+- struct in6_addr saddr_buf;
+ struct flowi fl;
+ struct dst_entry *dst;
+ int addr_type;
+@@ -565,7 +565,8 @@
+ if (usin->sin6_family != AF_INET6)
+ return(-EAFNOSUPPORT);
+
+- fl.fl6_flowlabel = 0;
++ memset(&fl, 0, sizeof(fl));
++
+ if (np->sndflow) {
+ fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl.fl6_flowlabel);
+@@ -659,43 +660,45 @@
+ saddr = &np->rcv_saddr;
+
+ fl.proto = IPPROTO_TCP;
+- fl.fl6_dst = &np->daddr;
+- fl.fl6_src = saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
++ ipv6_addr_copy(&fl.fl6_src,
++ (saddr ? saddr : &np->saddr));
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.dport = usin->sin6_port;
+- fl.uli_u.ports.sport = sk->sport;
++ fl.fl_ip_dport = usin->sin6_port;
++ fl.fl_ip_sport = sk->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+- fl.nl_u.ip6_u.daddr = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+- dst = ip6_route_output(sk, &fl);
++ err = ip6_dst_lookup(sk, &dst, &fl);
+
+- if ((err = dst->error) != 0) {
+- dst_release(dst);
++ if (err)
+ goto failure;
+- }
+-
+- ip6_dst_store(sk, dst, NULL);
+- sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
+
+ if (saddr == NULL) {
+- err = ipv6_get_saddr(dst, &np->daddr, &saddr_buf);
+- if (err)
+- goto failure;
+-
+- saddr = &saddr_buf;
++ saddr = &fl.fl6_src;
++ ipv6_addr_copy(&np->rcv_saddr, saddr);
+ }
+
+ /* set the source address */
+- ipv6_addr_copy(&np->rcv_saddr, saddr);
+ ipv6_addr_copy(&np->saddr, saddr);
+ sk->rcv_saddr= LOOPBACK4_IPV6;
+
++ ip6_dst_store(sk, dst, NULL);
++ sk->route_caps = dst->dev->features &
++ ~(NETIF_F_IP_CSUM
++#ifdef NETIF_F_TSO
++ | NETIF_F_TSO
++#endif
++ );
++
+ tp->ext_header_len = 0;
+ if (np->opt)
+ tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen;
++ tp->ext2_header_len = dst->header_len;
++
+ tp->mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+
+ sk->dport = usin->sin6_port;
+@@ -717,8 +720,8 @@
+
+ late_failure:
+ tcp_set_state(sk, TCP_CLOSE);
+-failure:
+ __sk_dst_reset(sk);
++failure:
+ sk->dport = 0;
+ sk->route_caps = 0;
+ return err;
+@@ -781,21 +784,23 @@
+ to handle rthdr case. Ignore this complexity
+ for now.
+ */
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+- fl.nl_u.ip6_u.daddr = &np->daddr;
+- fl.nl_u.ip6_u.saddr = &np->saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
++ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.dport = sk->dport;
+- fl.uli_u.ports.sport = sk->sport;
++ fl.fl_ip_dport = sk->dport;
++ fl.fl_ip_sport = sk->sport;
+
+- dst = ip6_route_output(sk, &fl);
++ if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
++ sk->err_soft = -err;
++ goto out;
++ }
+ } else
+ dst_hold(dst);
+
+- if (dst->error) {
+- sk->err_soft = -dst->error;
+- } else if (tp->pmtu_cookie > dst->pmtu) {
+- tcp_sync_mss(sk, dst->pmtu);
++ if (tp->pmtu_cookie > dst_pmtu(dst)) {
++ tcp_sync_mss(sk, dst_pmtu(dst));
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
+ dst_release(dst);
+@@ -865,13 +870,14 @@
+ struct flowi fl;
+ int err = -1;
+
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+- fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
+- fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr;
++ ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
++ ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+ fl.fl6_flowlabel = 0;
+ fl.oif = req->af.v6_req.iif;
+- fl.uli_u.ports.dport = req->rmt_port;
+- fl.uli_u.ports.sport = sk->sport;
++ fl.fl_ip_dport = req->rmt_port;
++ fl.fl_ip_sport = sk->sport;
+
+ if (dst == NULL) {
+ opt = sk->net_pinfo.af_inet6.opt;
+@@ -886,11 +892,11 @@
+
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+- fl.nl_u.ip6_u.daddr = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+- dst = ip6_route_output(sk, &fl);
+- if (dst->error)
++ err = ip6_dst_lookup(sk, &dst, &fl);
++ if (err)
+ goto done;
+ }
+
+@@ -902,7 +908,7 @@
+ &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
+ csum_partial((char *)th, skb->len, skb->csum));
+
+- fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
++ ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+ err = ip6_xmit(sk, skb, &fl, opt);
+ if (err == NET_XMIT_CN)
+ err = 0;
+@@ -970,7 +976,7 @@
+ if (th->rst)
+ return;
+
+- if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
++ if (!ipv6_unicast_destination(skb))
+ return;
+
+ /*
+@@ -1003,24 +1009,21 @@
+
+ buff->csum = csum_partial((char *)t1, sizeof(*t1), 0);
+
+- fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr;
+- fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr;
+- fl.fl6_flowlabel = 0;
++ memset(&fl, 0, sizeof(fl));
++ ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
++ ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr);
+
+- t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr,
+- fl.nl_u.ip6_u.daddr,
++ t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
+ sizeof(*t1), IPPROTO_TCP,
+ buff->csum);
+
+ fl.proto = IPPROTO_TCP;
+ fl.oif = tcp_v6_iif(skb);
+- fl.uli_u.ports.dport = t1->dest;
+- fl.uli_u.ports.sport = t1->source;
++ fl.fl_ip_dport = t1->dest;
++ fl.fl_ip_sport = t1->source;
+
+ /* sk = NULL, but it is safe for now. RST socket required. */
+- buff->dst = ip6_route_output(NULL, &fl);
+-
+- if (buff->dst->error == 0) {
++ if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
+ ip6_xmit(NULL, buff, &fl, NULL);
+ TCP_INC_STATS_BH(TcpOutSegs);
+ TCP_INC_STATS_BH(TcpOutRsts);
+@@ -1070,23 +1073,20 @@
+
+ buff->csum = csum_partial((char *)t1, tot_len, 0);
+
+- fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->saddr;
+- fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->daddr;
+- fl.fl6_flowlabel = 0;
++ memset(&fl, 0, sizeof(fl));
++ ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr);
++ ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr);
+
+- t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr,
+- fl.nl_u.ip6_u.daddr,
++ t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
+ tot_len, IPPROTO_TCP,
+ buff->csum);
+
+ fl.proto = IPPROTO_TCP;
+ fl.oif = tcp_v6_iif(skb);
+- fl.uli_u.ports.dport = t1->dest;
+- fl.uli_u.ports.sport = t1->source;
++ fl.fl_ip_dport = t1->dest;
++ fl.fl_ip_sport = t1->source;
+
+- buff->dst = ip6_route_output(NULL, &fl);
+-
+- if (buff->dst->error == 0) {
++ if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
+ ip6_xmit(NULL, buff, &fl, NULL);
+ TCP_INC_STATS_BH(TcpOutSegs);
+ return;
+@@ -1177,8 +1177,7 @@
+ if (skb->protocol == htons(ETH_P_IP))
+ return tcp_v4_conn_request(sk, skb);
+
+- /* FIXME: do the same check for anycast */
+- if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
++ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ /*
+@@ -1248,7 +1247,6 @@
+ struct dst_entry *dst)
+ {
+ struct ipv6_pinfo *np;
+- struct flowi fl;
+ struct tcp_opt *newtp;
+ struct sock *newsk;
+ struct ipv6_txoptions *opt;
+@@ -1310,23 +1308,23 @@
+ }
+
+ if (dst == NULL) {
++ struct flowi fl;
++
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+- fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
++ ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+- fl.nl_u.ip6_u.daddr = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+- fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr;
+- fl.fl6_flowlabel = 0;
++ ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.dport = req->rmt_port;
+- fl.uli_u.ports.sport = sk->sport;
+-
+- dst = ip6_route_output(sk, &fl);
+- }
++ fl.fl_ip_dport = req->rmt_port;
++ fl.fl_ip_sport = sk->sport;
+
+- if (dst->error)
+- goto out;
++ if (ip6_dst_lookup(sk, &dst, &fl))
++ goto out;
++ }
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+@@ -1339,7 +1337,12 @@
+ MOD_INC_USE_COUNT;
+
+ ip6_dst_store(newsk, dst, NULL);
+- sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
++ newsk->route_caps = dst->dev->features&
++ ~(NETIF_F_IP_CSUM
++#ifdef NETIF_F_TSO
++ | NETIF_F_TSO
++#endif
++ );
+
+ newtp = &(newsk->tp_pinfo.af_tcp);
+
+@@ -1387,8 +1390,10 @@
+ if (np->opt)
+ newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen;
+
+- tcp_sync_mss(newsk, dst->pmtu);
+- newtp->advmss = dst->advmss;
++ newtp->ext2_header_len = dst->header_len;
++
++ tcp_sync_mss(newsk, dst_pmtu(dst));
++ newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+ tcp_initialize_rcv_mss(newsk);
+
+ newsk->daddr = LOOPBACK4_IPV6;
+@@ -1557,8 +1562,9 @@
+ return 0;
+ }
+
+-int tcp_v6_rcv(struct sk_buff *skb)
++static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ {
++ struct sk_buff *skb = *pskb;
+ struct tcphdr *th;
+ struct sock *sk;
+ int ret;
+@@ -1601,11 +1607,12 @@
+ goto no_tcp_socket;
+
+ process:
+- if(!ipsec_sk_policy(sk,skb))
+- goto discard_and_relse;
+ if(sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
++ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
++ goto discard_and_relse;
++
+ if (sk_filter(sk, skb, 0))
+ goto discard_and_relse;
+
+@@ -1621,9 +1628,12 @@
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+- return ret;
++ return ret ? -1 : 0;
+
+ no_tcp_socket:
++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
++ goto discard_and_relse;
++
+ if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ bad_packet:
+ TCP_INC_STATS_BH(TcpInErrs);
+@@ -1645,6 +1655,10 @@
+ goto discard_it;
+
+ do_time_wait:
++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
++ sock_put(sk);
++ goto discard_it;
++ }
+ if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+ TCP_INC_STATS_BH(TcpInErrs);
+ tcp_tw_put((struct tcp_tw_bucket *) sk);
+@@ -1688,30 +1702,35 @@
+ if (dst == NULL) {
+ struct flowi fl;
+
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+- fl.nl_u.ip6_u.daddr = &np->daddr;
+- fl.nl_u.ip6_u.saddr = &np->saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
++ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.dport = sk->dport;
+- fl.uli_u.ports.sport = sk->sport;
++ fl.fl_ip_dport = sk->dport;
++ fl.fl_ip_sport = sk->sport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+- fl.nl_u.ip6_u.daddr = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+- dst = ip6_route_output(sk, &fl);
++ err = ip6_dst_lookup(sk, &dst, &fl);
+
+- if (dst->error) {
+- err = dst->error;
+- dst_release(dst);
++ if (err) {
+ sk->route_caps = 0;
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+- sk->route_caps = dst->dev->features&~NETIF_F_IP_CSUM;
++ sk->route_caps = dst->dev->features&
++ ~(NETIF_F_IP_CSUM
++#ifdef NETIF_F_TSO
++ | NETIF_F_TSO
++#endif
++ );
++ tcp_sk(sk)->ext2_header_len = dst->header_len;
+ }
+
+ return 0;
+@@ -1724,38 +1743,45 @@
+ struct flowi fl;
+ struct dst_entry *dst;
+
++ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+- fl.fl6_dst = &np->daddr;
+- fl.fl6_src = &np->saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
++ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.fl6_flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.sport = sk->sport;
+- fl.uli_u.ports.dport = sk->dport;
++ fl.fl_ip_sport = sk->sport;
++ fl.fl_ip_dport = sk->dport;
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+- fl.nl_u.ip6_u.daddr = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst == NULL) {
+- dst = ip6_route_output(sk, &fl);
++ int err = ip6_dst_lookup(sk, &dst, &fl);
+
+- if (dst->error) {
+- sk->err_soft = -dst->error;
+- dst_release(dst);
+- return -sk->err_soft;
++ if (err) {
++ sk->err_soft = -err;
++ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
++ sk->route_caps = dst->dev->features &
++ ~(NETIF_F_IP_CSUM
++#ifdef NETIF_F_TSO
++ | NETIF_F_TSO
++#endif
++ );
++ tcp_sk(sk)->ext2_header_len = dst->header_len;
+ }
+
+ skb->dst = dst_clone(dst);
+
+ /* Restore final destination back after routing done */
+- fl.nl_u.ip6_u.daddr = &np->daddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+
+ return ip6_xmit(sk, skb, &fl, np->opt);
+ }
+@@ -1865,6 +1891,7 @@
+ static int tcp_v6_destroy_sock(struct sock *sk)
+ {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++ struct inet_opt *inet = inet_sk(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+@@ -1882,8 +1909,8 @@
+ tcp_put_port(sk);
+
+ /* If sendmsg cached page exists, toss it. */
+- if (tp->sndmsg_page != NULL)
+- __free_page(tp->sndmsg_page);
++ if (inet->sndmsg_page != NULL)
++ __free_page(inet->sndmsg_page);
+
+ atomic_dec(&tcp_sockets_allocated);
+
+@@ -2143,15 +2170,10 @@
+ get_port: tcp_v6_get_port,
+ };
+
+-static struct inet6_protocol tcpv6_protocol =
+-{
+- tcp_v6_rcv, /* TCP handler */
+- tcp_v6_err, /* TCP error control */
+- NULL, /* next */
+- IPPROTO_TCP, /* protocol ID */
+- 0, /* copy */
+- NULL, /* data */
+- "TCPv6" /* name */
++static struct inet6_protocol tcpv6_protocol = {
++ .handler = tcp_v6_rcv,
++ .err_handler = tcp_v6_err,
++ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+ };
+
+ extern struct proto_ops inet6_stream_ops;
+@@ -2169,6 +2191,7 @@
+ void __init tcpv6_init(void)
+ {
+ /* register inet6 protocol */
+- inet6_add_protocol(&tcpv6_protocol);
++ if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0)
++ printk(KERN_ERR "tcpv6_init: Could not register protocol\n");
+ inet6_register_protosw(&tcpv6_protosw);
+ }
+diff -Nru a/net/ipv6/udp.c b/net/ipv6/udp.c
+--- a/net/ipv6/udp.c 2005-02-13 21:25:09 +11:00
++++ b/net/ipv6/udp.c 2005-02-13 21:25:09 +11:00
+@@ -14,6 +14,7 @@
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
++ * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+@@ -50,6 +51,7 @@
+ #include <net/inet_common.h>
+
+ #include <net/checksum.h>
++#include <net/xfrm.h>
+
+ struct udp_mib udp_stats_in6[NR_CPUS*2];
+
+@@ -226,7 +228,6 @@
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+ struct in6_addr *daddr;
+- struct in6_addr saddr;
+ struct dst_entry *dst;
+ struct flowi fl;
+ struct ip6_flowlabel *flowlabel = NULL;
+@@ -246,7 +247,7 @@
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+- fl.fl6_flowlabel = 0;
++ memset(&fl, 0, sizeof(fl));
+ if (np->sndflow) {
+ fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
+@@ -271,9 +272,10 @@
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ struct sockaddr_in sin;
+
+- if (__ipv6_only_sock(sk))
+- return -ENETUNREACH;
+-
++ if (__ipv6_only_sock(sk)) {
++ err = -ENETUNREACH;
++ goto out;
++ }
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = daddr->s6_addr32[3];
+ sin.sin_port = usin->sin6_port;
+@@ -281,8 +283,8 @@
+ err = udp_connect(sk, (struct sockaddr*) &sin, sizeof(sin));
+
+ ipv4_connected:
+- if (err < 0)
+- return err;
++ if (err)
++ goto out;
+
+ ipv6_addr_set(&np->daddr, 0, 0,
+ htonl(0x0000ffff),
+@@ -299,15 +301,15 @@
+ htonl(0x0000ffff),
+ sk->rcv_saddr);
+ }
+- return 0;
++ goto out;
+ }
+
+ if (addr_type&IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ if (sk->bound_dev_if && sk->bound_dev_if != usin->sin6_scope_id) {
+- fl6_sock_release(flowlabel);
+- return -EINVAL;
++ err = -EINVAL;
++ goto out;
+ }
+ sk->bound_dev_if = usin->sin6_scope_id;
+ if (!sk->bound_dev_if && (addr_type&IPV6_ADDR_MULTICAST))
+@@ -315,8 +317,10 @@
+ }
+
+ /* Connect to link-local address requires an interface */
+- if (sk->bound_dev_if == 0)
+- return -EINVAL;
++ if (sk->bound_dev_if == 0) {
++ err = -EINVAL;
++ goto out;
++ }
+ }
+
+ ipv6_addr_copy(&np->daddr, daddr);
+@@ -330,11 +334,11 @@
+ */
+
+ fl.proto = IPPROTO_UDP;
+- fl.fl6_dst = &np->daddr;
+- fl.fl6_src = &saddr;
++ ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
++ ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+ fl.oif = sk->bound_dev_if;
+- fl.uli_u.ports.dport = sk->dport;
+- fl.uli_u.ports.sport = sk->sport;
++ fl.fl_ip_dport = sk->dport;
++ fl.fl_ip_sport = sk->sport;
+
+ if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST))
+ fl.oif = np->mcast_oif;
+@@ -342,37 +346,33 @@
+ if (flowlabel) {
+ if (flowlabel->opt && flowlabel->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt;
+- fl.fl6_dst = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+ } else if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+- fl.fl6_dst = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+- dst = ip6_route_output(sk, &fl);
+-
+- if ((err = dst->error) != 0) {
+- dst_release(dst);
+- fl6_sock_release(flowlabel);
+- return err;
+- }
++ err = ip6_dst_lookup(sk, &dst, &fl);
++ if (err)
++ goto out;
+
+- ip6_dst_store(sk, dst, fl.fl6_dst);
++ /* source address lookup done in ip6_dst_lookup */
+
+- /* get the source adddress used in the apropriate device */
++ if (ipv6_addr_any(&np->saddr))
++ ipv6_addr_copy(&np->saddr, &fl.fl6_src);
+
+- err = ipv6_get_saddr(dst, daddr, &saddr);
++ if (ipv6_addr_any(&np->rcv_saddr)) {
++ ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
++ sk->rcv_saddr = LOOPBACK4_IPV6;
++ }
+
+- if (err == 0) {
+- if(ipv6_addr_any(&np->saddr))
+- ipv6_addr_copy(&np->saddr, &saddr);
++ ip6_dst_store(sk, dst,
++ !ipv6_addr_cmp(&fl.fl6_dst, &np->daddr) ?
++ &np->daddr : NULL);
+
+- if(ipv6_addr_any(&np->rcv_saddr)) {
+- ipv6_addr_copy(&np->rcv_saddr, &saddr);
+- sk->rcv_saddr = LOOPBACK4_IPV6;
+- }
+- sk->state = TCP_ESTABLISHED;
+- }
++ sk->state = TCP_ESTABLISHED;
++out:
+ fl6_sock_release(flowlabel);
+
+ return err;
+@@ -524,6 +524,11 @@
+
+ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+ {
++ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
++ kfree_skb(skb);
++ return -1;
++ }
++
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
+ UDP6_INC_STATS_BH(UdpInErrors);
+@@ -610,8 +615,9 @@
+ read_unlock(&udp_hash_lock);
+ }
+
+-int udpv6_rcv(struct sk_buff *skb)
++static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ {
++ struct sk_buff *skb = *pskb;
+ struct sock *sk;
+ struct udphdr *uh;
+ struct net_device *dev = skb->dev;
+@@ -678,6 +684,9 @@
+ sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex);
+
+ if (sk == NULL) {
++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
++ goto discard;
++
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ goto discard;
+@@ -704,103 +713,126 @@
+ kfree_skb(skb);
+ return(0);
+ }
+-
+ /*
+- * Sending
++ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+-
+-struct udpv6fakehdr
++static void udp_v6_flush_pending_frames(struct sock *sk)
+ {
+- struct udphdr uh;
+- struct iovec *iov;
+- __u32 wcheck;
+- __u32 pl_len;
+- struct in6_addr *daddr;
+-};
++ struct udp_opt *up = udp_sk(sk);
++
++ if (up->pending) {
++ up->len = 0;
++ up->pending = 0;
++ ip6_flush_pending_frames(sk);
++ }
++}
+
+ /*
+- * with checksum
++ * Sending
+ */
+
+-static int udpv6_getfrag(const void *data, struct in6_addr *addr,
+- char *buff, unsigned int offset, unsigned int len)
++static int udp_v6_push_pending_frames(struct sock *sk, struct udp_opt *up)
+ {
+- struct udpv6fakehdr *udh = (struct udpv6fakehdr *) data;
+- char *dst;
+- int final = 0;
+- int clen = len;
++ struct sk_buff *skb;
++ struct udphdr *uh;
++ struct ipv6_pinfo *np = inet6_sk(sk);
++ struct flowi *fl = &np->cork.fl;
++ int err = 0;
+
+- dst = buff;
++ /* Grab the skbuff where UDP header space exists. */
++ if ((skb = skb_peek(&sk->write_queue)) == NULL)
++ goto out;
+
+- if (offset) {
+- offset -= sizeof(struct udphdr);
++ /*
++ * Create a UDP header
++ */
++ uh = skb->h.uh;
++ uh->source = fl->fl_ip_sport;
++ uh->dest = fl->fl_ip_dport;
++ uh->len = htons(up->len);
++ uh->check = 0;
++
++ if (sk->no_check == UDP_CSUM_NOXMIT) {
++ skb->ip_summed = CHECKSUM_NONE;
++ goto send;
++ }
++
++ if (skb_queue_len(&sk->write_queue) == 1) {
++ skb->csum = csum_partial((char *)uh,
++ sizeof(struct udphdr), skb->csum);
++ uh->check = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ up->len, fl->proto, skb->csum);
+ } else {
+- dst += sizeof(struct udphdr);
+- final = 1;
+- clen -= sizeof(struct udphdr);
+- }
++ u32 tmp_csum = 0;
+
+- if (csum_partial_copy_fromiovecend(dst, udh->iov, offset,
+- clen, &udh->wcheck))
+- return -EFAULT;
+-
+- if (final) {
+- struct in6_addr *daddr;
+-
+- udh->wcheck = csum_partial((char *)udh, sizeof(struct udphdr),
+- udh->wcheck);
+-
+- if (udh->daddr) {
+- daddr = udh->daddr;
+- } else {
+- /*
+- * use packet destination address
+- * this should improve cache locality
+- */
+- daddr = addr + 1;
+- }
+- udh->uh.check = csum_ipv6_magic(addr, daddr,
+- udh->pl_len, IPPROTO_UDP,
+- udh->wcheck);
+- if (udh->uh.check == 0)
+- udh->uh.check = -1;
++ skb_queue_walk(&sk->write_queue, skb) {
++ tmp_csum = csum_add(tmp_csum, skb->csum);
++ }
++ tmp_csum = csum_partial((char *)uh,
++ sizeof(struct udphdr), tmp_csum);
++ tmp_csum = csum_ipv6_magic(&fl->fl6_src,
++ &fl->fl6_dst,
++ up->len, fl->proto, tmp_csum);
++ uh->check = tmp_csum;
+
+- memcpy(buff, udh, sizeof(struct udphdr));
+ }
+- return 0;
++ if (uh->check == 0)
++ uh->check = -1;
++
++send:
++ err = ip6_push_pending_frames(sk);
++out:
++ up->len = 0;
++ up->pending = 0;
++ return err;
+ }
+
+-static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
++static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+ {
+ struct ipv6_txoptions opt_space;
+- struct udpv6fakehdr udh;
++ struct udp_opt *up = udp_sk(sk);
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
++ struct in6_addr *daddr;
+ struct ipv6_txoptions *opt = NULL;
+ struct ip6_flowlabel *flowlabel = NULL;
+- struct flowi fl;
++ struct flowi *fl = &np->cork.fl;
++ struct dst_entry *dst;
+ int addr_len = msg->msg_namelen;
+- struct in6_addr *daddr;
+- int len = ulen + sizeof(struct udphdr);
++ int ulen = len;
+ int addr_type;
+ int hlimit = -1;
+-
++ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int err;
+
+ /* Rough check on arithmetic overflow,
+ better check is made in ip6_build_xmit
+ */
+- if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr))
++ if (len < 0 || len > INT_MAX - sizeof(struct udphdr))
+ return -EMSGSIZE;
+
+- fl.fl6_flowlabel = 0;
+- fl.oif = 0;
++ if (up->pending) {
++ /*
++ * There are pending frames.
++ * The socket lock must be held while it's corked.
++ */
++ lock_sock(sk);
++ if (likely(up->pending)) {
++ dst = NULL;
++ goto do_append_data;
++ }
++ release_sock(sk);
++ }
++ ulen += sizeof(struct udphdr);
++
++ memset(fl, 0, sizeof(fl));
+
+ if (sin6) {
+ if (sin6->sin6_family == AF_INET) {
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+- return udp_sendmsg(sk, msg, ulen);
++ return udp_sendmsg(sk, msg, len);
+ }
+
+ if (addr_len < SIN6_LEN_RFC2133)
+@@ -812,13 +844,13 @@
+ if (sin6->sin6_port == 0)
+ return -EINVAL;
+
+- udh.uh.dest = sin6->sin6_port;
++ up->dport = sin6->sin6_port;
+ daddr = &sin6->sin6_addr;
+
+ if (np->sndflow) {
+- fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+- if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
+- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
++ fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
++ if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) {
++ flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ daddr = &flowlabel->dst;
+@@ -833,14 +865,14 @@
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ sin6->sin6_scope_id &&
+ ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
+- fl.oif = sin6->sin6_scope_id;
++ fl->oif = sin6->sin6_scope_id;
+ } else {
+ if (sk->state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+
+- udh.uh.dest = sk->dport;
++ up->dport = sk->dport;
+ daddr = &sk->net_pinfo.af_inet6.daddr;
+- fl.fl6_flowlabel = np->flow_label;
++ fl->fl6_flowlabel = np->flow_label;
+ }
+
+ addr_type = ipv6_addr_type(daddr);
+@@ -853,30 +885,28 @@
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = daddr->s6_addr32[3];
+- sin.sin_port = udh.uh.dest;
++ sin.sin_port = up->dport;
+ msg->msg_name = (struct sockaddr *)(&sin);
+ msg->msg_namelen = sizeof(sin);
+ fl6_sock_release(flowlabel);
+
+- return udp_sendmsg(sk, msg, ulen);
++ return udp_sendmsg(sk, msg, len);
+ }
+
+- udh.daddr = NULL;
+- if (!fl.oif)
+- fl.oif = sk->bound_dev_if;
+- fl.fl6_src = NULL;
++ if (!fl->oif)
++ fl->oif = sk->bound_dev_if;
+
+ if (msg->msg_controllen) {
+ opt = &opt_space;
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
+
+- err = datagram_send_ctl(msg, &fl, opt, &hlimit);
++ err = datagram_send_ctl(msg, fl, opt, &hlimit);
+ if (err < 0) {
+ fl6_sock_release(flowlabel);
+ return err;
+ }
+- if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
++ if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
++ flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ }
+@@ -887,44 +917,188 @@
+ opt = np->opt;
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+- if (opt && opt->srcrt)
+- udh.daddr = daddr;
+
+- udh.uh.source = sk->sport;
+- udh.uh.len = len < 0x10000 ? htons(len) : 0;
+- udh.uh.check = 0;
+- udh.iov = msg->msg_iov;
+- udh.wcheck = 0;
+- udh.pl_len = len;
++ fl->proto = IPPROTO_UDP;
++ ipv6_addr_copy(&fl->fl6_dst, daddr);
++ if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr))
++ ipv6_addr_copy(&fl->fl6_src, &np->saddr);
++ fl->fl_ip_dport = up->dport;
++ fl->fl_ip_sport = sk->sport;
++
++ /* merge ip6_build_xmit from ip6_output */
++ if (opt && opt->srcrt) {
++ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
++ ipv6_addr_copy(&fl->fl6_dst, rt0->addr);
++ }
+
+- fl.proto = IPPROTO_UDP;
+- fl.fl6_dst = daddr;
+- if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr))
+- fl.fl6_src = &np->saddr;
+- fl.uli_u.ports.dport = udh.uh.dest;
+- fl.uli_u.ports.sport = udh.uh.source;
++ if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst))
++ fl->oif = np->mcast_oif;
++
++ err = ip6_dst_lookup(sk, &dst, fl);
++ if (err)
++ goto out;
+
+- err = ip6_build_xmit(sk, udpv6_getfrag, &udh, &fl, len, opt, hlimit,
+- msg->msg_flags);
++ if (hlimit < 0) {
++ if (ipv6_addr_is_multicast(&fl->fl6_dst))
++ hlimit = np->mcast_hops;
++ else
++ hlimit = np->hop_limit;
++ if (hlimit < 0)
++ hlimit = dst_metric(dst, RTAX_HOPLIMIT);
++ }
++
++ if (msg->msg_flags&MSG_CONFIRM)
++ goto do_confirm;
++back_from_confirm:
++
++ lock_sock(sk);
++ if (unlikely(up->pending)) {
++ /* The socket is already corked while preparing it. */
++ /* ... which is an evident application bug. --ANK */
++ release_sock(sk);
+
++ NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
++ err = -EINVAL;
++ goto out;
++ }
++
++ up->pending = 1;
++
++do_append_data:
++ up->len += ulen;
++ err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr),
++ hlimit, opt, fl, (struct rt6_info*)dst,
++ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
++ if (err)
++ udp_v6_flush_pending_frames(sk);
++ else if (!corkreq)
++ err = udp_v6_push_pending_frames(sk, up);
++
++ if (dst)
++ ip6_dst_store(sk, dst,
++ !ipv6_addr_cmp(&fl->fl6_dst, &np->daddr) ?
++ &np->daddr : NULL);
++ if (err > 0)
++ err = np->recverr ? net_xmit_errno(err) : 0;
++ release_sock(sk);
++out:
+ fl6_sock_release(flowlabel);
++ if (!err) {
++ UDP6_INC_STATS_USER(UdpOutDatagrams);
++ return len;
++ }
++ return err;
++
++do_confirm:
++ dst_confirm(dst);
++ if (!(msg->msg_flags&MSG_PROBE) || len)
++ goto back_from_confirm;
++ err = 0;
++ goto out;
++}
++
++static int udpv6_destroy_sock(struct sock *sk)
++{
++ lock_sock(sk);
++ udp_v6_flush_pending_frames(sk);
++ release_sock(sk);
+
+- if (err < 0)
+- return err;
++ inet6_destroy_sock(sk);
+
+- UDP6_INC_STATS_USER(UdpOutDatagrams);
+- return ulen;
++ return 0;
+ }
+
+-static struct inet6_protocol udpv6_protocol =
++/*
++ * Socket option code for UDP
++ */
++static int udpv6_setsockopt(struct sock *sk, int level, int optname,
++ char *optval, int optlen)
+ {
+- udpv6_rcv, /* UDP handler */
+- udpv6_err, /* UDP error control */
+- NULL, /* next */
+- IPPROTO_UDP, /* protocol ID */
+- 0, /* copy */
+- NULL, /* data */
+- "UDPv6" /* name */
++ struct udp_opt *up = udp_sk(sk);
++ int val;
++ int err = 0;
++
++ if (level != SOL_UDP)
++ return ipv6_setsockopt(sk, level, optname, optval, optlen);
++
++ if(optlen<sizeof(int))
++ return -EINVAL;
++
++ if (get_user(val, (int *)optval))
++ return -EFAULT;
++
++ switch(optname) {
++ case UDP_CORK:
++ if (val != 0) {
++ up->corkflag = 1;
++ } else {
++ up->corkflag = 0;
++ lock_sock(sk);
++ udp_v6_push_pending_frames(sk, up);
++ release_sock(sk);
++ }
++ break;
++
++ case UDP_ENCAP:
++ switch (val) {
++ case 0:
++ up->encap_type = val;
++ break;
++ default:
++ err = -ENOPROTOOPT;
++ break;
++ }
++ break;
++
++ default:
++ err = -ENOPROTOOPT;
++ break;
++ };
++
++ return err;
++}
++
++static int udpv6_getsockopt(struct sock *sk, int level, int optname,
++ char *optval, int *optlen)
++{
++ struct udp_opt *up = udp_sk(sk);
++ int val, len;
++
++ if (level != SOL_UDP)
++ return ipv6_getsockopt(sk, level, optname, optval, optlen);
++
++ if(get_user(len,optlen))
++ return -EFAULT;
++
++ len = min_t(unsigned int, len, sizeof(int));
++
++ if(len < 0)
++ return -EINVAL;
++
++ switch(optname) {
++ case UDP_CORK:
++ val = up->corkflag;
++ break;
++
++ case UDP_ENCAP:
++ val = up->encap_type;
++ break;
++
++ default:
++ return -ENOPROTOOPT;
++ };
++
++ if(put_user(len, optlen))
++ return -EFAULT;
++ if(copy_to_user(optval, &val,len))
++ return -EFAULT;
++ return 0;
++}
++
++static struct inet6_protocol udpv6_protocol = {
++ .handler = udpv6_rcv,
++ .err_handler = udpv6_err,
++ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+ };
+
+ #define LINE_LEN 190
+@@ -1001,20 +1175,20 @@
+ }
+
+ struct proto udpv6_prot = {
+- name: "UDP",
+- close: udpv6_close,
+- connect: udpv6_connect,
+- disconnect: udp_disconnect,
+- ioctl: udp_ioctl,
+- destroy: inet6_destroy_sock,
+- setsockopt: ipv6_setsockopt,
+- getsockopt: ipv6_getsockopt,
+- sendmsg: udpv6_sendmsg,
+- recvmsg: udpv6_recvmsg,
+- backlog_rcv: udpv6_queue_rcv_skb,
+- hash: udp_v6_hash,
+- unhash: udp_v6_unhash,
+- get_port: udp_v6_get_port,
++ .name = "UDP",
++ .close = udpv6_close,
++ .connect = udpv6_connect,
++ .disconnect = udp_disconnect,
++ .ioctl = udp_ioctl,
++ .destroy = udpv6_destroy_sock,
++ .setsockopt = udpv6_setsockopt,
++ .getsockopt = udpv6_getsockopt,
++ .sendmsg = udpv6_sendmsg,
++ .recvmsg = udpv6_recvmsg,
++ .backlog_rcv = udpv6_queue_rcv_skb,
++ .hash = udp_v6_hash,
++ .unhash = udp_v6_unhash,
++ .get_port = udp_v6_get_port,
+ };
+
+ extern struct proto_ops inet6_dgram_ops;
+@@ -1032,6 +1206,7 @@
+
+ void __init udpv6_init(void)
+ {
+- inet6_add_protocol(&udpv6_protocol);
++ if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0)
++ printk(KERN_ERR "udpv6_init: Could not register protocol\n");
+ inet6_register_protosw(&udpv6_protosw);
+ }
+diff -Nru a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/xfrm6_input.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,148 @@
++/*
++ * xfrm6_input.c: based on net/ipv4/xfrm4_input.c
++ *
++ * Authors:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * YOSHIFUJI Hideaki @USAGI
++ * IPv6 support
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <net/inet_ecn.h>
++#include <net/ip.h>
++#include <net/ipv6.h>
++#include <net/xfrm.h>
++
++static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
++{
++ struct ipv6hdr *outer_iph = skb->nh.ipv6h;
++ struct ipv6hdr *inner_iph = skb->h.ipv6h;
++
++ if (INET_ECN_is_ce(ip6_get_dsfield(outer_iph)) &&
++ INET_ECN_is_not_ce(ip6_get_dsfield(inner_iph)))
++ IP6_ECN_set_ce(inner_iph);
++}
++
++int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
++{
++ struct sk_buff *skb = *pskb;
++ int err;
++ u32 seq;
++ struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
++ struct xfrm_state *x;
++ int xfrm_nr = 0;
++ int decaps = 0;
++ int nexthdr;
++ unsigned int nhoff;
++
++ nhoff = *nhoffp;
++ nexthdr = skb->nh.raw[nhoff];
++
++ seq = 0;
++ if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0)
++ goto drop;
++
++ do {
++ struct ipv6hdr *iph = skb->nh.ipv6h;
++
++ if (xfrm_nr == XFRM_MAX_DEPTH)
++ goto drop;
++
++ x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, nexthdr, AF_INET6);
++ if (x == NULL)
++ goto drop;
++ spin_lock(&x->lock);
++ if (unlikely(x->km.state != XFRM_STATE_VALID))
++ goto drop_unlock;
++
++ if (x->props.replay_window && xfrm_replay_check(x, seq))
++ goto drop_unlock;
++
++ if (xfrm_state_check_expire(x))
++ goto drop_unlock;
++
++ nexthdr = x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb);
++ if (nexthdr <= 0)
++ goto drop_unlock;
++
++ skb->nh.raw[nhoff] = nexthdr;
++
++ if (x->props.replay_window)
++ xfrm_replay_advance(x, seq);
++
++ x->curlft.bytes += skb->len;
++ x->curlft.packets++;
++
++ spin_unlock(&x->lock);
++
++ xfrm_vec[xfrm_nr++].xvec = x;
++
++ if (x->props.mode) { /* XXX */
++ if (nexthdr != IPPROTO_IPV6)
++ goto drop;
++ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
++ goto drop;
++ if (skb_cloned(skb) &&
++ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ goto drop;
++ if (!(x->props.flags & XFRM_STATE_NOECN))
++ ipip6_ecn_decapsulate(skb);
++ skb->mac.raw = memmove(skb->data - skb->mac_len,
++ skb->mac.raw, skb->mac_len);
++ skb->nh.raw = skb->data;
++ decaps = 1;
++ break;
++ }
++
++ if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) < 0)
++ goto drop;
++ } while (!err);
++
++ /* Allocate new secpath or COW existing one. */
++ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
++ struct sec_path *sp;
++ sp = secpath_dup(skb->sp);
++ if (!sp)
++ goto drop;
++ if (skb->sp)
++ secpath_put(skb->sp);
++ skb->sp = sp;
++ }
++
++ if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
++ goto drop;
++
++ memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
++ skb->sp->len += xfrm_nr;
++ skb->ip_summed = CHECKSUM_NONE;
++
++ if (decaps) {
++ if (!(skb->dev->flags&IFF_LOOPBACK)) {
++ dst_release(skb->dst);
++ skb->dst = NULL;
++ }
++ netif_rx(skb);
++ return -1;
++ } else {
++ return 1;
++ }
++
++drop_unlock:
++ spin_unlock(&x->lock);
++ xfrm_state_put(x);
++drop:
++ while (--xfrm_nr >= 0)
++ xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
++ kfree_skb(skb);
++ return -1;
++}
++
++EXPORT_SYMBOL(xfrm6_rcv_spi);
++
++int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
++{
++ return xfrm6_rcv_spi(pskb, nhoffp, 0);
++}
+diff -Nru a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/xfrm6_output.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,137 @@
++/*
++ * xfrm6_output.c - Common IPsec encapsulation code for IPv6.
++ * Copyright (C) 2002 USAGI/WIDE Project
++ * Copyright (c) 2004 Herbert Xu <herbert at gondor.apana.org.au>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/skbuff.h>
++#include <linux/spinlock.h>
++#include <linux/icmpv6.h>
++#include <net/inet_ecn.h>
++#include <net/ipv6.h>
++#include <net/xfrm.h>
++
++/* Add encapsulation header.
++ *
++ * In transport mode, the IP header and mutable extension headers will be moved
++ * forward to make space for the encapsulation header.
++ *
++ * In tunnel mode, the top IP header will be constructed per RFC 2401.
++ * The following fields in it shall be filled in by x->type->output:
++ * payload_len
++ *
++ * On exit, skb->h will be set to the start of the encapsulation header to be
++ * filled in by x->type->output and skb->nh will be set to the nextheader field
++ * of the extension header directly preceding the encapsulation header, or in
++ * its absence, that of the top IP header. The value of skb->data will always
++ * point to the top IP header.
++ */
++static void xfrm6_encap(struct sk_buff *skb)
++{
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ struct ipv6hdr *iph, *top_iph;
++
++ skb_push(skb, x->props.header_len);
++ iph = skb->nh.ipv6h;
++
++ if (!x->props.mode) {
++ u8 *prevhdr;
++ int hdr_len;
++
++ hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
++ skb->nh.raw = prevhdr - x->props.header_len;
++ skb->h.raw = skb->data + hdr_len;
++ memmove(skb->data, iph, hdr_len);
++ return;
++ }
++
++ skb->nh.raw = skb->data;
++ top_iph = skb->nh.ipv6h;
++ skb->nh.raw = &top_iph->nexthdr;
++ skb->h.ipv6h = top_iph + 1;
++
++ top_iph->version = 6;
++ top_iph->priority = iph->priority;
++ if (x->props.flags & XFRM_STATE_NOECN)
++ IP6_ECN_clear(top_iph);
++ top_iph->flow_lbl[0] = iph->flow_lbl[0];
++ top_iph->flow_lbl[1] = iph->flow_lbl[1];
++ top_iph->flow_lbl[2] = iph->flow_lbl[2];
++ top_iph->nexthdr = IPPROTO_IPV6;
++ top_iph->hop_limit = dst_path_metric(dst, RTAX_HOPLIMIT);
++ ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
++ ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
++}
++
++static int xfrm6_tunnel_check_size(struct sk_buff *skb)
++{
++ int mtu, ret = 0;
++ struct dst_entry *dst = skb->dst;
++
++ mtu = dst_pmtu(dst) - dst->header_len - dst->trailer_len;
++ if (mtu < IPV6_MIN_MTU)
++ mtu = IPV6_MIN_MTU;
++
++ if (skb->len > mtu) {
++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
++ ret = -EMSGSIZE;
++ }
++
++ return ret;
++}
++
++int xfrm6_output(struct sk_buff *skb)
++{
++ struct dst_entry *dst = skb->dst;
++ struct xfrm_state *x = dst->xfrm;
++ int err;
++
++ if (skb->ip_summed == CHECKSUM_HW && skb_checksum_help(skb) == NULL) {
++ err = -EINVAL;
++ goto error_nolock;
++ }
++
++ spin_lock_bh(&x->lock);
++ err = xfrm_state_check(x, skb);
++ if (err)
++ goto error;
++
++ if (x->props.mode) {
++ err = xfrm6_tunnel_check_size(skb);
++ if (err)
++ goto error;
++ }
++
++ xfrm6_encap(skb);
++
++ err = x->type->output(skb);
++ if (err)
++ goto error;
++
++ x->curlft.bytes += skb->len;
++ x->curlft.packets++;
++
++ spin_unlock_bh(&x->lock);
++
++ skb->nh.raw = skb->data;
++
++ if (!(skb->dst = dst_pop(dst))) {
++ err = -EHOSTUNREACH;
++ goto error_nolock;
++ }
++ err = NET_XMIT_BYPASS;
++
++out_exit:
++ return err;
++error:
++ spin_unlock_bh(&x->lock);
++error_nolock:
++ kfree_skb(skb);
++ goto out_exit;
++}
+diff -Nru a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/xfrm6_policy.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,307 @@
++/*
++ * xfrm6_policy.c: based on xfrm4_policy.c
++ *
++ * Authors:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * IPv6 support
++ * YOSHIFUJI Hideaki
++ * Split up af-specific portion
++ *
++ */
++
++#include <linux/config.h>
++#include <net/xfrm.h>
++#include <net/ip.h>
++#include <net/ipv6.h>
++#include <net/ip6_route.h>
++
++static struct dst_ops xfrm6_dst_ops;
++static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
++
++static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
++
++static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
++{
++ int err = 0;
++ *dst = (struct xfrm_dst*)ip6_route_output(NULL, fl);
++ if (!*dst)
++ err = -ENETUNREACH;
++ return err;
++}
++
++/* Check that the bundle accepts the flow and its components are
++ * still valid.
++ */
++
++static int __xfrm6_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl)
++{
++ do {
++ if (xdst->u.dst.ops != &xfrm6_dst_ops)
++ return 1;
++
++ if (!xfrm_selector_match(&xdst->u.dst.xfrm->sel, fl, AF_INET6))
++ return 0;
++ if (xdst->u.dst.xfrm->km.state != XFRM_STATE_VALID ||
++ xdst->u.dst.path->obsolete > 0)
++ return 0;
++ xdst = (struct xfrm_dst*)xdst->u.dst.child;
++ } while (xdst);
++ return 0;
++}
++
++static struct dst_entry *
++__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
++{
++ struct dst_entry *dst;
++ u32 ndisc_bit = 0;
++
++ if (fl->proto == IPPROTO_ICMPV6 &&
++ (fl->fl_icmp_type == NDISC_NEIGHBOUR_ADVERTISEMENT ||
++ fl->fl_icmp_type == NDISC_NEIGHBOUR_SOLICITATION ||
++ fl->fl_icmp_type == NDISC_ROUTER_SOLICITATION))
++ ndisc_bit = RTF_NDISC;
++
++ /* Still not clear if we should set fl->fl6_{src,dst}... */
++ read_lock_bh(&policy->lock);
++ for (dst = policy->bundles; dst; dst = dst->next) {
++ struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
++ struct in6_addr fl_dst_prefix, fl_src_prefix;
++
++ if ((xdst->u.rt6.rt6i_flags & RTF_NDISC) != ndisc_bit)
++ continue;
++
++ ipv6_addr_prefix(&fl_dst_prefix,
++ &fl->fl6_dst,
++ xdst->u.rt6.rt6i_dst.plen);
++ ipv6_addr_prefix(&fl_src_prefix,
++ &fl->fl6_src,
++ xdst->u.rt6.rt6i_src.plen);
++ if (!ipv6_addr_cmp(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
++ !ipv6_addr_cmp(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
++ __xfrm6_bundle_ok(xdst, fl)) {
++ dst_clone(dst);
++ break;
++ }
++ }
++ read_unlock_bh(&policy->lock);
++ return dst;
++}
++
++/* Allocate chain of dst_entry's, attach known xfrm's, calculate
++ * all the metrics... Shortly, bundle a bundle.
++ */
++
++static int
++__xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
++ struct flowi *fl, struct dst_entry **dst_p)
++{
++ struct dst_entry *dst, *dst_prev;
++ struct rt6_info *rt0 = (struct rt6_info*)(*dst_p);
++ struct rt6_info *rt = rt0;
++ struct in6_addr *remote = &fl->fl6_dst;
++ struct in6_addr *local = &fl->fl6_src;
++ int i;
++ int err = 0;
++ int header_len = 0;
++ int trailer_len = 0;
++
++ dst = dst_prev = NULL;
++
++ for (i = 0; i < nx; i++) {
++ struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops);
++
++ if (unlikely(dst1 == NULL)) {
++ err = -ENOBUFS;
++ goto error;
++ }
++
++ if (!dst)
++ dst = dst1;
++ else {
++ dst_prev->child = dst1;
++ dst1->flags |= DST_NOHASH;
++ dst_clone(dst1);
++ }
++ dst_prev = dst1;
++ if (xfrm[i]->props.mode) {
++ remote = (struct in6_addr*)&xfrm[i]->id.daddr;
++ local = (struct in6_addr*)&xfrm[i]->props.saddr;
++ }
++ header_len += xfrm[i]->props.header_len;
++ trailer_len += xfrm[i]->props.trailer_len;
++ }
++
++ if (ipv6_addr_cmp(remote, &fl->fl6_dst)) {
++ struct flowi fl_tunnel;
++
++ memset(&fl_tunnel, 0, sizeof(fl_tunnel));
++ ipv6_addr_copy(&fl_tunnel.fl6_dst, remote);
++ ipv6_addr_copy(&fl_tunnel.fl6_src, local);
++
++ err = xfrm_dst_lookup((struct xfrm_dst **) &rt,
++ &fl_tunnel, AF_INET6);
++ if (err)
++ goto error;
++ } else {
++ dst_hold(&rt->u.dst);
++ }
++ dst_prev->child = &rt->u.dst;
++ i = 0;
++ for (dst_prev = dst; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
++ struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
++
++ dst_prev->xfrm = xfrm[i++];
++ dst_prev->dev = rt->u.dst.dev;
++ if (rt->u.dst.dev)
++ dev_hold(rt->u.dst.dev);
++ dst_prev->obsolete = -1;
++ dst_prev->flags |= DST_HOST;
++ dst_prev->lastuse = jiffies;
++ dst_prev->header_len = header_len;
++ dst_prev->trailer_len = trailer_len;
++ memcpy(&dst_prev->metrics, &rt->u.dst.metrics, sizeof(dst_prev->metrics));
++ dst_prev->path = &rt->u.dst;
++
++ /* Copy neighbour for reachability confirmation */
++ dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour);
++ dst_prev->input = rt->u.dst.input;
++ dst_prev->output = xfrm6_output;
++ /* Sheit... I remember I did this right. Apparently,
++ * it was magically lost, so this code needs audit */
++ x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL|RTF_NDISC);
++ x->u.rt6.rt6i_metric = rt0->rt6i_metric;
++ x->u.rt6.rt6i_node = rt0->rt6i_node;
++ x->u.rt6.rt6i_gateway = rt0->rt6i_gateway;
++ memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway));
++ x->u.rt6.rt6i_dst = rt0->rt6i_dst;
++ x->u.rt6.rt6i_src = rt0->rt6i_src;
++ header_len -= x->u.dst.xfrm->props.header_len;
++ trailer_len -= x->u.dst.xfrm->props.trailer_len;
++ }
++ *dst_p = dst;
++ return 0;
++
++error:
++ if (dst)
++ dst_free(dst);
++ return err;
++}
++
++static inline void
++_decode_session6(struct sk_buff *skb, struct flowi *fl)
++{
++ u16 offset = sizeof(struct ipv6hdr);
++ struct ipv6hdr *hdr = skb->nh.ipv6h;
++ struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
++ u8 nexthdr = skb->nh.ipv6h->nexthdr;
++
++ memset(fl, 0, sizeof(struct flowi));
++ ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr);
++ ipv6_addr_copy(&fl->fl6_src, &hdr->saddr);
++
++ while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) {
++ switch (nexthdr) {
++ case NEXTHDR_ROUTING:
++ case NEXTHDR_HOP:
++ case NEXTHDR_DEST:
++ offset += ipv6_optlen(exthdr);
++ nexthdr = exthdr->nexthdr;
++ exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
++ break;
++
++ case IPPROTO_UDP:
++ case IPPROTO_TCP:
++ case IPPROTO_SCTP:
++ if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) {
++ u16 *ports = (u16 *)exthdr;
++
++ fl->fl_ip_sport = ports[0];
++ fl->fl_ip_dport = ports[1];
++ }
++ fl->proto = nexthdr;
++ return;
++
++ case IPPROTO_ICMPV6:
++ if (pskb_may_pull(skb, skb->nh.raw + offset + 2 - skb->data)) {
++ u8 *icmp = (u8 *)exthdr;
++
++ fl->fl_icmp_type = icmp[0];
++ fl->fl_icmp_code = icmp[1];
++ }
++ fl->proto = nexthdr;
++ return;
++
++ /* XXX Why are there these headers? */
++ case IPPROTO_AH:
++ case IPPROTO_ESP:
++ case IPPROTO_COMP:
++ default:
++ fl->fl_ipsec_spi = 0;
++ fl->proto = nexthdr;
++ return;
++ };
++ }
++}
++
++static inline int xfrm6_garbage_collect(void)
++{
++ read_lock(&xfrm6_policy_afinfo.lock);
++ xfrm6_policy_afinfo.garbage_collect();
++ read_unlock(&xfrm6_policy_afinfo.lock);
++ return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2);
++}
++
++static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
++{
++ struct dst_entry *path = dst->path;
++
++ if (mtu >= 1280 && mtu < dst_pmtu(dst))
++ return;
++
++ path->ops->update_pmtu(path, mtu);
++}
++
++static struct dst_ops xfrm6_dst_ops = {
++ .family = AF_INET6,
++ .protocol = __constant_htons(ETH_P_IPV6),
++ .gc = xfrm6_garbage_collect,
++ .update_pmtu = xfrm6_update_pmtu,
++ .gc_thresh = 1024,
++ .entry_size = sizeof(struct xfrm_dst),
++};
++
++static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
++ .family = AF_INET6,
++ .lock = RW_LOCK_UNLOCKED,
++ .type_map = &xfrm6_type_map,
++ .dst_ops = &xfrm6_dst_ops,
++ .dst_lookup = xfrm6_dst_lookup,
++ .find_bundle = __xfrm6_find_bundle,
++ .bundle_create = __xfrm6_bundle_create,
++ .decode_session = _decode_session6,
++};
++
++static void __init xfrm6_policy_init(void)
++{
++ xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
++}
++
++static void __exit xfrm6_policy_fini(void)
++{
++ xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo);
++}
++
++void __init xfrm6_init(void)
++{
++ xfrm6_policy_init();
++ xfrm6_state_init();
++}
++
++void __exit xfrm6_fini(void)
++{
++ //xfrm6_input_fini();
++ xfrm6_policy_fini();
++ xfrm6_state_fini();
++}
+diff -Nru a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/xfrm6_state.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,132 @@
++/*
++ * xfrm6_state.c: based on xfrm4_state.c
++ *
++ * Authors:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * IPv6 support
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific portion
++ *
++ */
++
++#include <net/xfrm.h>
++#include <linux/pfkeyv2.h>
++#include <linux/ipsec.h>
++#include <net/ipv6.h>
++
++extern struct xfrm_state_afinfo xfrm6_state_afinfo;
++
++static void
++__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
++ struct xfrm_tmpl *tmpl,
++ xfrm_address_t *daddr, xfrm_address_t *saddr)
++{
++ /* Initialize temporary selector matching only
++ * to current session. */
++ ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst);
++ ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src);
++ x->sel.dport = fl->fl_ip_dport;
++ x->sel.dport_mask = ~0;
++ x->sel.sport = fl->fl_ip_sport;
++ x->sel.sport_mask = ~0;
++ x->sel.prefixlen_d = 128;
++ x->sel.prefixlen_s = 128;
++ x->sel.proto = fl->proto;
++ x->sel.ifindex = fl->oif;
++ x->id = tmpl->id;
++ if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
++ memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
++ memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
++ if (ipv6_addr_any((struct in6_addr*)&x->props.saddr))
++ memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
++ x->props.mode = tmpl->mode;
++ x->props.reqid = tmpl->reqid;
++ x->props.family = AF_INET6;
++}
++
++static struct xfrm_state *
++__xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
++{
++ unsigned h = __xfrm6_spi_hash(daddr, spi, proto);
++ struct xfrm_state *x;
++
++ list_for_each_entry(x, xfrm6_state_afinfo.state_byspi+h, byspi) {
++ if (x->props.family == AF_INET6 &&
++ spi == x->id.spi &&
++ !ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
++ proto == x->id.proto) {
++ xfrm_state_hold(x);
++ return x;
++ }
++ }
++ return NULL;
++}
++
++static struct xfrm_state *
++__xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ int create)
++{
++ struct xfrm_state *x, *x0;
++ unsigned h = __xfrm6_dst_hash(daddr);
++
++ x0 = NULL;
++
++ list_for_each_entry(x, xfrm6_state_afinfo.state_bydst+h, bydst) {
++ if (x->props.family == AF_INET6 &&
++ !ipv6_addr_cmp((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
++ mode == x->props.mode &&
++ proto == x->id.proto &&
++ !ipv6_addr_cmp((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) &&
++ reqid == x->props.reqid &&
++ x->km.state == XFRM_STATE_ACQ &&
++ !x->id.spi) {
++ x0 = x;
++ break;
++ }
++ }
++ if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
++ memcpy(x0->sel.daddr.a6, daddr, sizeof(struct in6_addr));
++ memcpy(x0->sel.saddr.a6, saddr, sizeof(struct in6_addr));
++ x0->sel.prefixlen_d = 128;
++ x0->sel.prefixlen_s = 128;
++ memcpy(x0->props.saddr.a6, saddr, sizeof(struct in6_addr));
++ x0->km.state = XFRM_STATE_ACQ;
++ memcpy(x0->id.daddr.a6, daddr, sizeof(struct in6_addr));
++ x0->id.proto = proto;
++ x0->props.family = AF_INET6;
++ x0->props.mode = mode;
++ x0->props.reqid = reqid;
++ x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
++ xfrm_state_hold(x0);
++ x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
++ add_timer(&x0->timer);
++ xfrm_state_hold(x0);
++ list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h);
++ wake_up(&km_waitq);
++ }
++ if (x0)
++ xfrm_state_hold(x0);
++ return x0;
++}
++
++static struct xfrm_state_afinfo xfrm6_state_afinfo = {
++ .family = AF_INET6,
++ .lock = RW_LOCK_UNLOCKED,
++ .init_tempsel = __xfrm6_init_tempsel,
++ .state_lookup = __xfrm6_state_lookup,
++ .find_acq = __xfrm6_find_acq,
++};
++
++void __init xfrm6_state_init(void)
++{
++ xfrm_state_register_afinfo(&xfrm6_state_afinfo);
++}
++
++void __exit xfrm6_state_fini(void)
++{
++ xfrm_state_unregister_afinfo(&xfrm6_state_afinfo);
++}
++
+diff -Nru a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/ipv6/xfrm6_tunnel.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,543 @@
++/*
++ * Copyright (C)2003,2004 USAGI/WIDE Project
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ * Authors Mitsuru KANDA <mk at linux-ipv6.org>
++ * YOSHIFUJI Hideaki <yoshfuji at linux-ipv6.org>
++ *
++ * Based on net/ipv4/xfrm4_tunnel.c
++ *
++ */
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/xfrm.h>
++#include <linux/list.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++#include <net/ipv6.h>
++#include <net/protocol.h>
++#include <linux/ipv6.h>
++#include <linux/icmpv6.h>
++
++#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG
++# define X6TDEBUG 3
++#else
++# define X6TDEBUG 1
++#endif
++
++#define X6TPRINTK(fmt, args...) printk(fmt, ## args)
++#define X6TNOPRINTK(fmt, args...) do { ; } while(0)
++
++#if X6TDEBUG >= 1
++# define X6TPRINTK1 X6TPRINTK
++#else
++# define X6TPRINTK1 X6TNOPRINTK
++#endif
++
++#if X6TDEBUG >= 3
++# define X6TPRINTK3 X6TPRINTK
++#else
++# define X6TPRINTK3 X6TNOPRINTK
++#endif
++
++/*
++ * xfrm_tunnel_spi things are for allocating unique id ("spi")
++ * per xfrm_address_t.
++ */
++struct xfrm6_tunnel_spi {
++ struct hlist_node list_byaddr;
++ struct hlist_node list_byspi;
++ xfrm_address_t addr;
++ u32 spi;
++ atomic_t refcnt;
++#ifdef XFRM6_TUNNEL_SPI_MAGIC
++ u32 magic;
++#endif
++};
++
++#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG
++# define XFRM6_TUNNEL_SPI_MAGIC 0xdeadbeef
++#endif
++
++static rwlock_t xfrm6_tunnel_spi_lock = RW_LOCK_UNLOCKED;
++
++static u32 xfrm6_tunnel_spi;
++
++#define XFRM6_TUNNEL_SPI_MIN 1
++#define XFRM6_TUNNEL_SPI_MAX 0xffffffff
++
++static kmem_cache_t *xfrm6_tunnel_spi_kmem;
++
++#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
++#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
++
++static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE];
++static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE];
++
++#ifdef XFRM6_TUNNEL_SPI_MAGIC
++static int x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi,
++ const char *name)
++{
++ if (unlikely(x6spi->magic != XFRM6_TUNNEL_SPI_MAGIC)) {
++ X6TPRINTK3(KERN_DEBUG "%s(): x6spi object "
++ "at %p has corrupted magic %08x "
++ "(should be %08x)\n",
++ name, x6spi, x6spi->magic, XFRM6_TUNNEL_SPI_MAGIC);
++ return -1;
++ }
++ return 0;
++}
++#else
++static int inline x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi,
++ const char *name)
++{
++ return 0;
++}
++#endif
++
++#define X6SPI_CHECK_MAGIC(x6spi) x6spi_check_magic((x6spi), __FUNCTION__)
++
++
++static unsigned inline xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr)
++{
++ unsigned h;
++
++ X6TPRINTK3(KERN_DEBUG "%s(addr=%p)\n", __FUNCTION__, addr);
++
++ h = addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3];
++ h ^= h >> 16;
++ h ^= h >> 8;
++ h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1;
++
++ X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, h);
++
++ return h;
++}
++
++/* Bucket index in xfrm6_tunnel_spi_byspi for a given SPI value. */
++static unsigned inline xfrm6_tunnel_spi_hash_byspi(u32 spi)
++{
++ return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE;
++}
++
++
++/*
++ * One-time setup of the SPI allocator: reset the cursor, create the slab
++ * cache and initialize both hash tables.  Returns 0 or -ENOMEM.
++ */
++static int xfrm6_tunnel_spi_init(void)
++{
++ int i;
++
++ X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__);
++
++ xfrm6_tunnel_spi = 0;
++ xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi",
++ sizeof(struct xfrm6_tunnel_spi),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++ if (!xfrm6_tunnel_spi_kmem) {
++ X6TPRINTK1(KERN_ERR
++ "%s(): failed to allocate xfrm6_tunnel_spi_kmem\n",
++ __FUNCTION__);
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
++ INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]);
++ for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++)
++ INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byspi[i]);
++ return 0;
++}
++
++/*
++ * Tear down the SPI allocator.  The slab cache is only destroyed when
++ * both hash tables are empty; otherwise we log and leave it allocated
++ * (destroying a non-empty cache would leak/corrupt live entries).
++ */
++static void xfrm6_tunnel_spi_fini(void)
++{
++ int i;
++
++ X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__);
++
++ for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) {
++ if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i]))
++ goto err;
++ }
++ for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) {
++ if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i]))
++ goto err;
++ }
++ kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
++ xfrm6_tunnel_spi_kmem = NULL;
++ return;
++err:
++ X6TPRINTK1(KERN_ERR "%s(): table is not empty\n", __FUNCTION__);
++ return;
++}
++
++/*
++ * Find the entry for @saddr in the by-address hash, or NULL.
++ * Caller must hold xfrm6_tunnel_spi_lock (read or write).
++ */
++static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr)
++{
++ struct xfrm6_tunnel_spi *x6spi;
++ struct hlist_node *pos;
++
++ X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr);
++
++ hlist_for_each_entry(x6spi, pos,
++ &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
++ list_byaddr) {
++ if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) {
++ X6SPI_CHECK_MAGIC(x6spi);
++ X6TPRINTK3(KERN_DEBUG "%s() = %p(%u)\n", __FUNCTION__, x6spi, x6spi->spi);
++ return x6spi;
++ }
++ }
++
++ X6TPRINTK3(KERN_DEBUG "%s() = NULL(0)\n", __FUNCTION__);
++ return NULL;
++}
++
++/*
++ * Public lookup: return the SPI allocated for @saddr, or 0 if none.
++ * Takes the table lock itself; does not bump the refcount.
++ */
++u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr)
++{
++ struct xfrm6_tunnel_spi *x6spi;
++ u32 spi;
++
++ X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr);
++
++ read_lock_bh(&xfrm6_tunnel_spi_lock);
++ x6spi = __xfrm6_tunnel_spi_lookup(saddr);
++ spi = x6spi ? x6spi->spi : 0;
++ read_unlock_bh(&xfrm6_tunnel_spi_lock);
++ return spi;
++}
++
++EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup);
++
++/*
++ * Allocate a fresh SPI for @saddr and insert the new entry (refcnt 1)
++ * into both hash tables.  Scans forward from the last allocated value,
++ * wrapping around once, so SPIs are handed out round-robin; returns 0
++ * when the whole space is exhausted or the slab allocation fails.
++ * Caller must hold xfrm6_tunnel_spi_lock for writing.
++ */
++static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr)
++{
++ u32 spi;
++ struct xfrm6_tunnel_spi *x6spi;
++ struct hlist_node *pos;
++ unsigned index;
++
++ X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr);
++
++ /* Advance the cursor, wrapping back to MIN when out of range. */
++ if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN ||
++ xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX)
++ xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN;
++ else
++ xfrm6_tunnel_spi++;
++
++ /* First pass: cursor..MAX. */
++ for (spi = xfrm6_tunnel_spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) {
++ index = xfrm6_tunnel_spi_hash_byspi(spi);
++ hlist_for_each_entry(x6spi, pos,
++ &xfrm6_tunnel_spi_byspi[index],
++ list_byspi) {
++ if (x6spi->spi == spi)
++ goto try_next_1;
++ }
++ xfrm6_tunnel_spi = spi;
++ goto alloc_spi;
++try_next_1:;
++ }
++ /* Second pass: MIN..cursor (wrap-around). */
++ for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tunnel_spi; spi++) {
++ index = xfrm6_tunnel_spi_hash_byspi(spi);
++ hlist_for_each_entry(x6spi, pos,
++ &xfrm6_tunnel_spi_byspi[index],
++ list_byspi) {
++ if (x6spi->spi == spi)
++ goto try_next_2;
++ }
++ xfrm6_tunnel_spi = spi;
++ goto alloc_spi;
++try_next_2:;
++ }
++ spi = 0;
++ goto out;
++alloc_spi:
++ /* "index" still holds the by-spi bucket for the chosen SPI here. */
++ X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for "
++ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
++ __FUNCTION__,
++ NIP6(*(struct in6_addr *)saddr));
++ x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC);
++ if (!x6spi) {
++ X6TPRINTK1(KERN_ERR "%s(): kmem_cache_alloc() failed\n",
++ __FUNCTION__);
++ goto out;
++ }
++#ifdef XFRM6_TUNNEL_SPI_MAGIC
++ x6spi->magic = XFRM6_TUNNEL_SPI_MAGIC;
++#endif
++ memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr));
++ x6spi->spi = spi;
++ atomic_set(&x6spi->refcnt, 1);
++
++ hlist_add_head(&x6spi->list_byspi, &xfrm6_tunnel_spi_byspi[index]);
++
++ index = xfrm6_tunnel_spi_hash_byaddr(saddr);
++ hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]);
++ X6SPI_CHECK_MAGIC(x6spi);
++out:
++ X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi);
++ return spi;
++}
++
++/*
++ * Public allocator: return the SPI for @saddr, taking a reference on an
++ * existing entry or allocating a new one.  Returns 0 on failure.
++ * Each successful call must be balanced by xfrm6_tunnel_free_spi().
++ */
++u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr)
++{
++ struct xfrm6_tunnel_spi *x6spi;
++ u32 spi;
++
++ X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr);
++
++ write_lock_bh(&xfrm6_tunnel_spi_lock);
++ x6spi = __xfrm6_tunnel_spi_lookup(saddr);
++ if (x6spi) {
++ atomic_inc(&x6spi->refcnt);
++ spi = x6spi->spi;
++ } else
++ spi = __xfrm6_tunnel_alloc_spi(saddr);
++ write_unlock_bh(&xfrm6_tunnel_spi_lock);
++
++ X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi);
++
++ return spi;
++}
++
++EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi);
++
++/*
++ * Drop one reference on the SPI entry for @saddr; when the count hits
++ * zero the entry is unhashed from both tables and freed.  A no-op if no
++ * entry exists for the address.
++ */
++void xfrm6_tunnel_free_spi(xfrm_address_t *saddr)
++{
++ struct xfrm6_tunnel_spi *x6spi;
++ struct hlist_node *pos, *n;
++
++ X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr);
++
++ write_lock_bh(&xfrm6_tunnel_spi_lock);
++
++ hlist_for_each_entry_safe(x6spi, pos, n,
++ &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
++ list_byaddr)
++ {
++ if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) {
++ X6TPRINTK3(KERN_DEBUG "%s(): x6spi object "
++ "for %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
++ "found at %p\n",
++ __FUNCTION__,
++ NIP6(*(struct in6_addr *)saddr),
++ x6spi);
++ X6SPI_CHECK_MAGIC(x6spi);
++ if (atomic_dec_and_test(&x6spi->refcnt)) {
++ hlist_del(&x6spi->list_byaddr);
++ hlist_del(&x6spi->list_byspi);
++ kmem_cache_free(xfrm6_tunnel_spi_kmem, x6spi);
++ break;
++ }
++ }
++ }
++ write_unlock_bh(&xfrm6_tunnel_spi_lock);
++}
++
++EXPORT_SYMBOL(xfrm6_tunnel_free_spi);
++
++/*
++ * xfrm_type output hook: fix up the outer IPv6 header's payload length
++ * after encapsulation.  Always succeeds.
++ */
++static int xfrm6_tunnel_output(struct sk_buff *skb)
++{
++ struct ipv6hdr *top_iph;
++
++ top_iph = (struct ipv6hdr *)skb->data;
++ top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
++
++ return 0;
++}
++
++/* xfrm_type input hook: nothing to do for IP6IP6, decap happens later. */
++static int xfrm6_tunnel_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
++{
++ return 0;
++}
++
++/* Single optional device-style ip6ip6 handler, guarded by a semaphore. */
++static struct xfrm6_tunnel *xfrm6_tunnel_handler;
++static DECLARE_MUTEX(xfrm6_tunnel_sem);
++
++/*
++ * Install @handler as the ip6ip6 tunnel handler.  Only one handler may
++ * be registered at a time; returns -EINVAL if a handler is already set.
++ */
++int xfrm6_tunnel_register(struct xfrm6_tunnel *handler)
++{
++ int ret;
++
++ down(&xfrm6_tunnel_sem);
++ ret = 0;
++ if (xfrm6_tunnel_handler != NULL)
++ ret = -EINVAL;
++ if (!ret)
++ xfrm6_tunnel_handler = handler;
++ up(&xfrm6_tunnel_sem);
++
++ return ret;
++}
++
++EXPORT_SYMBOL(xfrm6_tunnel_register);
++
++/*
++ * Remove @handler if it is the currently registered handler (-EINVAL
++ * otherwise), then wait for in-flight packets via synchronize_net() so
++ * the caller may safely free the handler afterwards.
++ */
++int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler)
++{
++ int ret;
++
++ down(&xfrm6_tunnel_sem);
++ ret = 0;
++ if (xfrm6_tunnel_handler != handler)
++ ret = -EINVAL;
++ if (!ret)
++ xfrm6_tunnel_handler = NULL;
++ up(&xfrm6_tunnel_sem);
++
++ synchronize_net();
++
++ return ret;
++}
++
++EXPORT_SYMBOL(xfrm6_tunnel_deregister);
++
++/*
++ * IPPROTO_IPV6 receive hook: give the registered device-like handler
++ * first shot at the packet; if it declines (non-zero), look up the SPI
++ * for the outer source address and hand off to the xfrm input path.
++ */
++static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
++{
++ struct sk_buff *skb = *pskb;
++ struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
++ struct ipv6hdr *iph = skb->nh.ipv6h;
++ u32 spi;
++
++ /* device-like_ip6ip6_handler() */
++ if (handler && handler->handler(pskb, nhoffp) == 0)
++ return 0;
++
++ spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr);
++ return xfrm6_rcv_spi(pskb, nhoffp, spi);
++}
++
++/*
++ * ICMPv6 error hook for IPPROTO_IPV6.  If a device-like handler is
++ * registered it gets the error; otherwise the native handling below
++ * merely logs the interesting cases — no state is updated.
++ */
++static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
++ int type, int code, int offset, __u32 info)
++{
++ struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
++
++ /* call here first for device-like ip6ip6 err handling */
++ if (handler) {
++ handler->err_handler(skb, opt, type, code, offset, info);
++ return;
++ }
++
++ /* xfrm6_tunnel native err handling */
++ switch (type) {
++ case ICMPV6_DEST_UNREACH:
++ switch (code) {
++ case ICMPV6_NOROUTE:
++ case ICMPV6_ADM_PROHIBITED:
++ case ICMPV6_NOT_NEIGHBOUR:
++ case ICMPV6_ADDR_UNREACH:
++ case ICMPV6_PORT_UNREACH:
++ default:
++ X6TPRINTK3(KERN_DEBUG
++ "xfrm6_tunnel: Destination Unreach.\n");
++ break;
++ }
++ break;
++ case ICMPV6_PKT_TOOBIG:
++ X6TPRINTK3(KERN_DEBUG
++ "xfrm6_tunnel: Packet Too Big.\n");
++ break;
++ case ICMPV6_TIME_EXCEED:
++ switch (code) {
++ case ICMPV6_EXC_HOPLIMIT:
++ X6TPRINTK3(KERN_DEBUG
++ "xfrm6_tunnel: Too small Hoplimit.\n");
++ break;
++ case ICMPV6_EXC_FRAGTIME:
++ default:
++ break;
++ }
++ break;
++ case ICMPV6_PARAMPROB:
++ switch (code) {
++ case ICMPV6_HDR_FIELD: break;
++ case ICMPV6_UNK_NEXTHDR: break;
++ case ICMPV6_UNK_OPTION: break;
++ }
++ break;
++ default:
++ break;
++ }
++ return;
++}
++
++/*
++ * xfrm_state init for the IP6IP6 type: tunnel mode is mandatory and
++ * NAT-T encapsulation is not supported; header overhead is one full
++ * IPv6 header.
++ */
++static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args)
++{
++ if (!x->props.mode)
++ return -EINVAL;
++
++ if (x->encap)
++ return -EINVAL;
++
++ x->props.header_len = sizeof(struct ipv6hdr);
++
++ return 0;
++}
++
++/* State destructor: release the SPI reference held for our source address. */
++static void xfrm6_tunnel_destroy(struct xfrm_state *x)
++{
++ xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
++}
++
++/* xfrm type descriptor for IPv6-in-IPv6 (IPPROTO_IPV6) tunnels. */
++static struct xfrm_type xfrm6_tunnel_type = {
++ .description = "IP6IP6",
++ .owner = THIS_MODULE,
++ .proto = IPPROTO_IPV6,
++ .init_state = xfrm6_tunnel_init_state,
++ .destructor = xfrm6_tunnel_destroy,
++ .input = xfrm6_tunnel_input,
++ .output = xfrm6_tunnel_output,
++};
++
++/* inet6 protocol hooks; FINAL/NOPOLICY: we terminate the packet here. */
++static struct inet6_protocol xfrm6_tunnel_protocol = {
++ .handler = xfrm6_tunnel_rcv,
++ .err_handler = xfrm6_tunnel_err,
++ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
++};
++
++/*
++ * Module init: register the xfrm type, hook IPPROTO_IPV6, then bring up
++ * the SPI allocator.  Each failure unwinds the earlier registrations.
++ */
++static int __init xfrm6_tunnel_init(void)
++{
++ X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__);
++
++ if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) {
++ X6TPRINTK1(KERN_ERR
++ "xfrm6_tunnel init: can't add xfrm type\n");
++ return -EAGAIN;
++ }
++ if (inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) {
++ X6TPRINTK1(KERN_ERR
++ "xfrm6_tunnel init(): can't add protocol\n");
++ xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
++ return -EAGAIN;
++ }
++ if (xfrm6_tunnel_spi_init() < 0) {
++ X6TPRINTK1(KERN_ERR
++ "xfrm6_tunnel init: failed to initialize spi\n");
++ inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
++ xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
++ return -EAGAIN;
++ }
++ return 0;
++}
++
++/* Module exit: unwind init in reverse order, logging (not failing) errors. */
++static void __exit xfrm6_tunnel_fini(void)
++{
++ X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__);
++
++ xfrm6_tunnel_spi_fini();
++ if (inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0)
++ X6TPRINTK1(KERN_ERR
++ "xfrm6_tunnel close: can't remove protocol\n");
++ if (xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6) < 0)
++ X6TPRINTK1(KERN_ERR
++ "xfrm6_tunnel close: can't remove xfrm type\n");
++}
++
++module_init(xfrm6_tunnel_init);
++module_exit(xfrm6_tunnel_fini);
++MODULE_LICENSE("GPL");
+diff -Nru a/net/key/Makefile b/net/key/Makefile
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/key/Makefile 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,9 @@
++#
++# Makefile for the key AF.
++#
++
++O_TARGET := key.o
++
++# Built only when CONFIG_NET_KEY is enabled (y or m).
++obj-$(CONFIG_NET_KEY) += af_key.o
++
++include $(TOPDIR)/Rules.make
+diff -Nru a/net/key/af_key.c b/net/key/af_key.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/key/af_key.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,2881 @@
++/*
++ * net/key/af_key.c An implementation of PF_KEYv2 sockets.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ * Authors: Maxim Giryaev <gem at asplinux.ru>
++ * David S. Miller <davem at redhat.com>
++ * Alexey Kuznetsov <kuznet at ms2.inr.ac.ru>
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * Kazunori MIYAZAWA / USAGI Project <miyazawa at linux-ipv6.org>
++ * Derek Atkins <derek at ihtfp.com>
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/socket.h>
++#include <linux/pfkeyv2.h>
++#include <linux/ipsec.h>
++#include <linux/skbuff.h>
++#include <linux/rtnetlink.h>
++#include <linux/in.h>
++#include <linux/in6.h>
++#include <linux/proc_fs.h>
++#include <linux/init.h>
++#include <net/xfrm.h>
++
++#include <net/sock.h>
++
++/* Translate between xfrm's XFRM_INF and PF_KEY's 0 for "no limit". */
++#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
++#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
++
++
++/* List of all pfkey sockets. */
++static struct sock * pfkey_table;
++static DECLARE_WAIT_QUEUE_HEAD(pfkey_table_wait);
++static rwlock_t pfkey_table_lock = RW_LOCK_UNLOCKED;
++/* Readers currently traversing pfkey_table; writers wait for it to drain. */
++static atomic_t pfkey_table_users = ATOMIC_INIT(0);
++
++/* Number of open PF_KEY sockets. */
++static atomic_t pfkey_socks_nr = ATOMIC_INIT(0);
++
++/*
++ * sk->destruct callback: drain the receive queue, free the pfkey private
++ * data and drop the socket/module accounting.  Refuses to touch a socket
++ * that has not been orphaned yet.
++ */
++static void pfkey_sock_destruct(struct sock *sk)
++{
++ skb_queue_purge(&sk->receive_queue);
++
++ if (!sk->dead) {
++ printk("Attempt to release alive pfkey socket: %p\n", sk);
++ return;
++ }
++
++ BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
++ BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
++
++ kfree(pfkey_sk(sk));
++
++ atomic_dec(&pfkey_socks_nr);
++
++ MOD_DEC_USE_COUNT;
++}
++
++/*
++ * Acquire exclusive access to pfkey_table: take the write lock, then
++ * sleep (dropping/retaking the lock) until all lock-free readers that
++ * registered via pfkey_lock_table() have finished.
++ */
++static void pfkey_table_grab(void)
++{
++ write_lock_bh(&pfkey_table_lock);
++
++ if (atomic_read(&pfkey_table_users)) {
++ DECLARE_WAITQUEUE(wait, current);
++
++ add_wait_queue_exclusive(&pfkey_table_wait, &wait);
++ for(;;) {
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ if (atomic_read(&pfkey_table_users) == 0)
++ break;
++ write_unlock_bh(&pfkey_table_lock);
++ schedule();
++ write_lock_bh(&pfkey_table_lock);
++ }
++
++ __set_current_state(TASK_RUNNING);
++ remove_wait_queue(&pfkey_table_wait, &wait);
++ }
++}
++
++/* Release exclusive table access and wake any readers queued behind us. */
++static __inline__ void pfkey_table_ungrab(void)
++{
++ write_unlock_bh(&pfkey_table_lock);
++ wake_up(&pfkey_table_wait);
++}
++
++/* Register as a table reader; readers only hold the rwlock momentarily. */
++static __inline__ void pfkey_lock_table(void)
++{
++ /* read_lock() synchronizes us to pfkey_table_grab */
++
++ read_lock(&pfkey_table_lock);
++ atomic_inc(&pfkey_table_users);
++ read_unlock(&pfkey_table_lock);
++}
++
++/* Drop reader registration; the last reader out wakes a waiting writer. */
++static __inline__ void pfkey_unlock_table(void)
++{
++ if (atomic_dec_and_test(&pfkey_table_users))
++ wake_up(&pfkey_table_wait);
++}
++
++
++static struct proto_ops pfkey_ops;
++
++/* Add @sk to the head of the global pfkey socket list (takes a ref). */
++static void pfkey_insert(struct sock *sk)
++{
++ pfkey_table_grab();
++ sk->next = pfkey_table;
++ pfkey_table = sk;
++ sock_hold(sk);
++ pfkey_table_ungrab();
++}
++
++/* Unlink @sk from the global pfkey socket list and drop the list's ref. */
++static void pfkey_remove(struct sock *sk)
++{
++ struct sock **skp;
++
++ pfkey_table_grab();
++ for (skp = &pfkey_table; *skp; skp = &((*skp)->next)) {
++ if (*skp == sk) {
++ *skp = sk->next;
++ __sock_put(sk);
++ break;
++ }
++ }
++ pfkey_table_ungrab();
++}
++
++/*
++ * PF_KEY socket creation.  Only CAP_NET_ADMIN may open one, and only as
++ * SOCK_RAW / PF_KEY_V2.  Allocates the sock plus its pfkey_opt private
++ * data and links it into the global table.
++ */
++static int pfkey_create(struct socket *sock, int protocol)
++{
++ struct sock *sk;
++ struct pfkey_opt *pfk;
++ int err;
++
++ if (!capable(CAP_NET_ADMIN))
++ return -EPERM;
++ if (sock->type != SOCK_RAW)
++ return -ESOCKTNOSUPPORT;
++ if (protocol != PF_KEY_V2)
++ return -EPROTONOSUPPORT;
++
++ MOD_INC_USE_COUNT;
++
++ err = -ENOMEM;
++ sk = sk_alloc(PF_KEY, GFP_KERNEL, 1);
++ if (sk == NULL)
++ goto out;
++
++ sock->ops = &pfkey_ops;
++ sock_init_data(sock, sk);
++
++ err = -ENOMEM;
++ pfk = pfkey_sk(sk) = kmalloc(sizeof(*pfk), GFP_KERNEL);
++ if (!pfk) {
++ sk_free(sk);
++ goto out;
++ }
++ memset(pfk, 0, sizeof(*pfk));
++
++ sk->family = PF_KEY;
++ sk->destruct = pfkey_sock_destruct;
++
++ atomic_inc(&pfkey_socks_nr);
++
++ pfkey_insert(sk);
++
++ return 0;
++
++out:
++ MOD_DEC_USE_COUNT;
++ return err;
++}
++
++/*
++ * Socket release: unlink from the table, orphan the sock, purge pending
++ * writes and drop the final reference (destruction happens in
++ * pfkey_sock_destruct via sock_put).
++ */
++static int pfkey_release(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++
++ if (!sk)
++ return 0;
++
++ pfkey_remove(sk);
++
++ sock_orphan(sk);
++ sock->sk = NULL;
++ skb_queue_purge(&sk->write_queue);
++ sock_put(sk);
++
++ return 0;
++}
++
++/*
++ * Queue one copy of @skb on @sk's receive queue.  *skb2 caches a
++ * clone/reference that is consumed on successful delivery and reused
++ * across calls by pfkey_broadcast().  Returns 0 on delivery, -ENOBUFS
++ * when the clone failed or the socket's rcvbuf is full.
++ */
++static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
++ int allocation, struct sock *sk)
++{
++ int err = -ENOBUFS;
++
++ sock_hold(sk);
++ if (*skb2 == NULL) {
++ if (atomic_read(&skb->users) != 1) {
++ *skb2 = skb_clone(skb, allocation);
++ } else {
++ *skb2 = skb;
++ atomic_inc(&skb->users);
++ }
++ }
++ if (*skb2 != NULL) {
++ if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
++ skb_orphan(*skb2);
++ skb_set_owner_r(*skb2, sk);
++ skb_queue_tail(&sk->receive_queue, *skb2);
++ sk->data_ready(sk, (*skb2)->len);
++ *skb2 = NULL;
++ err = 0;
++ }
++ }
++ sock_put(sk);
++ return err;
++}
++
++/* Send SKB to all pfkey sockets matching selected criteria. */
++#define BROADCAST_ALL 0
++#define BROADCAST_ONE 1
++#define BROADCAST_REGISTERED 2
++#define BROADCAST_PROMISC_ONLY 4
++/*
++ * Deliver @skb according to @broadcast_flags; @one_sk (may be NULL) is
++ * always tried last so its result dominates the return value.
++ * Consumes the caller's reference on @skb in all cases.
++ */
++static int pfkey_broadcast(struct sk_buff *skb, int allocation,
++ int broadcast_flags, struct sock *one_sk)
++{
++ struct sock *sk;
++ struct sk_buff *skb2 = NULL;
++ int err = -ESRCH;
++
++ /* XXX Do we need something like netlink_overrun? I think
++ * XXX PF_KEY socket apps will not mind current behavior.
++ */
++ if (!skb)
++ return -ENOMEM;
++
++ pfkey_lock_table();
++ for (sk = pfkey_table; sk; sk = sk->next) {
++ struct pfkey_opt *pfk = pfkey_sk(sk);
++ int err2;
++
++ /* Yes, it means that if you are meant to receive this
++ * pfkey message you receive it twice as promiscuous
++ * socket.
++ */
++ if (pfk->promisc)
++ pfkey_broadcast_one(skb, &skb2, allocation, sk);
++
++ /* the exact target will be processed later */
++ if (sk == one_sk)
++ continue;
++ if (broadcast_flags != BROADCAST_ALL) {
++ if (broadcast_flags & BROADCAST_PROMISC_ONLY)
++ continue;
++ if ((broadcast_flags & BROADCAST_REGISTERED) &&
++ !pfk->registered)
++ continue;
++ if (broadcast_flags & BROADCAST_ONE)
++ continue;
++ }
++
++ err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk);
++
++ /* Error is cleared after successful sending to at least one
++ * registered KM */
++ if ((broadcast_flags & BROADCAST_REGISTERED) && err)
++ err = err2;
++ }
++ pfkey_unlock_table();
++
++ if (one_sk != NULL)
++ err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
++
++ if (skb2)
++ kfree_skb(skb2);
++ kfree_skb(skb);
++ return err;
++}
++
++/* Copy a sadb_msg header verbatim (struct assignment). */
++static inline void pfkey_hdr_dup(struct sadb_msg *new, struct sadb_msg *orig)
++{
++ *new = *orig;
++}
++
++/*
++ * Send a PF_KEY error reply to @sk: echo @orig's header with
++ * sadb_msg_errno set.  @err is a negative errno; restart codes are
++ * mapped to EINTR and anything out of the 1..255 PF_KEY range to
++ * EINVAL (BUG() if the result is still unrepresentable).
++ */
++static int pfkey_error(struct sadb_msg *orig, int err, struct sock *sk)
++{
++ struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
++ struct sadb_msg *hdr;
++
++ if (!skb)
++ return -ENOBUFS;
++
++ /* Woe be to the platform trying to support PFKEY yet
++ * having normal errnos outside the 1-255 range, inclusive.
++ */
++ err = -err;
++ if (err == ERESTARTSYS ||
++ err == ERESTARTNOHAND ||
++ err == ERESTARTNOINTR)
++ err = EINTR;
++ if (err >= 512)
++ err = EINVAL;
++ if (err <= 0 || err >= 256)
++ BUG();
++
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
++ pfkey_hdr_dup(hdr, orig);
++ hdr->sadb_msg_errno = (uint8_t) err;
++ hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
++ sizeof(uint64_t));
++
++ pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk);
++
++ return 0;
++}
++
++/* Minimum byte length of each SADB extension, indexed by extension type;
++ * used by parse_exthdrs() to reject truncated extensions. */
++static u8 sadb_ext_min_len[] = {
++ [SADB_EXT_RESERVED] = (u8) 0,
++ [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa),
++ [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime),
++ [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime),
++ [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime),
++ [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address),
++ [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address),
++ [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address),
++ [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key),
++ [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key),
++ [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident),
++ [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident),
++ [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens),
++ [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop),
++ [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported),
++ [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported),
++ [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange),
++ [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate),
++ [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy),
++ [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2),
++ [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type),
++ [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
++ [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
++ [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
++};
++
++/* Verify sadb_address_{len,prefixlen} against sa_family. */
++/* Returns 0 when the 64-bit-unit length and prefix length are consistent
++ * with the embedded sockaddr's family; -EINVAL otherwise (including any
++ * family other than INET/INET6 — see the XXX below). */
++static int verify_address_len(void *p)
++{
++ struct sadb_address *sp = p;
++ struct sockaddr *addr = (struct sockaddr *)(sp + 1);
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++ int len;
++
++ switch (addr->sa_family) {
++ case AF_INET:
++ len = sizeof(*sp) + sizeof(*sin) + (sizeof(uint64_t) - 1);
++ len /= sizeof(uint64_t);
++ if (sp->sadb_address_len != len ||
++ sp->sadb_address_prefixlen > 32)
++ return -EINVAL;
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ len = sizeof(*sp) + sizeof(*sin6) + (sizeof(uint64_t) - 1);
++ len /= sizeof(uint64_t);
++ if (sp->sadb_address_len != len ||
++ sp->sadb_address_prefixlen > 128)
++ return -EINVAL;
++ break;
++#endif
++ default:
++ /* It is user using kernel to keep track of security
++ * associations for another protocol, such as
++ * OSPF/RSVP/RIPV2/MIP. It is user's job to verify
++ * lengths.
++ *
++ * XXX Actually, association/policy database is not yet
++ * XXX able to cope with arbitrary sockaddr families.
++ * XXX When it can, remove this -EINVAL. -DaveM
++ */
++ return -EINVAL;
++ break;
++ };
++
++ return 0;
++}
++
++/*
++ * 1 if both address extensions are present, share the same family, and
++ * that family is one we support (INET, or INET6 when built); else 0.
++ */
++static int present_and_same_family(struct sadb_address *src,
++ struct sadb_address *dst)
++{
++ struct sockaddr *s_addr, *d_addr;
++
++ if (!src || !dst)
++ return 0;
++
++ s_addr = (struct sockaddr *)(src + 1);
++ d_addr = (struct sockaddr *)(dst + 1);
++ if (s_addr->sa_family != d_addr->sa_family)
++ return 0;
++ if (s_addr->sa_family != AF_INET
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ && s_addr->sa_family != AF_INET6
++#endif
++ )
++ return 0;
++
++ return 1;
++}
++
++/*
++ * Walk the extension headers following @hdr and fill ext_hdrs[type-1]
++ * with a pointer to each one.  Rejects truncated, duplicate or
++ * reserved-type extensions (-EINVAL); unknown types above SADB_EXT_MAX
++ * are skipped over without being recorded.  Address-bearing extensions
++ * additionally go through verify_address_len().
++ */
++static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ char *p = (char *) hdr;
++ int len = skb->len;
++
++ len -= sizeof(*hdr);
++ p += sizeof(*hdr);
++ while (len > 0) {
++ struct sadb_ext *ehdr = (struct sadb_ext *) p;
++ uint16_t ext_type;
++ int ext_len;
++
++ ext_len = ehdr->sadb_ext_len;
++ ext_len *= sizeof(uint64_t);
++ ext_type = ehdr->sadb_ext_type;
++ if (ext_len < sizeof(uint64_t) ||
++ ext_len > len ||
++ ext_type == SADB_EXT_RESERVED)
++ return -EINVAL;
++
++ if (ext_type <= SADB_EXT_MAX) {
++ int min = (int) sadb_ext_min_len[ext_type];
++ if (ext_len < min)
++ return -EINVAL;
++ if (ext_hdrs[ext_type-1] != NULL)
++ return -EINVAL;
++ if (ext_type == SADB_EXT_ADDRESS_SRC ||
++ ext_type == SADB_EXT_ADDRESS_DST ||
++ ext_type == SADB_EXT_ADDRESS_PROXY ||
++ ext_type == SADB_X_EXT_NAT_T_OA) {
++ if (verify_address_len(p))
++ return -EINVAL;
++ }
++ ext_hdrs[ext_type-1] = p;
++ }
++ p += ext_len;
++ len -= ext_len;
++ }
++
++ return 0;
++}
++
++/* Map a PF_KEY SA type to its IP protocol number; 0 means unsupported. */
++static uint16_t
++pfkey_satype2proto(uint8_t satype)
++{
++ switch (satype) {
++ case SADB_SATYPE_UNSPEC:
++ return IPSEC_PROTO_ANY;
++ case SADB_SATYPE_AH:
++ return IPPROTO_AH;
++ case SADB_SATYPE_ESP:
++ return IPPROTO_ESP;
++ case SADB_X_SATYPE_IPCOMP:
++ return IPPROTO_COMP;
++ break;
++ default:
++ return 0;
++ }
++ /* NOTREACHED */
++}
++
++/* Inverse of pfkey_satype2proto(); 0 means unsupported protocol. */
++static uint8_t
++pfkey_proto2satype(uint16_t proto)
++{
++ switch (proto) {
++ case IPPROTO_AH:
++ return SADB_SATYPE_AH;
++ case IPPROTO_ESP:
++ return SADB_SATYPE_ESP;
++ case IPPROTO_COMP:
++ return SADB_X_SATYPE_IPCOMP;
++ break;
++ default:
++ return 0;
++ }
++ /* NOTREACHED */
++}
++
++/* BTW, this scheme means that there is no way with PFKEY2 sockets to
++ * say specifically 'just raw sockets' as we encode them as 255.
++ */
++
++/* PF_KEY wildcard (255) <-> xfrm wildcard (0) protocol translation. */
++static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
++{
++ return (proto == IPSEC_PROTO_ANY ? 0 : proto);
++}
++
++static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
++{
++ return (proto ? proto : IPSEC_PROTO_ANY);
++}
++
++/*
++ * Copy the sockaddr trailing a sadb_address into an xfrm_address_t.
++ * Returns the address family, or 0 for an unsupported family.
++ */
++static int pfkey_sadb_addr2xfrm_addr(struct sadb_address *addr,
++ xfrm_address_t *xaddr)
++{
++ switch (((struct sockaddr*)(addr + 1))->sa_family) {
++ case AF_INET:
++ xaddr->a4 =
++ ((struct sockaddr_in *)(addr + 1))->sin_addr.s_addr;
++ return AF_INET;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ memcpy(xaddr->a6,
++ &((struct sockaddr_in6 *)(addr + 1))->sin6_addr,
++ sizeof(struct in6_addr));
++ return AF_INET6;
++#endif
++ default:
++ return 0;
++ }
++ /* NOTREACHED */
++}
++
++/*
++ * Resolve the xfrm_state referenced by a PF_KEY message: requires the
++ * SA extension (for the SPI), a supported SA type (for the protocol)
++ * and the destination address extension.  Returns NULL when any piece
++ * is missing or unsupported; otherwise a referenced state from
++ * xfrm_state_lookup().
++ */
++static struct xfrm_state *pfkey_xfrm_state_lookup(struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct sadb_sa *sa;
++ struct sadb_address *addr;
++ uint16_t proto;
++ unsigned short family;
++ xfrm_address_t *xaddr;
++
++ sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
++ if (sa == NULL)
++ return NULL;
++
++ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
++ if (proto == 0)
++ return NULL;
++
++ /* sadb_address_len should be checked by caller */
++ addr = (struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1];
++ if (addr == NULL)
++ return NULL;
++
++ family = ((struct sockaddr *)(addr + 1))->sa_family;
++ switch (family) {
++ case AF_INET:
++ xaddr = (xfrm_address_t *)&((struct sockaddr_in *)(addr + 1))->sin_addr;
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ xaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(addr + 1))->sin6_addr;
++ break;
++#endif
++ default:
++ xaddr = NULL;
++ }
++
++ if (!xaddr)
++ return NULL;
++
++ return xfrm_state_lookup(xaddr, sa->sadb_sa_spi, proto, family);
++}
++
++/* Round @a up to the next multiple of 8 (PF_KEY 64-bit alignment). */
++#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1)))
++/* 8-byte-aligned wire size of a sockaddr for @family; 0 if unsupported. */
++static int
++pfkey_sockaddr_size(sa_family_t family)
++{
++ switch (family) {
++ case AF_INET:
++ return PFKEY_ALIGN8(sizeof(struct sockaddr_in));
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ return PFKEY_ALIGN8(sizeof(struct sockaddr_in6));
++#endif
++ default:
++ return 0;
++ }
++ /* NOTREACHED */
++}
++
++static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, int hsc)
++{
++ struct sk_buff *skb;
++ struct sadb_msg *hdr;
++ struct sadb_sa *sa;
++ struct sadb_lifetime *lifetime;
++ struct sadb_address *addr;
++ struct sadb_key *key;
++ struct sadb_x_sa2 *sa2;
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++ int size;
++ int auth_key_size = 0;
++ int encrypt_key_size = 0;
++ int sockaddr_size;
++ struct xfrm_encap_tmpl *natt = NULL;
++
++ /* address family check */
++ sockaddr_size = pfkey_sockaddr_size(x->props.family);
++ if (!sockaddr_size)
++ ERR_PTR(-EINVAL);
++
++ /* base, SA, (lifetime (HSC),) address(SD), (address(P),)
++ key(AE), (identity(SD),) (sensitivity)> */
++ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) +
++ sizeof(struct sadb_lifetime) +
++ ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) +
++ ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) +
++ sizeof(struct sadb_address)*2 +
++ sockaddr_size*2 +
++ sizeof(struct sadb_x_sa2);
++ /* identity & sensitivity */
++
++ if ((x->props.family == AF_INET &&
++ x->sel.saddr.a4 != x->props.saddr.a4)
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ || (x->props.family == AF_INET6 &&
++ memcmp (x->sel.saddr.a6, x->props.saddr.a6, sizeof (struct in6_addr)))
++#endif
++ )
++ size += sizeof(struct sadb_address) + sockaddr_size;
++
++ if (add_keys) {
++ if (x->aalg && x->aalg->alg_key_len) {
++ auth_key_size =
++ PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8);
++ size += sizeof(struct sadb_key) + auth_key_size;
++ }
++ if (x->ealg && x->ealg->alg_key_len) {
++ encrypt_key_size =
++ PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8);
++ size += sizeof(struct sadb_key) + encrypt_key_size;
++ }
++ }
++ if (x->encap)
++ natt = x->encap;
++
++ if (natt && natt->encap_type) {
++ size += sizeof(struct sadb_x_nat_t_type);
++ size += sizeof(struct sadb_x_nat_t_port);
++ size += sizeof(struct sadb_x_nat_t_port);
++ }
++
++ skb = alloc_skb(size + 16, GFP_ATOMIC);
++ if (skb == NULL)
++ return ERR_PTR(-ENOBUFS);
++
++ /* call should fill header later */
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
++ memset(hdr, 0, size); /* XXX do we need this ? */
++ hdr->sadb_msg_len = size / sizeof(uint64_t);
++
++ /* sa */
++ sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa));
++ sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
++ sa->sadb_sa_exttype = SADB_EXT_SA;
++ sa->sadb_sa_spi = x->id.spi;
++ sa->sadb_sa_replay = x->props.replay_window;
++ sa->sadb_sa_state = SADB_SASTATE_DYING;
++ if (x->km.state == XFRM_STATE_VALID && !x->km.dying)
++ sa->sadb_sa_state = SADB_SASTATE_MATURE;
++ else if (x->km.state == XFRM_STATE_ACQ)
++ sa->sadb_sa_state = SADB_SASTATE_LARVAL;
++ else if (x->km.state == XFRM_STATE_EXPIRED)
++ sa->sadb_sa_state = SADB_SASTATE_DEAD;
++ sa->sadb_sa_auth = 0;
++ if (x->aalg) {
++ struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name);
++ sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0;
++ }
++ sa->sadb_sa_encrypt = 0;
++ BUG_ON(x->ealg && x->calg);
++ if (x->ealg) {
++ struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name);
++ sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
++ }
++ /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */
++ if (x->calg) {
++ struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name);
++ sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
++ }
++
++ sa->sadb_sa_flags = 0;
++ if (x->props.flags & XFRM_STATE_NOECN)
++ sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
++
++ /* hard time */
++ if (hsc & 2) {
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
++ lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit);
++ lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit);
++ lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds;
++ lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds;
++ }
++ /* soft time */
++ if (hsc & 1) {
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
++ lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit);
++ lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit);
++ lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds;
++ lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds;
++ }
++ /* current time */
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
++ lifetime->sadb_lifetime_allocations = x->curlft.packets;
++ lifetime->sadb_lifetime_bytes = x->curlft.bytes;
++ lifetime->sadb_lifetime_addtime = x->curlft.add_time;
++ lifetime->sadb_lifetime_usetime = x->curlft.use_time;
++ /* src address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
++ /* "if the ports are non-zero, then the sadb_address_proto field,
++ normally zero, MUST be filled in with the transport
++ protocol's number." - RFC2367 */
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ addr->sadb_address_prefixlen = 32;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->props.saddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, x->props.saddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* dst address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_prefixlen = 32; /* XXX */
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->id.daddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++
++ if (x->sel.saddr.a4 != x->props.saddr.a4) {
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
++ addr->sadb_address_proto =
++ pfkey_proto_from_xfrm(x->sel.proto);
++ addr->sadb_address_prefixlen = x->sel.prefixlen_s;
++ addr->sadb_address_reserved = 0;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->sel.saddr.a4;
++ sin->sin_port = x->sel.sport;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, x->id.daddr.a6, sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++
++ if (memcmp (x->sel.saddr.a6, x->props.saddr.a6,
++ sizeof(struct in6_addr))) {
++ addr = (struct sadb_address *) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
++ addr->sadb_address_proto =
++ pfkey_proto_from_xfrm(x->sel.proto);
++ addr->sadb_address_prefixlen = x->sel.prefixlen_s;
++ addr->sadb_address_reserved = 0;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = x->sel.sport;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, x->sel.saddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++ }
++#endif
++ else
++ BUG();
++
++ /* auth key */
++ if (add_keys && auth_key_size) {
++ key = (struct sadb_key *) skb_put(skb,
++ sizeof(struct sadb_key)+auth_key_size);
++ key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) /
++ sizeof(uint64_t);
++ key->sadb_key_exttype = SADB_EXT_KEY_AUTH;
++ key->sadb_key_bits = x->aalg->alg_key_len;
++ key->sadb_key_reserved = 0;
++ memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8);
++ }
++ /* encrypt key */
++ if (add_keys && encrypt_key_size) {
++ key = (struct sadb_key *) skb_put(skb,
++ sizeof(struct sadb_key)+encrypt_key_size);
++ key->sadb_key_len = (sizeof(struct sadb_key) +
++ encrypt_key_size) / sizeof(uint64_t);
++ key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
++ key->sadb_key_bits = x->ealg->alg_key_len;
++ key->sadb_key_reserved = 0;
++ memcpy(key + 1, x->ealg->alg_key,
++ (x->ealg->alg_key_len+7)/8);
++ }
++
++ /* sa */
++ sa2 = (struct sadb_x_sa2 *) skb_put(skb, sizeof(struct sadb_x_sa2));
++ sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t);
++ sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
++ sa2->sadb_x_sa2_mode = x->props.mode + 1;
++ sa2->sadb_x_sa2_reserved1 = 0;
++ sa2->sadb_x_sa2_reserved2 = 0;
++ sa2->sadb_x_sa2_sequence = 0;
++ sa2->sadb_x_sa2_reqid = x->props.reqid;
++
++ if (natt && natt->encap_type) {
++ struct sadb_x_nat_t_type *n_type;
++ struct sadb_x_nat_t_port *n_port;
++
++ /* type */
++ n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type));
++ n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t);
++ n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
++ n_type->sadb_x_nat_t_type_type = natt->encap_type;
++ n_type->sadb_x_nat_t_type_reserved[0] = 0;
++ n_type->sadb_x_nat_t_type_reserved[1] = 0;
++ n_type->sadb_x_nat_t_type_reserved[2] = 0;
++
++ /* source port */
++ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
++ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
++ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
++ n_port->sadb_x_nat_t_port_port = natt->encap_sport;
++ n_port->sadb_x_nat_t_port_reserved = 0;
++
++ /* dest port */
++ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
++ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
++ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
++ n_port->sadb_x_nat_t_port_port = natt->encap_dport;
++ n_port->sadb_x_nat_t_port_reserved = 0;
++ }
++
++ return skb;
++}
++
++static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
++ void **ext_hdrs)
++{
++ struct xfrm_state *x;
++ struct sadb_lifetime *lifetime;
++ struct sadb_sa *sa;
++ struct sadb_key *key;
++ uint16_t proto;
++ int err;
++
++
++ sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
++ if (!sa ||
++ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
++ return ERR_PTR(-EINVAL);
++ if (hdr->sadb_msg_satype == SADB_SATYPE_ESP &&
++ !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1])
++ return ERR_PTR(-EINVAL);
++ if (hdr->sadb_msg_satype == SADB_SATYPE_AH &&
++ !ext_hdrs[SADB_EXT_KEY_AUTH-1])
++ return ERR_PTR(-EINVAL);
++ if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] !=
++ !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1])
++ return ERR_PTR(-EINVAL);
++
++ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
++ if (proto == 0)
++ return ERR_PTR(-EINVAL);
++
++ /* default error is no buffer space */
++ err = -ENOBUFS;
++
++ /* RFC2367:
++
++ Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message.
++ SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not
++ sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state.
++ Therefore, the sadb_sa_state field of all submitted SAs MUST be
++ SADB_SASTATE_MATURE and the kernel MUST return an error if this is
++ not true.
++
++ However, KAME setkey always uses SADB_SASTATE_LARVAL.
++ Hence, we have to _ignore_ sadb_sa_state, which is also reasonable.
++ */
++ if (sa->sadb_sa_auth > SADB_AALG_MAX ||
++ (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP &&
++ sa->sadb_sa_encrypt > SADB_X_CALG_MAX) ||
++ sa->sadb_sa_encrypt > SADB_EALG_MAX)
++ return ERR_PTR(-EINVAL);
++ key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
++ if (key != NULL &&
++ sa->sadb_sa_auth != SADB_X_AALG_NULL &&
++ ((key->sadb_key_bits+7) / 8 == 0 ||
++ (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t)))
++ return ERR_PTR(-EINVAL);
++ key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
++ if (key != NULL &&
++ sa->sadb_sa_encrypt != SADB_EALG_NULL &&
++ ((key->sadb_key_bits+7) / 8 == 0 ||
++ (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t)))
++ return ERR_PTR(-EINVAL);
++
++ x = xfrm_state_alloc();
++ if (x == NULL)
++ return ERR_PTR(-ENOBUFS);
++
++ x->id.proto = proto;
++ x->id.spi = sa->sadb_sa_spi;
++ x->props.replay_window = sa->sadb_sa_replay;
++ if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN)
++ x->props.flags |= XFRM_STATE_NOECN;
++
++ lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
++ if (lifetime != NULL) {
++ x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
++ x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
++ x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
++ x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
++ }
++ lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1];
++ if (lifetime != NULL) {
++ x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
++ x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
++ x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
++ x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
++ }
++ key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
++ if (sa->sadb_sa_auth) {
++ int keysize = 0;
++ struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth);
++ if (!a) {
++ err = -ENOSYS;
++ goto out;
++ }
++ if (key)
++ keysize = (key->sadb_key_bits + 7) / 8;
++ x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL);
++ if (!x->aalg)
++ goto out;
++ strcpy(x->aalg->alg_name, a->name);
++ x->aalg->alg_key_len = 0;
++ if (key) {
++ x->aalg->alg_key_len = key->sadb_key_bits;
++ memcpy(x->aalg->alg_key, key+1, keysize);
++ }
++ x->props.aalgo = sa->sadb_sa_auth;
++ /* x->algo.flags = sa->sadb_sa_flags; */
++ }
++ if (sa->sadb_sa_encrypt) {
++ if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) {
++ struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt);
++ if (!a) {
++ err = -ENOSYS;
++ goto out;
++ }
++ x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL);
++ if (!x->calg)
++ goto out;
++ strcpy(x->calg->alg_name, a->name);
++ x->props.calgo = sa->sadb_sa_encrypt;
++ } else {
++ int keysize = 0;
++ struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt);
++ if (!a) {
++ err = -ENOSYS;
++ goto out;
++ }
++ key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
++ if (key)
++ keysize = (key->sadb_key_bits + 7) / 8;
++ x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL);
++ if (!x->ealg)
++ goto out;
++ strcpy(x->ealg->alg_name, a->name);
++ x->ealg->alg_key_len = 0;
++ if (key) {
++ x->ealg->alg_key_len = key->sadb_key_bits;
++ memcpy(x->ealg->alg_key, key+1, keysize);
++ }
++ x->props.ealgo = sa->sadb_sa_encrypt;
++ }
++ }
++ /* x->algo.flags = sa->sadb_sa_flags; */
++
++ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ &x->props.saddr);
++ if (!x->props.family) {
++ err = -EAFNOSUPPORT;
++ goto out;
++ }
++ pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1],
++ &x->id.daddr);
++
++ if (ext_hdrs[SADB_X_EXT_SA2-1]) {
++ struct sadb_x_sa2 *sa2 = (void*)ext_hdrs[SADB_X_EXT_SA2-1];
++ x->props.mode = sa2->sadb_x_sa2_mode;
++ if (x->props.mode)
++ x->props.mode--;
++ x->props.reqid = sa2->sadb_x_sa2_reqid;
++ }
++
++ if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) {
++ struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1];
++
++ /* Nobody uses this, but we try. */
++ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr);
++ x->sel.prefixlen_s = addr->sadb_address_prefixlen;
++ }
++
++ if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
++ struct sadb_x_nat_t_type* n_type;
++ struct xfrm_encap_tmpl *natt;
++
++ x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
++ if (!x->encap)
++ goto out;
++
++ natt = x->encap;
++ n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1];
++ natt->encap_type = n_type->sadb_x_nat_t_type_type;
++
++ if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) {
++ struct sadb_x_nat_t_port* n_port =
++ ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1];
++ natt->encap_sport = n_port->sadb_x_nat_t_port_port;
++ }
++ if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) {
++ struct sadb_x_nat_t_port* n_port =
++ ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1];
++ natt->encap_dport = n_port->sadb_x_nat_t_port_port;
++ }
++ }
++
++ x->type = xfrm_get_type(proto, x->props.family);
++ if (x->type == NULL) {
++ err = -ENOPROTOOPT;
++ goto out;
++ }
++ if (x->type->init_state(x, NULL)) {
++ err = -EINVAL;
++ goto out;
++ }
++ x->km.seq = hdr->sadb_msg_seq;
++ x->km.state = XFRM_STATE_VALID;
++ return x;
++
++out:
++ x->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(x);
++ return ERR_PTR(err);
++}
++
++static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ return -EOPNOTSUPP;
++}
++
++static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct sk_buff *resp_skb;
++ struct sadb_x_sa2 *sa2;
++ struct sadb_address *saddr, *daddr;
++ struct sadb_msg *out_hdr;
++ struct xfrm_state *x = NULL;
++ u8 mode;
++ u32 reqid;
++ u8 proto;
++ unsigned short family;
++ xfrm_address_t *xsaddr = NULL, *xdaddr = NULL;
++
++ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
++ return -EINVAL;
++
++ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
++ if (proto == 0)
++ return -EINVAL;
++
++ if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) {
++ mode = sa2->sadb_x_sa2_mode - 1;
++ reqid = sa2->sadb_x_sa2_reqid;
++ } else {
++ mode = 0;
++ reqid = 0;
++ }
++
++ saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
++ daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
++
++ family = ((struct sockaddr *)(saddr + 1))->sa_family;
++ switch (family) {
++ case AF_INET:
++ xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr;
++ xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr;
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr;
++ xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr;
++ break;
++#endif
++ }
++
++ if (hdr->sadb_msg_seq) {
++ x = xfrm_find_acq_byseq(hdr->sadb_msg_seq);
++ if (x && xfrm_addr_cmp(&x->id.daddr, xdaddr, family)) {
++ xfrm_state_put(x);
++ x = NULL;
++ }
++ }
++
++ if (!x)
++ x = xfrm_find_acq(mode, reqid, proto, xdaddr, xsaddr, 1, family);
++
++ if (x == NULL)
++ return -ENOENT;
++
++ resp_skb = ERR_PTR(-ENOENT);
++
++ spin_lock_bh(&x->lock);
++ if (x->km.state != XFRM_STATE_DEAD) {
++ struct sadb_spirange *range = ext_hdrs[SADB_EXT_SPIRANGE-1];
++ u32 min_spi, max_spi;
++
++ if (range != NULL) {
++ min_spi = range->sadb_spirange_min;
++ max_spi = range->sadb_spirange_max;
++ } else {
++ min_spi = htonl(0x100);
++ max_spi = htonl(0x0fffffff);
++ }
++ xfrm_alloc_spi(x, min_spi, max_spi);
++ if (x->id.spi)
++ resp_skb = pfkey_xfrm_state2msg(x, 0, 3);
++ }
++ spin_unlock_bh(&x->lock);
++
++ if (IS_ERR(resp_skb)) {
++ xfrm_state_put(x);
++ return PTR_ERR(resp_skb);
++ }
++
++ out_hdr = (struct sadb_msg *) resp_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = SADB_GETSPI;
++ out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_reserved = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++
++ xfrm_state_put(x);
++
++ pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk);
++
++ return 0;
++}
++
++static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct xfrm_state *x;
++
++ if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8)
++ return -EOPNOTSUPP;
++
++ if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0)
++ return 0;
++
++ x = xfrm_find_acq_byseq(hdr->sadb_msg_seq);
++ if (x == NULL)
++ return 0;
++
++ spin_lock_bh(&x->lock);
++ if (x->km.state == XFRM_STATE_ACQ) {
++ x->km.state = XFRM_STATE_ERROR;
++ wake_up(&km_waitq);
++ }
++ spin_unlock_bh(&x->lock);
++ xfrm_state_put(x);
++ return 0;
++}
++
++
++static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++ struct xfrm_state *x;
++ int err;
++
++ xfrm_probe_algs();
++
++ x = pfkey_msg2xfrm_state(hdr, ext_hdrs);
++ if (IS_ERR(x))
++ return PTR_ERR(x);
++
++ if (hdr->sadb_msg_type == SADB_ADD)
++ err = xfrm_state_add(x);
++ else
++ err = xfrm_state_update(x);
++
++ if (err < 0) {
++ x->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(x);
++ return err;
++ }
++
++ out_skb = pfkey_xfrm_state2msg(x, 0, 3);
++ if (IS_ERR(out_skb))
++ return PTR_ERR(out_skb); /* XXX Should we return 0 here ? */
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = hdr->sadb_msg_type;
++ out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_reserved = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
++
++ return 0;
++}
++
++static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct xfrm_state *x;
++
++ if (!ext_hdrs[SADB_EXT_SA-1] ||
++ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
++ return -EINVAL;
++
++ x = pfkey_xfrm_state_lookup(hdr, ext_hdrs);
++ if (x == NULL)
++ return -ESRCH;
++
++ if (xfrm_state_kern(x)) {
++ xfrm_state_put(x);
++ return -EPERM;
++ }
++
++ xfrm_state_delete(x);
++ xfrm_state_put(x);
++
++ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
++ BROADCAST_ALL, sk);
++
++ return 0;
++}
++
++static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ __u8 proto;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++ struct xfrm_state *x;
++
++ if (!ext_hdrs[SADB_EXT_SA-1] ||
++ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
++ return -EINVAL;
++
++ x = pfkey_xfrm_state_lookup(hdr, ext_hdrs);
++ if (x == NULL)
++ return -ESRCH;
++
++ out_skb = pfkey_xfrm_state2msg(x, 1, 3);
++ proto = x->id.proto;
++ xfrm_state_put(x);
++ if (IS_ERR(out_skb))
++ return PTR_ERR(out_skb);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = SADB_DUMP;
++ out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_reserved = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
++
++ return 0;
++}
++
++static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig, int allocation)
++{
++ struct sk_buff *skb;
++ struct sadb_msg *hdr;
++ int len, auth_len, enc_len, i;
++
++ auth_len = xfrm_count_auth_supported();
++ if (auth_len) {
++ auth_len *= sizeof(struct sadb_alg);
++ auth_len += sizeof(struct sadb_supported);
++ }
++
++ enc_len = xfrm_count_enc_supported();
++ if (enc_len) {
++ enc_len *= sizeof(struct sadb_alg);
++ enc_len += sizeof(struct sadb_supported);
++ }
++
++ len = enc_len + auth_len + sizeof(struct sadb_msg);
++
++ skb = alloc_skb(len + 16, allocation);
++ if (!skb)
++ goto out_put_algs;
++
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr));
++ pfkey_hdr_dup(hdr, orig);
++ hdr->sadb_msg_errno = 0;
++ hdr->sadb_msg_len = len / sizeof(uint64_t);
++
++ if (auth_len) {
++ struct sadb_supported *sp;
++ struct sadb_alg *ap;
++
++ sp = (struct sadb_supported *) skb_put(skb, auth_len);
++ ap = (struct sadb_alg *) (sp + 1);
++
++ sp->sadb_supported_len = auth_len / sizeof(uint64_t);
++ sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
++
++ for (i = 0; ; i++) {
++ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
++ if (!aalg)
++ break;
++ if (aalg->available)
++ *ap++ = aalg->desc;
++ }
++ }
++
++ if (enc_len) {
++ struct sadb_supported *sp;
++ struct sadb_alg *ap;
++
++ sp = (struct sadb_supported *) skb_put(skb, enc_len);
++ ap = (struct sadb_alg *) (sp + 1);
++
++ sp->sadb_supported_len = enc_len / sizeof(uint64_t);
++ sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
++
++ for (i = 0; ; i++) {
++ struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
++ if (!ealg)
++ break;
++ if (ealg->available)
++ *ap++ = ealg->desc;
++ }
++ }
++
++out_put_algs:
++ return skb;
++}
++
++static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct pfkey_opt *pfk = pfkey_sk(sk);
++ struct sk_buff *supp_skb;
++
++ if (hdr->sadb_msg_satype > SADB_SATYPE_MAX)
++ return -EINVAL;
++
++ if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) {
++ if (pfk->registered&(1<<hdr->sadb_msg_satype))
++ return -EEXIST;
++ pfk->registered |= (1<<hdr->sadb_msg_satype);
++ }
++
++ xfrm_probe_algs();
++
++ supp_skb = compose_sadb_supported(hdr, GFP_KERNEL);
++ if (!supp_skb) {
++ if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC)
++ pfk->registered &= ~(1<<hdr->sadb_msg_satype);
++
++ return -ENOBUFS;
++ }
++
++ pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk);
++
++ return 0;
++}
++
++static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ unsigned proto;
++ struct sk_buff *skb_out;
++ struct sadb_msg *hdr_out;
++
++ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
++ if (proto == 0)
++ return -EINVAL;
++
++ skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
++ if (!skb_out)
++ return -ENOBUFS;
++
++ xfrm_state_flush(proto);
++
++ hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
++ pfkey_hdr_dup(hdr_out, hdr);
++ hdr_out->sadb_msg_errno = (uint8_t) 0;
++ hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
++
++ pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
++
++ return 0;
++}
++
++struct pfkey_dump_data
++{
++ struct sk_buff *skb;
++ struct sadb_msg *hdr;
++ struct sock *sk;
++};
++
++static int dump_sa(struct xfrm_state *x, int count, void *ptr)
++{
++ struct pfkey_dump_data *data = ptr;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++
++ out_skb = pfkey_xfrm_state2msg(x, 1, 3);
++ if (IS_ERR(out_skb))
++ return PTR_ERR(out_skb);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = data->hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = SADB_DUMP;
++ out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_reserved = 0;
++ out_hdr->sadb_msg_seq = count;
++ out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk);
++ return 0;
++}
++
++static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ u8 proto;
++ struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
++
++ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
++ if (proto == 0)
++ return -EINVAL;
++
++ return xfrm_state_walk(proto, dump_sa, &data);
++}
++
++static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct pfkey_opt *pfk = pfkey_sk(sk);
++ int satype = hdr->sadb_msg_satype;
++
++ if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) {
++ /* XXX we mangle packet... */
++ hdr->sadb_msg_errno = 0;
++ if (satype != 0 && satype != 1)
++ return -EINVAL;
++ pfk->promisc = satype;
++ }
++ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_ALL, NULL);
++ return 0;
++}
++
++static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr)
++{
++ int i;
++ u32 reqid = *(u32*)ptr;
++
++ for (i=0; i<xp->xfrm_nr; i++) {
++ if (xp->xfrm_vec[i].reqid == reqid)
++ return -EEXIST;
++ }
++ return 0;
++}
++
++static u32 gen_reqid(void)
++{
++ u32 start;
++ static u32 reqid = IPSEC_MANUAL_REQID_MAX;
++
++ start = reqid;
++ do {
++ ++reqid;
++ if (reqid == 0)
++ reqid = IPSEC_MANUAL_REQID_MAX+1;
++ if (xfrm_policy_walk(check_reqid, (void*)&reqid) != -EEXIST)
++ return reqid;
++ } while (reqid != start);
++ return 0;
++}
++
++static int
++parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
++{
++ struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr;
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++
++ if (xp->xfrm_nr >= XFRM_MAX_DEPTH)
++ return -ELOOP;
++
++ if (rq->sadb_x_ipsecrequest_mode == 0)
++ return -EINVAL;
++
++ t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */
++ t->mode = rq->sadb_x_ipsecrequest_mode-1;
++ if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE)
++ t->optional = 1;
++ else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
++ t->reqid = rq->sadb_x_ipsecrequest_reqid;
++ if (t->reqid > IPSEC_MANUAL_REQID_MAX)
++ t->reqid = 0;
++ if (!t->reqid && !(t->reqid = gen_reqid()))
++ return -ENOBUFS;
++ }
++
++ /* addresses present only in tunnel mode */
++ if (t->mode) {
++ switch (xp->family) {
++ case AF_INET:
++ sin = (void*)(rq+1);
++ if (sin->sin_family != AF_INET)
++ return -EINVAL;
++ t->saddr.a4 = sin->sin_addr.s_addr;
++ sin++;
++ if (sin->sin_family != AF_INET)
++ return -EINVAL;
++ t->id.daddr.a4 = sin->sin_addr.s_addr;
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ sin6 = (void *)(rq+1);
++ if (sin6->sin6_family != AF_INET6)
++ return -EINVAL;
++ memcpy(t->saddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr));
++ sin6++;
++ if (sin6->sin6_family != AF_INET6)
++ return -EINVAL;
++ memcpy(t->id.daddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr));
++ break;
++#endif
++ default:
++ return -EINVAL;
++ }
++ }
++ /* No way to set this via kame pfkey */
++ t->aalgos = t->ealgos = t->calgos = ~0;
++ xp->xfrm_nr++;
++ return 0;
++}
++
++static int
++parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
++{
++ int err;
++ int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy);
++ struct sadb_x_ipsecrequest *rq = (void*)(pol+1);
++
++ while (len >= sizeof(struct sadb_x_ipsecrequest)) {
++ if ((err = parse_ipsecrequest(xp, rq)) < 0)
++ return err;
++ len -= rq->sadb_x_ipsecrequest_len;
++ rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len);
++ }
++ return 0;
++}
++
++static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
++{
++ int sockaddr_size = pfkey_sockaddr_size(xp->family);
++ int socklen = (xp->family == AF_INET ?
++ sizeof(struct sockaddr_in) :
++ sizeof(struct sockaddr_in6));
++
++ return sizeof(struct sadb_msg) +
++ (sizeof(struct sadb_lifetime) * 3) +
++ (sizeof(struct sadb_address) * 2) +
++ (sockaddr_size * 2) +
++ sizeof(struct sadb_x_policy) +
++ (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
++ (socklen * 2)));
++}
++
++static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
++{
++ struct sk_buff *skb;
++ int size;
++
++ size = pfkey_xfrm_policy2msg_size(xp);
++
++ skb = alloc_skb(size + 16, GFP_ATOMIC);
++ if (skb == NULL)
++ return ERR_PTR(-ENOBUFS);
++
++ return skb;
++}
++
++static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, int dir)
++{
++ struct sadb_msg *hdr;
++ struct sadb_address *addr;
++ struct sadb_lifetime *lifetime;
++ struct sadb_x_policy *pol;
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++ int i;
++ int size;
++ int sockaddr_size = pfkey_sockaddr_size(xp->family);
++ int socklen = (xp->family == AF_INET ?
++ sizeof(struct sockaddr_in) :
++ sizeof(struct sockaddr_in6));
++
++ size = pfkey_xfrm_policy2msg_size(xp);
++
++ /* call should fill header later */
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
++ memset(hdr, 0, size); /* XXX do we need this ? */
++
++ /* src address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
++ addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
++ addr->sadb_address_prefixlen = xp->selector.prefixlen_s;
++ addr->sadb_address_reserved = 0;
++ /* src address */
++ if (xp->family == AF_INET) {
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = xp->selector.saddr.a4;
++ sin->sin_port = xp->selector.sport;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (xp->family == AF_INET6) {
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = xp->selector.sport;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, xp->selector.saddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* dst address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
++ addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
++ addr->sadb_address_prefixlen = xp->selector.prefixlen_d;
++ addr->sadb_address_reserved = 0;
++ if (xp->family == AF_INET) {
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = xp->selector.daddr.a4;
++ sin->sin_port = xp->selector.dport;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (xp->family == AF_INET6) {
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = xp->selector.dport;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, xp->selector.daddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* hard time */
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
++ lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit);
++ lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit);
++ lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds;
++ lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds;
++ /* soft time */
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
++ lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit);
++ lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit);
++ lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds;
++ lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds;
++ /* current time */
++ lifetime = (struct sadb_lifetime *) skb_put(skb,
++ sizeof(struct sadb_lifetime));
++ lifetime->sadb_lifetime_len =
++ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
++ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
++ lifetime->sadb_lifetime_allocations = xp->curlft.packets;
++ lifetime->sadb_lifetime_bytes = xp->curlft.bytes;
++ lifetime->sadb_lifetime_addtime = xp->curlft.add_time;
++ lifetime->sadb_lifetime_usetime = xp->curlft.use_time;
++
++ pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy));
++ pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
++ pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
++ pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD;
++ if (xp->action == XFRM_POLICY_ALLOW) {
++ if (xp->xfrm_nr)
++ pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
++ else
++ pol->sadb_x_policy_type = IPSEC_POLICY_NONE;
++ }
++ pol->sadb_x_policy_dir = dir+1;
++ pol->sadb_x_policy_id = xp->index;
++ pol->sadb_x_policy_priority = xp->priority;
++
++ for (i=0; i<xp->xfrm_nr; i++) {
++ struct sadb_x_ipsecrequest *rq;
++ struct xfrm_tmpl *t = xp->xfrm_vec + i;
++ int req_size;
++
++ req_size = sizeof(struct sadb_x_ipsecrequest);
++ if (t->mode)
++ req_size += 2*socklen;
++ else
++ size -= 2*socklen;
++ rq = (void*)skb_put(skb, req_size);
++ pol->sadb_x_policy_len += req_size/8;
++ memset(rq, 0, sizeof(*rq));
++ rq->sadb_x_ipsecrequest_len = req_size;
++ rq->sadb_x_ipsecrequest_proto = t->id.proto;
++ rq->sadb_x_ipsecrequest_mode = t->mode+1;
++ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE;
++ if (t->reqid)
++ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE;
++ if (t->optional)
++ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE;
++ rq->sadb_x_ipsecrequest_reqid = t->reqid;
++ if (t->mode) {
++ switch (xp->family) {
++ case AF_INET:
++ sin = (void*)(rq+1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = t->saddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ sin++;
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = t->id.daddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ sin6 = (void*)(rq+1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, t->saddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++
++ sin6++;
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, t->id.daddr.a6,
++ sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ break;
++#endif
++ default:
++ break;
++ }
++ }
++ }
++ hdr->sadb_msg_len = size / sizeof(uint64_t);
++ hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
++}
++
++static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ int err;
++ struct sadb_lifetime *lifetime;
++ struct sadb_address *sa;
++ struct sadb_x_policy *pol;
++ struct xfrm_policy *xp;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++
++ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
++ !ext_hdrs[SADB_X_EXT_POLICY-1])
++ return -EINVAL;
++
++ pol = ext_hdrs[SADB_X_EXT_POLICY-1];
++ if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC)
++ return -EINVAL;
++ if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
++ return -EINVAL;
++
++ xp = xfrm_policy_alloc(GFP_KERNEL);
++ if (xp == NULL)
++ return -ENOBUFS;
++
++ xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
++ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
++ xp->priority = pol->sadb_x_policy_priority;
++
++ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
++ if (!xp->family) {
++ err = -EINVAL;
++ goto out;
++ }
++ xp->selector.family = xp->family;
++ xp->selector.prefixlen_s = sa->sadb_address_prefixlen;
++ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
++ xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
++ if (xp->selector.sport)
++ xp->selector.sport_mask = ~0;
++
++ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
++ pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr);
++ xp->selector.prefixlen_d = sa->sadb_address_prefixlen;
++
++ /* Amusing, we set this twice. KAME apps appear to set same value
++ * in both addresses.
++ */
++ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
++
++ xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
++ if (xp->selector.dport)
++ xp->selector.dport_mask = ~0;
++
++ xp->lft.soft_byte_limit = XFRM_INF;
++ xp->lft.hard_byte_limit = XFRM_INF;
++ xp->lft.soft_packet_limit = XFRM_INF;
++ xp->lft.hard_packet_limit = XFRM_INF;
++ if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) {
++ xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
++ xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
++ xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
++ xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
++ }
++ if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) {
++ xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
++ xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
++ xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
++ xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
++ }
++ xp->xfrm_nr = 0;
++ if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
++ (err = parse_ipsecrequests(xp, pol)) < 0)
++ goto out;
++
++ out_skb = pfkey_xfrm_policy2msg_prep(xp);
++ if (IS_ERR(out_skb)) {
++ err = PTR_ERR(out_skb);
++ goto out;
++ }
++
++ err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
++ hdr->sadb_msg_type != SADB_X_SPDUPDATE);
++ if (err) {
++ kfree_skb(out_skb);
++ goto out;
++ }
++
++ pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
++
++ xfrm_pol_put(xp);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = hdr->sadb_msg_type;
++ out_hdr->sadb_msg_satype = 0;
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
++ return 0;
++
++out:
++ kfree(xp);
++ return err;
++}
++
++static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ int err;
++ struct sadb_address *sa;
++ struct sadb_x_policy *pol;
++ struct xfrm_policy *xp;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++ struct xfrm_selector sel;
++
++ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
++ !ext_hdrs[SADB_X_EXT_POLICY-1])
++ return -EINVAL;
++
++ pol = ext_hdrs[SADB_X_EXT_POLICY-1];
++ if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
++ return -EINVAL;
++
++ memset(&sel, 0, sizeof(sel));
++
++ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
++ sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
++ sel.prefixlen_s = sa->sadb_address_prefixlen;
++ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
++ sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
++ if (sel.sport)
++ sel.sport_mask = ~0;
++
++ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
++ pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
++ sel.prefixlen_d = sa->sadb_address_prefixlen;
++ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
++ sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
++ if (sel.dport)
++ sel.dport_mask = ~0;
++
++ xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
++ if (xp == NULL)
++ return -ENOENT;
++
++ err = 0;
++
++ out_skb = pfkey_xfrm_policy2msg_prep(xp);
++ if (IS_ERR(out_skb)) {
++ err = PTR_ERR(out_skb);
++ goto out;
++ }
++ pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = SADB_X_SPDDELETE;
++ out_hdr->sadb_msg_satype = 0;
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
++ err = 0;
++
++out:
++ xfrm_pol_put(xp);
++ return err;
++}
++
++static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ int err;
++ struct sadb_x_policy *pol;
++ struct xfrm_policy *xp;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++
++ if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
++ return -EINVAL;
++
++ xp = xfrm_policy_byid(0, pol->sadb_x_policy_id,
++ hdr->sadb_msg_type == SADB_X_SPDDELETE2);
++ if (xp == NULL)
++ return -ENOENT;
++
++ err = 0;
++
++ out_skb = pfkey_xfrm_policy2msg_prep(xp);
++ if (IS_ERR(out_skb)) {
++ err = PTR_ERR(out_skb);
++ goto out;
++ }
++ pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = hdr->sadb_msg_type;
++ out_hdr->sadb_msg_satype = 0;
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
++ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
++ err = 0;
++
++out:
++ xfrm_pol_put(xp);
++ return err;
++}
++
++static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
++{
++ struct pfkey_dump_data *data = ptr;
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++
++ out_skb = pfkey_xfrm_policy2msg_prep(xp);
++ if (IS_ERR(out_skb))
++ return PTR_ERR(out_skb);
++
++ pfkey_xfrm_policy2msg(out_skb, xp, dir);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = data->hdr->sadb_msg_version;
++ out_hdr->sadb_msg_type = SADB_X_SPDDUMP;
++ out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_seq = count;
++ out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid;
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk);
++ return 0;
++}
++
++static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
++
++ return xfrm_policy_walk(dump_sp, &data);
++}
++
++static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
++{
++ struct sk_buff *skb_out;
++ struct sadb_msg *hdr_out;
++
++ skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
++ if (!skb_out)
++ return -ENOBUFS;
++
++ xfrm_policy_flush();
++
++ hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
++ pfkey_hdr_dup(hdr_out, hdr);
++ hdr_out->sadb_msg_errno = (uint8_t) 0;
++ hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
++ pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
++
++ return 0;
++}
++
++typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
++ struct sadb_msg *hdr, void **ext_hdrs);
++static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
++ [SADB_RESERVED] = pfkey_reserved,
++ [SADB_GETSPI] = pfkey_getspi,
++ [SADB_UPDATE] = pfkey_add,
++ [SADB_ADD] = pfkey_add,
++ [SADB_DELETE] = pfkey_delete,
++ [SADB_GET] = pfkey_get,
++ [SADB_ACQUIRE] = pfkey_acquire,
++ [SADB_REGISTER] = pfkey_register,
++ [SADB_EXPIRE] = NULL,
++ [SADB_FLUSH] = pfkey_flush,
++ [SADB_DUMP] = pfkey_dump,
++ [SADB_X_PROMISC] = pfkey_promisc,
++ [SADB_X_PCHANGE] = NULL,
++ [SADB_X_SPDUPDATE] = pfkey_spdadd,
++ [SADB_X_SPDADD] = pfkey_spdadd,
++ [SADB_X_SPDDELETE] = pfkey_spddelete,
++ [SADB_X_SPDGET] = pfkey_spdget,
++ [SADB_X_SPDACQUIRE] = NULL,
++ [SADB_X_SPDDUMP] = pfkey_spddump,
++ [SADB_X_SPDFLUSH] = pfkey_spdflush,
++ [SADB_X_SPDSETIDX] = pfkey_spdadd,
++ [SADB_X_SPDDELETE2] = pfkey_spdget,
++};
++
++static int pfkey_process(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr)
++{
++ void *ext_hdrs[SADB_EXT_MAX];
++ int err;
++
++ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
++ BROADCAST_PROMISC_ONLY, NULL);
++
++ memset(ext_hdrs, 0, sizeof(ext_hdrs));
++ err = parse_exthdrs(skb, hdr, ext_hdrs);
++ if (!err) {
++ err = -EOPNOTSUPP;
++ if (pfkey_funcs[hdr->sadb_msg_type])
++ err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs);
++ }
++ return err;
++}
++
++static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
++{
++ struct sadb_msg *hdr = NULL;
++
++ if (skb->len < sizeof(*hdr)) {
++ *errp = -EMSGSIZE;
++ } else {
++ hdr = (struct sadb_msg *) skb->data;
++ if (hdr->sadb_msg_version != PF_KEY_V2 ||
++ hdr->sadb_msg_reserved != 0 ||
++ (hdr->sadb_msg_type <= SADB_RESERVED ||
++ hdr->sadb_msg_type > SADB_MAX)) {
++ hdr = NULL;
++ *errp = -EINVAL;
++ } else if (hdr->sadb_msg_len != (skb->len /
++ sizeof(uint64_t)) ||
++ hdr->sadb_msg_len < (sizeof(struct sadb_msg) /
++ sizeof(uint64_t))) {
++ hdr = NULL;
++ *errp = -EMSGSIZE;
++ } else {
++ *errp = 0;
++ }
++ }
++ return hdr;
++}
++
++static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
++{
++ return t->aalgos & (1 << d->desc.sadb_alg_id);
++}
++
++static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d)
++{
++ return t->ealgos & (1 << d->desc.sadb_alg_id);
++}
++
++static int count_ah_combs(struct xfrm_tmpl *t)
++{
++ int i, sz = 0;
++
++ for (i = 0; ; i++) {
++ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
++ if (!aalg)
++ break;
++ if (aalg_tmpl_set(t, aalg) && aalg->available)
++ sz += sizeof(struct sadb_comb);
++ }
++ return sz + sizeof(struct sadb_prop);
++}
++
++static int count_esp_combs(struct xfrm_tmpl *t)
++{
++ int i, k, sz = 0;
++
++ for (i = 0; ; i++) {
++ struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
++ if (!ealg)
++ break;
++
++ if (!(ealg_tmpl_set(t, ealg) && ealg->available))
++ continue;
++
++ for (k = 1; ; k++) {
++ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
++ if (!aalg)
++ break;
++
++ if (aalg_tmpl_set(t, aalg) && aalg->available)
++ sz += sizeof(struct sadb_comb);
++ }
++ }
++ return sz + sizeof(struct sadb_prop);
++}
++
++static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
++{
++ struct sadb_prop *p;
++ int i;
++
++ p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
++ p->sadb_prop_len = sizeof(struct sadb_prop)/8;
++ p->sadb_prop_exttype = SADB_EXT_PROPOSAL;
++ p->sadb_prop_replay = 32;
++ memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
++
++ for (i = 0; ; i++) {
++ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
++ if (!aalg)
++ break;
++
++ if (aalg_tmpl_set(t, aalg) && aalg->available) {
++ struct sadb_comb *c;
++ c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
++ memset(c, 0, sizeof(*c));
++ p->sadb_prop_len += sizeof(struct sadb_comb)/8;
++ c->sadb_comb_auth = aalg->desc.sadb_alg_id;
++ c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
++ c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
++ c->sadb_comb_hard_addtime = 24*60*60;
++ c->sadb_comb_soft_addtime = 20*60*60;
++ c->sadb_comb_hard_usetime = 8*60*60;
++ c->sadb_comb_soft_usetime = 7*60*60;
++ }
++ }
++}
++
++static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
++{
++ struct sadb_prop *p;
++ int i, k;
++
++ p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
++ p->sadb_prop_len = sizeof(struct sadb_prop)/8;
++ p->sadb_prop_exttype = SADB_EXT_PROPOSAL;
++ p->sadb_prop_replay = 32;
++ memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
++
++ for (i=0; ; i++) {
++ struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
++ if (!ealg)
++ break;
++
++ if (!(ealg_tmpl_set(t, ealg) && ealg->available))
++ continue;
++
++ for (k = 1; ; k++) {
++ struct sadb_comb *c;
++ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
++ if (!aalg)
++ break;
++ if (!(aalg_tmpl_set(t, aalg) && aalg->available))
++ continue;
++ c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
++ memset(c, 0, sizeof(*c));
++ p->sadb_prop_len += sizeof(struct sadb_comb)/8;
++ c->sadb_comb_auth = aalg->desc.sadb_alg_id;
++ c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
++ c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
++ c->sadb_comb_encrypt = ealg->desc.sadb_alg_id;
++ c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits;
++ c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits;
++ c->sadb_comb_hard_addtime = 24*60*60;
++ c->sadb_comb_soft_addtime = 20*60*60;
++ c->sadb_comb_hard_usetime = 8*60*60;
++ c->sadb_comb_soft_usetime = 7*60*60;
++ }
++ }
++}
++
++static int pfkey_send_notify(struct xfrm_state *x, int hard)
++{
++ struct sk_buff *out_skb;
++ struct sadb_msg *out_hdr;
++ int hsc = (hard ? 2 : 1);
++
++ out_skb = pfkey_xfrm_state2msg(x, 0, hsc);
++ if (IS_ERR(out_skb))
++ return PTR_ERR(out_skb);
++
++ out_hdr = (struct sadb_msg *) out_skb->data;
++ out_hdr->sadb_msg_version = PF_KEY_V2;
++ out_hdr->sadb_msg_type = SADB_EXPIRE;
++ out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
++ out_hdr->sadb_msg_errno = 0;
++ out_hdr->sadb_msg_reserved = 0;
++ out_hdr->sadb_msg_seq = 0;
++ out_hdr->sadb_msg_pid = 0;
++
++ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL);
++ return 0;
++}
++
++static u32 get_acqseq(void)
++{
++ u32 res;
++ static u32 acqseq;
++ static spinlock_t acqseq_lock = SPIN_LOCK_UNLOCKED;
++
++ spin_lock_bh(&acqseq_lock);
++ res = (++acqseq ? : ++acqseq);
++ spin_unlock_bh(&acqseq_lock);
++ return res;
++}
++
++static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp, int dir)
++{
++ struct sk_buff *skb;
++ struct sadb_msg *hdr;
++ struct sadb_address *addr;
++ struct sadb_x_policy *pol;
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++ int sockaddr_size;
++ int size;
++
++ sockaddr_size = pfkey_sockaddr_size(x->props.family);
++ if (!sockaddr_size)
++ return -EINVAL;
++
++ size = sizeof(struct sadb_msg) +
++ (sizeof(struct sadb_address) * 2) +
++ (sockaddr_size * 2) +
++ sizeof(struct sadb_x_policy);
++
++ if (x->id.proto == IPPROTO_AH)
++ size += count_ah_combs(t);
++ else if (x->id.proto == IPPROTO_ESP)
++ size += count_esp_combs(t);
++
++ skb = alloc_skb(size + 16, GFP_ATOMIC);
++ if (skb == NULL)
++ return -ENOMEM;
++
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
++ hdr->sadb_msg_version = PF_KEY_V2;
++ hdr->sadb_msg_type = SADB_ACQUIRE;
++ hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
++ hdr->sadb_msg_len = size / sizeof(uint64_t);
++ hdr->sadb_msg_errno = 0;
++ hdr->sadb_msg_reserved = 0;
++ hdr->sadb_msg_seq = x->km.seq = get_acqseq();
++ hdr->sadb_msg_pid = 0;
++
++ /* src address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ addr->sadb_address_prefixlen = 32;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->props.saddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr,
++ x->props.saddr.a6, sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* dst address */
++ addr = (struct sadb_address*) skb_put(skb,
++ sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ addr->sadb_address_prefixlen = 32;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->id.daddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr,
++ x->id.daddr.a6, sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy));
++ pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
++ pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
++ pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
++ pol->sadb_x_policy_dir = dir+1;
++ pol->sadb_x_policy_id = xp->index;
++
++ /* Set sadb_comb's. */
++ if (x->id.proto == IPPROTO_AH)
++ dump_ah_combs(skb, t);
++ else if (x->id.proto == IPPROTO_ESP)
++ dump_esp_combs(skb, t);
++
++ return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL);
++}
++
++static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
++ u8 *data, int len, int *dir)
++{
++ struct xfrm_policy *xp;
++ struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
++
++ switch (family) {
++ case AF_INET:
++ if (opt != IP_IPSEC_POLICY) {
++ *dir = -EOPNOTSUPP;
++ return NULL;
++ }
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ if (opt != IPV6_IPSEC_POLICY) {
++ *dir = -EOPNOTSUPP;
++ return NULL;
++ }
++ break;
++#endif
++ default:
++ *dir = -EINVAL;
++ return NULL;
++ }
++
++ *dir = -EINVAL;
++
++ if (len < sizeof(struct sadb_x_policy) ||
++ pol->sadb_x_policy_len*8 > len ||
++ pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS ||
++ (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND))
++ return NULL;
++
++ xp = xfrm_policy_alloc(GFP_ATOMIC);
++ if (xp == NULL) {
++ *dir = -ENOBUFS;
++ return NULL;
++ }
++
++ xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
++ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
++
++ xp->lft.soft_byte_limit = XFRM_INF;
++ xp->lft.hard_byte_limit = XFRM_INF;
++ xp->lft.soft_packet_limit = XFRM_INF;
++ xp->lft.hard_packet_limit = XFRM_INF;
++ xp->family = family;
++
++ xp->xfrm_nr = 0;
++ if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
++ (*dir = parse_ipsecrequests(xp, pol)) < 0)
++ goto out;
++
++ *dir = pol->sadb_x_policy_dir-1;
++ return xp;
++
++out:
++ kfree(xp);
++ return NULL;
++}
++
++static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport)
++{
++ struct sk_buff *skb;
++ struct sadb_msg *hdr;
++ struct sadb_sa *sa;
++ struct sadb_address *addr;
++ struct sadb_x_nat_t_port *n_port;
++ struct sockaddr_in *sin;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct sockaddr_in6 *sin6;
++#endif
++ int sockaddr_size;
++ int size;
++ __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0);
++ struct xfrm_encap_tmpl *natt = NULL;
++
++ sockaddr_size = pfkey_sockaddr_size(x->props.family);
++ if (!sockaddr_size)
++ return -EINVAL;
++
++ if (!satype)
++ return -EINVAL;
++
++ if (!x->encap)
++ return -EINVAL;
++
++ natt = x->encap;
++
++ /* Build an SADB_X_NAT_T_NEW_MAPPING message:
++ *
++ * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) |
++ * ADDRESS_DST (new addr) | NAT_T_DPORT (new port)
++ */
++
++ size = sizeof(struct sadb_msg) +
++ sizeof(struct sadb_sa) +
++ (sizeof(struct sadb_address) * 2) +
++ (sockaddr_size * 2) +
++ (sizeof(struct sadb_x_nat_t_port) * 2);
++
++ skb = alloc_skb(size + 16, GFP_ATOMIC);
++ if (skb == NULL)
++ return -ENOMEM;
++
++ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
++ hdr->sadb_msg_version = PF_KEY_V2;
++ hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING;
++ hdr->sadb_msg_satype = satype;
++ hdr->sadb_msg_len = size / sizeof(uint64_t);
++ hdr->sadb_msg_errno = 0;
++ hdr->sadb_msg_reserved = 0;
++ hdr->sadb_msg_seq = x->km.seq = get_acqseq();
++ hdr->sadb_msg_pid = 0;
++
++ /* SA */
++ sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa));
++ sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
++ sa->sadb_sa_exttype = SADB_EXT_SA;
++ sa->sadb_sa_spi = x->id.spi;
++ sa->sadb_sa_replay = 0;
++ sa->sadb_sa_state = 0;
++ sa->sadb_sa_auth = 0;
++ sa->sadb_sa_encrypt = 0;
++ sa->sadb_sa_flags = 0;
++
++ /* ADDRESS_SRC (old addr) */
++ addr = (struct sadb_address*)
++ skb_put(skb, sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ addr->sadb_address_prefixlen = 32;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = x->props.saddr.a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr,
++ x->props.saddr.a6, sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* NAT_T_SPORT (old port) */
++ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
++ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
++ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
++ n_port->sadb_x_nat_t_port_port = natt->encap_sport;
++ n_port->sadb_x_nat_t_port_reserved = 0;
++
++ /* ADDRESS_DST (new addr) */
++ addr = (struct sadb_address*)
++ skb_put(skb, sizeof(struct sadb_address)+sockaddr_size);
++ addr->sadb_address_len =
++ (sizeof(struct sadb_address)+sockaddr_size)/
++ sizeof(uint64_t);
++ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
++ addr->sadb_address_proto = 0;
++ addr->sadb_address_reserved = 0;
++ if (x->props.family == AF_INET) {
++ addr->sadb_address_prefixlen = 32;
++
++ sin = (struct sockaddr_in *) (addr + 1);
++ sin->sin_family = AF_INET;
++ sin->sin_addr.s_addr = ipaddr->a4;
++ sin->sin_port = 0;
++ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
++ }
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (x->props.family == AF_INET6) {
++ addr->sadb_address_prefixlen = 128;
++
++ sin6 = (struct sockaddr_in6 *) (addr + 1);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = 0;
++ sin6->sin6_flowinfo = 0;
++ memcpy(&sin6->sin6_addr, &ipaddr->a6, sizeof(struct in6_addr));
++ sin6->sin6_scope_id = 0;
++ }
++#endif
++ else
++ BUG();
++
++ /* NAT_T_DPORT (new port) */
++ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
++ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
++ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
++ n_port->sadb_x_nat_t_port_port = sport;
++ n_port->sadb_x_nat_t_port_reserved = 0;
++
++ return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL);
++}
++
++static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, int len,
++ struct scm_cookie *scm)
++{
++ struct sock *sk = sock->sk;
++ struct sk_buff *skb = NULL;
++ struct sadb_msg *hdr = NULL;
++ int err;
++
++ err = -EOPNOTSUPP;
++ if (msg->msg_flags & MSG_OOB)
++ goto out;
++
++ err = -EMSGSIZE;
++ if ((unsigned)len > sk->sndbuf-32)
++ goto out;
++
++ err = -ENOBUFS;
++ skb = alloc_skb(len, GFP_KERNEL);
++ if (skb == NULL)
++ goto out;
++
++ err = -EFAULT;
++ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len))
++ goto out;
++
++ hdr = pfkey_get_base_msg(skb, &err);
++ if (!hdr)
++ goto out;
++
++ down(&xfrm_cfg_sem);
++ err = pfkey_process(sk, skb, hdr);
++ up(&xfrm_cfg_sem);
++
++out:
++ if (err && hdr && pfkey_error(hdr, err, sk) == 0)
++ err = 0;
++ if (skb)
++ kfree_skb(skb);
++
++ return err ? : len;
++}
++
++static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, int len,
++ int flags, struct scm_cookie *scm)
++{
++ struct sock *sk = sock->sk;
++ struct sk_buff *skb;
++ int copied, err;
++
++ err = -EINVAL;
++ if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC))
++ goto out;
++
++ msg->msg_namelen = 0;
++ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
++ if (skb == NULL)
++ goto out;
++
++ copied = skb->len;
++ if (copied > len) {
++ msg->msg_flags |= MSG_TRUNC;
++ copied = len;
++ }
++
++ skb->h.raw = skb->data;
++ err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
++ if (err)
++ goto out_free;
++
++ sock_recv_timestamp(msg, sk, skb);
++
++ err = (flags & MSG_TRUNC) ? skb->len : copied;
++
++out_free:
++ skb_free_datagram(sk, skb);
++out:
++ return err;
++}
++
++static struct proto_ops pfkey_ops = {
++ .family = PF_KEY,
++
++ /* Operations that make no sense on pfkey sockets. */
++ .bind = sock_no_bind,
++ .connect = sock_no_connect,
++ .socketpair = sock_no_socketpair,
++ .accept = sock_no_accept,
++ .getname = sock_no_getname,
++ .ioctl = sock_no_ioctl,
++ .listen = sock_no_listen,
++ .shutdown = sock_no_shutdown,
++ .setsockopt = sock_no_setsockopt,
++ .getsockopt = sock_no_getsockopt,
++ .mmap = sock_no_mmap,
++ .sendpage = sock_no_sendpage,
++
++ /* Now the operations that really occur. */
++ .release = pfkey_release,
++ .poll = datagram_poll,
++ .sendmsg = pfkey_sendmsg,
++ .recvmsg = pfkey_recvmsg,
++};
++
++static struct net_proto_family pfkey_family_ops = {
++ .family = PF_KEY,
++ .create = pfkey_create,
++};
++
++#ifdef CONFIG_PROC_FS
++static int pfkey_read_proc(char *buffer, char **start, off_t offset,
++ int length, int *eof, void *data)
++{
++ off_t pos = 0;
++ off_t begin = 0;
++ int len = 0;
++ struct sock *s;
++
++ len += sprintf(buffer,"sk RefCnt Rmem Wmem User Inode\n");
++
++ read_lock(&pfkey_table_lock);
++
++ for (s = pfkey_table; s; s = s->next) {
++ len += sprintf(buffer+len,"%p %-6d %-6u %-6u %-6u %-6lu",
++ s,
++ atomic_read(&s->refcnt),
++ atomic_read(&s->rmem_alloc),
++ atomic_read(&s->wmem_alloc),
++ sock_i_uid(s),
++ sock_i_ino(s)
++ );
++
++ buffer[len++] = '\n';
++
++ pos = begin + len;
++ if (pos < offset) {
++ len = 0;
++ begin = pos;
++ }
++ if(pos > offset + length)
++ goto done;
++ }
++ *eof = 1;
++
++done:
++ read_unlock(&pfkey_table_lock);
++
++ *start = buffer + (offset - begin);
++ len -= (offset - begin);
++
++ if (len > length)
++ len = length;
++ if (len < 0)
++ len = 0;
++
++ return len;
++}
++#endif
++
++static struct xfrm_mgr pfkeyv2_mgr =
++{
++ .id = "pfkeyv2",
++ .notify = pfkey_send_notify,
++ .acquire = pfkey_send_acquire,
++ .compile_policy = pfkey_compile_policy,
++ .new_mapping = pfkey_send_new_mapping,
++};
++
++static void __exit ipsec_pfkey_exit(void)
++{
++ xfrm_unregister_km(&pfkeyv2_mgr);
++ remove_proc_entry("net/pfkey", 0);
++ sock_unregister(PF_KEY);
++}
++
++static int __init ipsec_pfkey_init(void)
++{
++ sock_register(&pfkey_family_ops);
++#ifdef CONFIG_PROC_FS
++ create_proc_read_entry("net/pfkey", 0, 0, pfkey_read_proc, NULL);
++#endif
++ xfrm_register_km(&pfkeyv2_mgr);
++ return 0;
++}
++
++module_init(ipsec_pfkey_init);
++module_exit(ipsec_pfkey_exit);
++MODULE_LICENSE("GPL");
+diff -Nru a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+--- a/net/netlink/af_netlink.c 2005-02-13 21:25:09 +11:00
++++ b/net/netlink/af_netlink.c 2005-02-13 21:25:09 +11:00
+@@ -658,6 +658,7 @@
+ u32 pid;
+ u32 group;
+ int failure;
++ int delivered;
+ int allocation;
+ struct sk_buff *skb, *skb2;
+ };
+@@ -694,16 +695,18 @@
+ p->failure = 1;
+ } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
+ netlink_overrun(sk);
+- } else
++ } else {
++ p->delivered = 1;
+ p->skb2 = NULL;
++ }
+ sock_put(sk);
+
+ out:
+ return 0;
+ }
+
+-void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
+- u32 group, int allocation)
++int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
++ u32 group, int allocation)
+ {
+ struct netlink_broadcast_data info;
+ struct sock *sk;
+@@ -712,6 +715,7 @@
+ info.pid = pid;
+ info.group = group;
+ info.failure = 0;
++ info.delivered = 0;
+ info.allocation = allocation;
+ info.skb = skb;
+ info.skb2 = NULL;
+@@ -728,6 +732,12 @@
+ if (info.skb2)
+ kfree_skb(info.skb2);
+ kfree_skb(skb);
++
++ if (info.delivered)
++ return 0;
++ if (info.failure)
++ return -ENOBUFS;
++ return -ESRCH;
+ }
+
+ struct netlink_set_err_data {
+diff -Nru a/net/netsyms.c b/net/netsyms.c
+--- a/net/netsyms.c 2005-02-13 21:25:09 +11:00
++++ b/net/netsyms.c 2005-02-13 21:25:09 +11:00
+@@ -57,6 +57,12 @@
+ #include <linux/inet.h>
+ #include <linux/mroute.h>
+ #include <linux/igmp.h>
++#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE)
++#include <net/ah.h>
++#endif
++#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
++#include <net/esp.h>
++#endif
+
+ extern struct net_proto_family inet_family_ops;
+
+@@ -191,6 +197,7 @@
+ #endif
+ #ifdef CONFIG_SYSCTL
+ EXPORT_SYMBOL(neigh_sysctl_register);
++EXPORT_SYMBOL(neigh_sysctl_unregister);
+ #endif
+ EXPORT_SYMBOL(pneigh_lookup);
+ EXPORT_SYMBOL(pneigh_enqueue);
+@@ -287,6 +294,7 @@
+ EXPORT_SYMBOL(inetdev_by_index);
+ EXPORT_SYMBOL(in_dev_finish_destroy);
+ EXPORT_SYMBOL(ip_defrag);
++EXPORT_SYMBOL(inet_peer_idlock);
+ EXPORT_SYMBOL(ipfrag_flush);
+
+ /* Route manipulation */
+@@ -303,6 +311,14 @@
+ EXPORT_SYMBOL(dlci_ioctl_hook);
+ #endif
+
++#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
++EXPORT_SYMBOL_GPL(skb_cow_data);
++EXPORT_SYMBOL_GPL(pskb_put);
++EXPORT_SYMBOL_GPL(skb_to_sgvec);
++#endif
++
++EXPORT_SYMBOL(flow_cache_lookup);
++EXPORT_SYMBOL(flow_cache_genid);
+
+ #if defined (CONFIG_IPV6_MODULE) || defined (CONFIG_KHTTPD) || defined (CONFIG_KHTTPD_MODULE) || defined (CONFIG_IP_SCTP_MODULE)
+ /* inet functions common to v4 and v6 */
+@@ -417,8 +433,9 @@
+ EXPORT_SYMBOL(secure_ipv6_id);
+ #endif
+
+-#endif
++EXPORT_SYMBOL(ip_generic_getfrag);
+
++#endif
+ EXPORT_SYMBOL(tcp_read_sock);
+
+ #ifdef CONFIG_IP_SCTP_MODULE
+@@ -495,6 +512,7 @@
+ EXPORT_SYMBOL(loopback_dev);
+ EXPORT_SYMBOL(register_netdevice);
+ EXPORT_SYMBOL(unregister_netdevice);
++EXPORT_SYMBOL(synchronize_net);
+ EXPORT_SYMBOL(netdev_state_change);
+ EXPORT_SYMBOL(dev_new_index);
+ EXPORT_SYMBOL(dev_get_by_flags);
+diff -Nru a/net/sched/cls_route.c b/net/sched/cls_route.c
+--- a/net/sched/cls_route.c 2005-02-13 21:25:09 +11:00
++++ b/net/sched/cls_route.c 2005-02-13 21:25:09 +11:00
+@@ -154,7 +154,7 @@
+ if (head == NULL)
+ goto old_method;
+
+- iif = ((struct rtable*)dst)->key.iif;
++ iif = ((struct rtable*)dst)->fl.iif;
+
+ h = route4_fastmap_hash(id, iif);
+ if (id == head->fastmap[h].id &&
+diff -Nru a/net/sctp/input.c b/net/sctp/input.c
+--- a/net/sctp/input.c 2005-02-13 21:25:09 +11:00
++++ b/net/sctp/input.c 2005-02-13 21:25:09 +11:00
+@@ -58,6 +58,7 @@
+ #include <net/snmp.h>
+ #include <net/sock.h>
+ #include <linux/ipsec.h>
++#include <net/xfrm.h>
+ #include <net/sctp/sctp.h>
+ #include <net/sctp/sm.h>
+
+@@ -183,7 +184,7 @@
+ rcvr = asoc ? &asoc->base : &ep->base;
+ sk = rcvr->sk;
+
+- if (!ipsec_sk_policy(sk, skb))
++ if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
+ goto discard_release;
+
+ ret = sk_filter(sk, skb, 1);
+diff -Nru a/net/sctp/ipv6.c b/net/sctp/ipv6.c
+--- a/net/sctp/ipv6.c 2005-02-13 21:25:09 +11:00
++++ b/net/sctp/ipv6.c 2005-02-13 21:25:09 +11:00
+@@ -83,17 +83,6 @@
+ .notifier_call = sctp_inetaddr_event,
+ };
+
+-/* FIXME: This macro needs to be moved to a common header file. */
+-#define NIP6(addr) \
+- ntohs((addr)->s6_addr16[0]), \
+- ntohs((addr)->s6_addr16[1]), \
+- ntohs((addr)->s6_addr16[2]), \
+- ntohs((addr)->s6_addr16[3]), \
+- ntohs((addr)->s6_addr16[4]), \
+- ntohs((addr)->s6_addr16[5]), \
+- ntohs((addr)->s6_addr16[6]), \
+- ntohs((addr)->s6_addr16[7])
+-
+ /* ICMP error handler. */
+ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ int type, int code, int offset, __u32 info)
+@@ -174,12 +163,12 @@
+ /* Fill in the dest address from the route entry passed with the skb
+ * and the source address from the transport.
+ */
+- fl.fl6_dst = &transport->ipaddr.v6.sin6_addr;
+- fl.fl6_src = &transport->saddr.v6.sin6_addr;
++ ipv6_addr_copy(&fl.fl6_dst, &transport->ipaddr.v6.sin6_addr);
++ ipv6_addr_copy(&fl.fl6_src, &transport->saddr.v6.sin6_addr);
+
+ fl.fl6_flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+- if (ipv6_addr_type(fl.fl6_src) & IPV6_ADDR_LINKLOCAL)
++ if (ipv6_addr_type(&fl.fl6_src) & IPV6_ADDR_LINKLOCAL)
+ fl.oif = transport->saddr.v6.sin6_scope_id;
+ else
+ fl.oif = sk->bound_dev_if;
+@@ -188,7 +177,7 @@
+
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+- fl.fl6_dst = rt0->addr;
++ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ }
+
+ SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, "
+@@ -213,7 +202,7 @@
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+- fl.fl6_dst = &daddr->v6.sin6_addr;
++ ipv6_addr_copy(&fl.fl6_dst, &daddr->v6.sin6_addr);
+ if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ fl.oif = daddr->v6.sin6_scope_id;
+
+@@ -222,7 +211,7 @@
+ __FUNCTION__, NIP6(fl.fl6_dst));
+
+ if (saddr) {
+- fl.fl6_src = &saddr->v6.sin6_addr;
++ ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr);
+ SCTP_DEBUG_PRINTK(
+ "SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x - ",
+ NIP6(fl.fl6_src));
+@@ -235,7 +224,7 @@
+ SCTP_DEBUG_PRINTK(
+ "rt6_dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
+ "rt6_src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+- NIP6(&rt->rt6i_dst.addr), NIP6(&rt->rt6i_src.addr));
++ NIP6(rt->rt6i_dst.addr), NIP6(rt->rt6i_src.addr));
+ } else {
+ SCTP_DEBUG_PRINTK("NO ROUTE\n");
+ }
+@@ -284,13 +273,13 @@
+
+ SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p "
+ "daddr:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ",
+- __FUNCTION__, asoc, dst, NIP6(&daddr->v6.sin6_addr));
++ __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr));
+
+ if (!asoc) {
+ ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr);
+ SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: "
+ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+- NIP6(&saddr->v6.sin6_addr));
++ NIP6(saddr->v6.sin6_addr));
+ return;
+ }
+
+@@ -319,12 +308,12 @@
+ memcpy(saddr, baddr, sizeof(union sctp_addr));
+ SCTP_DEBUG_PRINTK("saddr: "
+ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+- NIP6(&saddr->v6.sin6_addr));
++ NIP6(saddr->v6.sin6_addr));
+ } else {
+ printk(KERN_ERR "%s: asoc:%p Could not find a valid source "
+ "address for the "
+ "dest:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+- __FUNCTION__, asoc, NIP6(&daddr->v6.sin6_addr));
++ __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr));
+ }
+
+ sctp_read_unlock(addr_lock);
+@@ -640,7 +629,7 @@
+ /* Init the ipv4 part of the socket since we can have sockets
+ * using v6 API for ipv4.
+ */
+- newinet->ttl = sysctl_ip_default_ttl;
++ newinet->uc_ttl = -1;
+ newinet->mc_loop = 1;
+ newinet->mc_ttl = 1;
+ newinet->mc_index = 0;
+@@ -689,7 +678,7 @@
+ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
+ {
+ seq_printf(seq, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ",
+- NIP6(&addr->v6.sin6_addr));
++ NIP6(addr->v6.sin6_addr));
+ }
+
+ /* Initialize a PF_INET6 socket msg_name. */
+@@ -923,14 +912,15 @@
+ .flags = SCTP_PROTOSW_FLAG,
+ };
+
++static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
++{
++ return sctp_rcv(*pskb) ? -1 : 0;
++}
++
+ static struct inet6_protocol sctpv6_protocol = {
+- .handler = sctp_rcv,
++ .handler = sctp6_rcv,
+ .err_handler = sctp_v6_err,
+- .next = NULL,
+- .protocol = IPPROTO_SCTP,
+- .copy = 0,
+- .data = NULL,
+- .name = "SCTPv6",
++ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+ };
+
+ static struct sctp_af sctp_ipv6_specific = {
+@@ -978,7 +968,8 @@
+ int sctp_v6_init(void)
+ {
+ /* Register inet6 protocol. */
+- inet6_add_protocol(&sctpv6_protocol);
++ if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0)
++ return -EAGAIN;
+
+ /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */
+ inet6_register_protosw(&sctpv6_seqpacket_protosw);
+@@ -1000,7 +991,7 @@
+ void sctp_v6_exit(void)
+ {
+ list_del(&sctp_ipv6_specific.list);
+- inet6_del_protocol(&sctpv6_protocol);
++ inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP);
+ inet6_unregister_protosw(&sctpv6_seqpacket_protosw);
+ inet6_unregister_protosw(&sctpv6_stream_protosw);
+ unregister_inet6addr_notifier(&sctp_inet6addr_notifier);
+diff -Nru a/net/sctp/protocol.c b/net/sctp/protocol.c
+--- a/net/sctp/protocol.c 2005-02-13 21:25:09 +11:00
++++ b/net/sctp/protocol.c 2005-02-13 21:25:09 +11:00
+@@ -433,7 +433,7 @@
+ union sctp_addr *saddr)
+ {
+ struct rtable *rt;
+- struct rt_key key;
++ struct flowi fl;
+ struct sctp_bind_addr *bp;
+ rwlock_t *addr_lock;
+ struct sctp_sockaddr_entry *laddr;
+@@ -441,21 +441,21 @@
+ struct dst_entry *dst = NULL;
+ union sctp_addr dst_saddr;
+
+- memset(&key, 0x0, sizeof(struct rt_key));
+- key.dst = daddr->v4.sin_addr.s_addr;
+-
++ memset(&fl, 0x0, sizeof(struct flowi));
++ fl.fl4_dst = daddr->v4.sin_addr.s_addr;
++ fl.proto = IPPROTO_SCTP;
+ if (asoc) {
+- key.tos = RT_CONN_FLAGS(asoc->base.sk);
+- key.oif = asoc->base.sk->bound_dev_if;
++ fl.fl4_tos = RT_CONN_FLAGS(asoc->base.sk);
++ fl.oif = asoc->base.sk->bound_dev_if;
+ }
+ if (saddr)
+- key.src = saddr->v4.sin_addr.s_addr;
++ fl.fl4_src = saddr->v4.sin_addr.s_addr;
+
+ SCTP_DEBUG_PRINTK("%s: DST:%u.%u.%u.%u, SRC:%u.%u.%u.%u - ",
+- __FUNCTION__, NIPQUAD(key.dst),
+- NIPQUAD(key.src));
++ __FUNCTION__, NIPQUAD(fl.fl4_dst),
++ NIPQUAD(fl.fl4_src));
+
+- if (!ip_route_output_key(&rt, &key)) {
++ if (!ip_route_output_key(&rt, &fl)) {
+ dst = &rt->u.dst;
+ }
+
+@@ -497,8 +497,8 @@
+ laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
+
+ if (AF_INET == laddr->a.sa.sa_family) {
+- key.src = laddr->a.v4.sin_addr.s_addr;
+- if (!ip_route_output_key(&rt, &key)) {
++ fl.fl4_src = laddr->a.v4.sin_addr.s_addr;
++ if (!ip_route_output_key(&rt, &fl)) {
+ dst = &rt->u.dst;
+ goto out_unlock;
+ }
+@@ -587,7 +587,7 @@
+ newinet->pmtudisc = inet->pmtudisc;
+ newinet->id = 0;
+
+- newinet->ttl = sysctl_ip_default_ttl;
++ newinet->uc_ttl = -1;
+ newinet->mc_loop = 1;
+ newinet->mc_ttl = 1;
+ newinet->mc_index = 0;
+@@ -656,7 +656,7 @@
+ return err;
+ }
+ sctp_ctl_socket->sk->allocation = GFP_ATOMIC;
+- inet_sk(sctp_ctl_socket->sk)->ttl = MAXTTL;
++ inet_sk(sctp_ctl_socket->sk)->uc_ttl = -1;
+
+ return 0;
+ }
+@@ -872,8 +872,7 @@
+ static struct inet_protocol sctp_protocol = {
+ .handler = sctp_rcv,
+ .err_handler = sctp_v4_err,
+- .protocol = IPPROTO_SCTP,
+- .name = "SCTP"
++ .no_policy = 1,
+ };
+
+ /* IPv4 address related functions. */
+@@ -960,7 +959,8 @@
+ return -EINVAL;
+
+ /* Add SCTP to inet_protos hash table. */
+- inet_add_protocol(&sctp_protocol);
++ if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0)
++ return -EAGAIN;
+
+ /* Add SCTP(TCP and UDP style) to inetsw linked list. */
+ inet_register_protosw(&sctp_seqpacket_protosw);
+@@ -1154,7 +1154,7 @@
+ err_init_mibs:
+ kmem_cache_destroy(sctp_chunk_cachep);
+ err_chunk_cachep:
+- inet_del_protocol(&sctp_protocol);
++ inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);
+ inet_unregister_protosw(&sctp_seqpacket_protosw);
+ inet_unregister_protosw(&sctp_stream_protosw);
+ return status;
+@@ -1194,7 +1194,7 @@
+ sctp_proc_exit();
+ cleanup_sctp_mibs();
+
+- inet_del_protocol(&sctp_protocol);
++ inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);
+ inet_unregister_protosw(&sctp_seqpacket_protosw);
+ inet_unregister_protosw(&sctp_stream_protosw);
+ }
+diff -Nru a/net/xfrm/Config.in b/net/xfrm/Config.in
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/Config.in 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,4 @@
++#
++# XFRM configuration
++#
++tristate ' IP: IPsec user configuration interface' CONFIG_XFRM_USER
+diff -Nru a/net/xfrm/Makefile b/net/xfrm/Makefile
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/Makefile 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,13 @@
++#
++# Makefile for the XFRM subsystem.
++#
++
++O_TARGET := xfrm.o
++
++export-objs = xfrm_export.o
++
++obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_input.o xfrm_algo.o \
++ xfrm_export.o
++obj-$(CONFIG_XFRM_USER) += xfrm_user.o
++
++include $(TOPDIR)/Rules.make
+diff -Nru a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_algo.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,729 @@
++/*
++ * xfrm algorithm interface
++ *
++ * Copyright (c) 2002 James Morris <jmorris at intercode.com.au>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2 of the License, or (at your option)
++ * any later version.
++ */
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/pfkeyv2.h>
++#include <net/xfrm.h>
++#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE)
++#include <net/ah.h>
++#endif
++#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
++#include <net/esp.h>
++#endif
++#include <asm/scatterlist.h>
++
++/*
++ * Algorithms supported by IPsec. These entries contain properties which
++ * are used in key negotiation and xfrm processing, and are used to verify
++ * that instantiated crypto transforms have correct parameters for IPsec
++ * purposes.
++ */
++static struct xfrm_algo_desc aalg_list[] = {
++{
++ .name = "digest_null",
++
++ .uinfo = {
++ .auth = {
++ .icv_truncbits = 0,
++ .icv_fullbits = 0,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_AALG_NULL,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 0,
++ .sadb_alg_maxbits = 0
++ }
++},
++{
++ .name = "md5",
++
++ .uinfo = {
++ .auth = {
++ .icv_truncbits = 96,
++ .icv_fullbits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_AALG_MD5HMAC,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 128,
++ .sadb_alg_maxbits = 128
++ }
++},
++{
++ .name = "sha1",
++
++ .uinfo = {
++ .auth = {
++ .icv_truncbits = 96,
++ .icv_fullbits = 160,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_AALG_SHA1HMAC,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 160,
++ .sadb_alg_maxbits = 160
++ }
++},
++{
++ .name = "sha256",
++
++ .uinfo = {
++ .auth = {
++ .icv_truncbits = 96,
++ .icv_fullbits = 256,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 256,
++ .sadb_alg_maxbits = 256
++ }
++},
++{
++ .name = "ripemd160",
++
++ .uinfo = {
++ .auth = {
++ .icv_truncbits = 96,
++ .icv_fullbits = 160,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 160,
++ .sadb_alg_maxbits = 160
++ }
++},
++};
++
++static struct xfrm_algo_desc ealg_list[] = {
++{
++ .name = "cipher_null",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 8,
++ .defkeybits = 0,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_EALG_NULL,
++ .sadb_alg_ivlen = 0,
++ .sadb_alg_minbits = 0,
++ .sadb_alg_maxbits = 0
++ }
++},
++{
++ .name = "des",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 64,
++ .defkeybits = 64,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_EALG_DESCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 64,
++ .sadb_alg_maxbits = 64
++ }
++},
++{
++ .name = "des3_ede",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 64,
++ .defkeybits = 192,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_EALG_3DESCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 192,
++ .sadb_alg_maxbits = 192
++ }
++},
++{
++ .name = "cast128",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 64,
++ .defkeybits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_EALG_CASTCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 40,
++ .sadb_alg_maxbits = 128
++ }
++},
++{
++ .name = "blowfish",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 64,
++ .defkeybits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 40,
++ .sadb_alg_maxbits = 448
++ }
++},
++{
++ .name = "aes",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 128,
++ .defkeybits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_EALG_AESCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 128,
++ .sadb_alg_maxbits = 256
++ }
++},
++{
++ .name = "serpent",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 128,
++ .defkeybits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_EALG_SERPENTCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 128,
++ .sadb_alg_maxbits = 256,
++ }
++},
++{
++ .name = "twofish",
++
++ .uinfo = {
++ .encr = {
++ .blockbits = 128,
++ .defkeybits = 128,
++ }
++ },
++
++ .desc = {
++ .sadb_alg_id = SADB_X_EALG_TWOFISHCBC,
++ .sadb_alg_ivlen = 8,
++ .sadb_alg_minbits = 128,
++ .sadb_alg_maxbits = 256
++ }
++},
++};
++
++static struct xfrm_algo_desc calg_list[] = {
++{
++ .name = "deflate",
++ .uinfo = {
++ .comp = {
++ .threshold = 90,
++ }
++ },
++ .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE }
++},
++{
++ .name = "lzs",
++ .uinfo = {
++ .comp = {
++ .threshold = 90,
++ }
++ },
++ .desc = { .sadb_alg_id = SADB_X_CALG_LZS }
++},
++{
++ .name = "lzjh",
++ .uinfo = {
++ .comp = {
++ .threshold = 50,
++ }
++ },
++ .desc = { .sadb_alg_id = SADB_X_CALG_LZJH }
++},
++};
++
++static inline int aalg_entries(void)
++{
++ return sizeof(aalg_list) / sizeof(aalg_list[0]);
++}
++
++static inline int ealg_entries(void)
++{
++ return sizeof(ealg_list) / sizeof(ealg_list[0]);
++}
++
++static inline int calg_entries(void)
++{
++ return sizeof(calg_list) / sizeof(calg_list[0]);
++}
++
++/* Todo: generic iterators */
++struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id)
++{
++ int i;
++
++ for (i = 0; i < aalg_entries(); i++) {
++ if (aalg_list[i].desc.sadb_alg_id == alg_id) {
++ if (aalg_list[i].available)
++ return &aalg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id)
++{
++ int i;
++
++ for (i = 0; i < ealg_entries(); i++) {
++ if (ealg_list[i].desc.sadb_alg_id == alg_id) {
++ if (ealg_list[i].available)
++ return &ealg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id)
++{
++ int i;
++
++ for (i = 0; i < calg_entries(); i++) {
++ if (calg_list[i].desc.sadb_alg_id == alg_id) {
++ if (calg_list[i].available)
++ return &calg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name)
++{
++ int i;
++
++ if (!name)
++ return NULL;
++
++ for (i=0; i < aalg_entries(); i++) {
++ if (strcmp(name, aalg_list[i].name) == 0) {
++ if (aalg_list[i].available)
++ return &aalg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name)
++{
++ int i;
++
++ if (!name)
++ return NULL;
++
++ for (i=0; i < ealg_entries(); i++) {
++ if (strcmp(name, ealg_list[i].name) == 0) {
++ if (ealg_list[i].available)
++ return &ealg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_calg_get_byname(char *name)
++{
++ int i;
++
++ if (!name)
++ return NULL;
++
++ for (i=0; i < calg_entries(); i++) {
++ if (strcmp(name, calg_list[i].name) == 0) {
++ if (calg_list[i].available)
++ return &calg_list[i];
++ else
++ break;
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx)
++{
++ if (idx >= aalg_entries())
++ return NULL;
++
++ return &aalg_list[idx];
++}
++
++struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx)
++{
++ if (idx >= ealg_entries())
++ return NULL;
++
++ return &ealg_list[idx];
++}
++
++struct xfrm_algo_desc *xfrm_calg_get_byidx(unsigned int idx)
++{
++ if (idx >= calg_entries())
++ return NULL;
++
++ return &calg_list[idx];
++}
++
++/*
++ * Probe for the availability of crypto algorithms, and set the available
++ * flag for any algorithms found on the system. This is typically called by
++ * pfkey during userspace SA add, update or register.
++ */
++void xfrm_probe_algs(void)
++{
++#ifdef CONFIG_CRYPTO
++ int i, status;
++
++ BUG_ON(in_softirq());
++
++ for (i = 0; i < aalg_entries(); i++) {
++ status = crypto_alg_available(aalg_list[i].name, 0);
++ if (aalg_list[i].available != status)
++ aalg_list[i].available = status;
++ }
++
++ for (i = 0; i < ealg_entries(); i++) {
++ status = crypto_alg_available(ealg_list[i].name, 0);
++ if (ealg_list[i].available != status)
++ ealg_list[i].available = status;
++ }
++
++ for (i = 0; i < calg_entries(); i++) {
++ status = crypto_alg_available(calg_list[i].name, 0);
++ if (calg_list[i].available != status)
++ calg_list[i].available = status;
++ }
++#endif
++}
++
++int xfrm_count_auth_supported(void)
++{
++ int i, n;
++
++ for (i = 0, n = 0; i < aalg_entries(); i++)
++ if (aalg_list[i].available)
++ n++;
++ return n;
++}
++
++int xfrm_count_enc_supported(void)
++{
++ int i, n;
++
++ for (i = 0, n = 0; i < ealg_entries(); i++)
++ if (ealg_list[i].available)
++ n++;
++ return n;
++}
++
++/* Move to common area: it is shared with AH. */
++
++void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm,
++ int offset, int len, icv_update_fn_t icv_update)
++{
++ int start = skb->len - skb->data_len;
++ int i, copy = start - offset;
++ struct scatterlist sg;
++
++ /* Checksum header. */
++ if (copy > 0) {
++ if (copy > len)
++ copy = len;
++
++ sg.page = virt_to_page(skb->data + offset);
++ sg.offset = (unsigned long)(skb->data + offset) % PAGE_SIZE;
++ sg.length = copy;
++
++ icv_update(tfm, &sg, 1);
++
++ if ((len -= copy) == 0)
++ return;
++ offset += copy;
++ }
++
++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
++ int end;
++
++ BUG_TRAP(start <= offset + len);
++
++ end = start + skb_shinfo(skb)->frags[i].size;
++ if ((copy = end - offset) > 0) {
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
++
++ if (copy > len)
++ copy = len;
++
++ sg.page = frag->page;
++ sg.offset = frag->page_offset + offset-start;
++ sg.length = copy;
++
++ icv_update(tfm, &sg, 1);
++
++ if (!(len -= copy))
++ return;
++ offset += copy;
++ }
++ start = end;
++ }
++
++ if (skb_shinfo(skb)->frag_list) {
++ struct sk_buff *list = skb_shinfo(skb)->frag_list;
++
++ for (; list; list = list->next) {
++ int end;
++
++ BUG_TRAP(start <= offset + len);
++
++ end = start + list->len;
++ if ((copy = end - offset) > 0) {
++ if (copy > len)
++ copy = len;
++ skb_icv_walk(list, tfm, offset-start, copy, icv_update);
++ if ((len -= copy) == 0)
++ return;
++ offset += copy;
++ }
++ start = end;
++ }
++ }
++ if (len)
++ BUG();
++}
++
++#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
++
++/* Looking generic it is not used in another places. */
++
++int
++skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
++{
++ int start = skb->len - skb->data_len;
++ int i, copy = start - offset;
++ int elt = 0;
++
++ if (copy > 0) {
++ if (copy > len)
++ copy = len;
++ sg[elt].page = virt_to_page(skb->data + offset);
++ sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE;
++ sg[elt].length = copy;
++ elt++;
++ if ((len -= copy) == 0)
++ return elt;
++ offset += copy;
++ }
++
++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
++ int end;
++
++ BUG_TRAP(start <= offset + len);
++
++ end = start + skb_shinfo(skb)->frags[i].size;
++ if ((copy = end - offset) > 0) {
++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
++
++ if (copy > len)
++ copy = len;
++ sg[elt].page = frag->page;
++ sg[elt].offset = frag->page_offset+offset-start;
++ sg[elt].length = copy;
++ elt++;
++ if (!(len -= copy))
++ return elt;
++ offset += copy;
++ }
++ start = end;
++ }
++
++ if (skb_shinfo(skb)->frag_list) {
++ struct sk_buff *list = skb_shinfo(skb)->frag_list;
++
++ for (; list; list = list->next) {
++ int end;
++
++ BUG_TRAP(start <= offset + len);
++
++ end = start + list->len;
++ if ((copy = end - offset) > 0) {
++ if (copy > len)
++ copy = len;
++ elt += skb_to_sgvec(list, sg+elt, offset - start, copy);
++ if ((len -= copy) == 0)
++ return elt;
++ offset += copy;
++ }
++ start = end;
++ }
++ }
++ if (len)
++ BUG();
++ return elt;
++}
++
++/* Check that skb data bits are writable. If they are not, copy data
++ * to newly created private area. If "tailbits" is given, make sure that
++ * tailbits bytes beyond current end of skb are writable.
++ *
++ * Returns amount of elements of scatterlist to load for subsequent
++ * transformations and pointer to writable trailer skb.
++ */
++
++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
++{
++ int copyflag;
++ int elt;
++ struct sk_buff *skb1, **skb_p;
++
++ /* If skb is cloned or its head is paged, reallocate
++ * head pulling out all the pages (pages are considered not writable
++ * at the moment even if they are anonymous).
++ */
++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
++ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
++ return -ENOMEM;
++
++ /* Easy case. Most of packets will go this way. */
++ if (!skb_shinfo(skb)->frag_list) {
++ /* A little of trouble, not enough of space for trailer.
++ * This should not happen, when stack is tuned to generate
++ * good frames. OK, on miss we reallocate and reserve even more
++ * space, 128 bytes is fair. */
++
++ if (skb_tailroom(skb) < tailbits &&
++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
++ return -ENOMEM;
++
++ /* Voila! */
++ *trailer = skb;
++ return 1;
++ }
++
++ /* Misery. We are in troubles, going to mincer fragments... */
++
++ elt = 1;
++ skb_p = &skb_shinfo(skb)->frag_list;
++ copyflag = 0;
++
++ while ((skb1 = *skb_p) != NULL) {
++ int ntail = 0;
++
++ /* The fragment is partially pulled by someone,
++ * this can happen on input. Copy it and everything
++ * after it. */
++
++ if (skb_shared(skb1))
++ copyflag = 1;
++
++ /* If the skb is the last, worry about trailer. */
++
++ if (skb1->next == NULL && tailbits) {
++ if (skb_shinfo(skb1)->nr_frags ||
++ skb_shinfo(skb1)->frag_list ||
++ skb_tailroom(skb1) < tailbits)
++ ntail = tailbits + 128;
++ }
++
++ if (copyflag ||
++ skb_cloned(skb1) ||
++ ntail ||
++ skb_shinfo(skb1)->nr_frags ||
++ skb_shinfo(skb1)->frag_list) {
++ struct sk_buff *skb2;
++
++ /* Fuck, we are miserable poor guys... */
++ if (ntail == 0)
++ skb2 = skb_copy(skb1, GFP_ATOMIC);
++ else
++ skb2 = skb_copy_expand(skb1,
++ skb_headroom(skb1),
++ ntail,
++ GFP_ATOMIC);
++ if (unlikely(skb2 == NULL))
++ return -ENOMEM;
++
++ if (skb1->sk)
++ skb_set_owner_w(skb, skb1->sk);
++
++ /* Looking around. Are we still alive?
++ * OK, link new skb, drop old one */
++
++ skb2->next = skb1->next;
++ *skb_p = skb2;
++ kfree_skb(skb1);
++ skb1 = skb2;
++ }
++ elt++;
++ *trailer = skb1;
++ skb_p = &skb1->next;
++ }
++
++ return elt;
++}
++
++void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
++{
++ if (tail != skb) {
++ skb->data_len += len;
++ skb->len += len;
++ }
++ return skb_put(tail, len);
++}
++#endif
+diff -Nru a/net/xfrm/xfrm_export.c b/net/xfrm/xfrm_export.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_export.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,63 @@
++#include <linux/module.h>
++#include <net/xfrm.h>
++
++EXPORT_SYMBOL(xfrm_user_policy);
++EXPORT_SYMBOL(km_waitq);
++EXPORT_SYMBOL(km_new_mapping);
++EXPORT_SYMBOL(xfrm_cfg_sem);
++EXPORT_SYMBOL(xfrm_policy_alloc);
++EXPORT_SYMBOL(__xfrm_policy_destroy);
++EXPORT_SYMBOL(xfrm_lookup);
++EXPORT_SYMBOL(__xfrm_policy_check);
++EXPORT_SYMBOL(__xfrm_route_forward);
++EXPORT_SYMBOL(xfrm_state_alloc);
++EXPORT_SYMBOL(__xfrm_state_destroy);
++EXPORT_SYMBOL(xfrm_state_insert);
++EXPORT_SYMBOL(xfrm_state_add);
++EXPORT_SYMBOL(xfrm_state_update);
++EXPORT_SYMBOL(xfrm_state_check_expire);
++EXPORT_SYMBOL(xfrm_state_check);
++EXPORT_SYMBOL(xfrm_state_lookup);
++EXPORT_SYMBOL(xfrm_state_register_afinfo);
++EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
++EXPORT_SYMBOL(xfrm_state_delete_tunnel);
++EXPORT_SYMBOL(xfrm_replay_check);
++EXPORT_SYMBOL(xfrm_replay_advance);
++EXPORT_SYMBOL(__secpath_destroy);
++EXPORT_SYMBOL(secpath_dup);
++EXPORT_SYMBOL(xfrm_get_acqseq);
++EXPORT_SYMBOL(xfrm_parse_spi);
++EXPORT_SYMBOL(xfrm_register_type);
++EXPORT_SYMBOL(xfrm_unregister_type);
++EXPORT_SYMBOL(xfrm_get_type);
++EXPORT_SYMBOL(xfrm_register_km);
++EXPORT_SYMBOL(xfrm_unregister_km);
++EXPORT_SYMBOL(xfrm_state_delete);
++EXPORT_SYMBOL(xfrm_state_walk);
++EXPORT_SYMBOL(xfrm_find_acq_byseq);
++EXPORT_SYMBOL(xfrm_find_acq);
++EXPORT_SYMBOL(xfrm_alloc_spi);
++EXPORT_SYMBOL(xfrm_state_flush);
++EXPORT_SYMBOL(xfrm_policy_bysel);
++EXPORT_SYMBOL(xfrm_policy_insert);
++EXPORT_SYMBOL(xfrm_policy_walk);
++EXPORT_SYMBOL(xfrm_policy_flush);
++EXPORT_SYMBOL(xfrm_policy_byid);
++EXPORT_SYMBOL(xfrm_policy_list);
++EXPORT_SYMBOL(xfrm_dst_lookup);
++EXPORT_SYMBOL(xfrm_policy_register_afinfo);
++EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
++
++EXPORT_SYMBOL_GPL(xfrm_probe_algs);
++EXPORT_SYMBOL_GPL(xfrm_count_auth_supported);
++EXPORT_SYMBOL_GPL(xfrm_count_enc_supported);
++EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx);
++EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx);
++EXPORT_SYMBOL_GPL(xfrm_calg_get_byidx);
++EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid);
++EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid);
++EXPORT_SYMBOL_GPL(xfrm_calg_get_byid);
++EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname);
++EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname);
++EXPORT_SYMBOL_GPL(xfrm_calg_get_byname);
++EXPORT_SYMBOL_GPL(skb_icv_walk);
+diff -Nru a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_input.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,85 @@
++/*
++ * xfrm_input.c
++ *
++ * Changes:
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific portion
++ *
++ */
++
++#include <linux/slab.h>
++#include <net/ip.h>
++#include <net/xfrm.h>
++
++static kmem_cache_t *secpath_cachep;
++
++void __secpath_destroy(struct sec_path *sp)
++{
++ int i;
++ for (i = 0; i < sp->len; i++)
++ xfrm_state_put(sp->x[i].xvec);
++ kmem_cache_free(secpath_cachep, sp);
++}
++
++struct sec_path *secpath_dup(struct sec_path *src)
++{
++ struct sec_path *sp;
++
++ sp = kmem_cache_alloc(secpath_cachep, SLAB_ATOMIC);
++ if (!sp)
++ return NULL;
++
++ sp->len = 0;
++ if (src) {
++ int i;
++
++ memcpy(sp, src, sizeof(*sp));
++ for (i = 0; i < sp->len; i++)
++ xfrm_state_hold(sp->x[i].xvec);
++ }
++ atomic_set(&sp->refcnt, 1);
++ return sp;
++}
++
++/* Fetch spi and seq from ipsec header */
++
++int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
++{
++ int offset, offset_seq;
++
++ switch (nexthdr) {
++ case IPPROTO_AH:
++ offset = offsetof(struct ip_auth_hdr, spi);
++ offset_seq = offsetof(struct ip_auth_hdr, seq_no);
++ break;
++ case IPPROTO_ESP:
++ offset = offsetof(struct ip_esp_hdr, spi);
++ offset_seq = offsetof(struct ip_esp_hdr, seq_no);
++ break;
++ case IPPROTO_COMP:
++ if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr)))
++ return -EINVAL;
++ *spi = ntohl(ntohs(*(u16*)(skb->h.raw + 2)));
++ *seq = 0;
++ return 0;
++ default:
++ return 1;
++ }
++
++ if (!pskb_may_pull(skb, 16))
++ return -EINVAL;
++
++ *spi = *(u32*)(skb->h.raw + offset);
++ *seq = *(u32*)(skb->h.raw + offset_seq);
++ return 0;
++}
++
++void __init xfrm_input_init(void)
++{
++ secpath_cachep = kmem_cache_create("secpath_cache",
++ sizeof(struct sec_path),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++ if (!secpath_cachep)
++ panic("XFRM: failed to allocate secpath_cache\n");
++}
+diff -Nru a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_output.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,46 @@
++/*
++ * generic xfrm output routines
++ *
++ * Copyright (c) 2003 James Morris <jmorris at intercode.com.au>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2 of the License, or (at your option)
++ * any later version.
++ */
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <net/xfrm.h>
++
++int xfrm_check_output(struct xfrm_state *x,
++ struct sk_buff *skb, unsigned short family)
++{
++ int err;
++
++ err = xfrm_state_check_expire(x);
++ if (err)
++ goto out;
++
++ if (x->props.mode) {
++ switch (family) {
++ case AF_INET:
++ err = xfrm4_tunnel_check_size(skb);
++ break;
++
++ case AF_INET6:
++ err = xfrm6_tunnel_check_size(skb);
++ break;
++
++ default:
++ err = -EINVAL;
++ }
++
++ if (err)
++ goto out;
++ }
++
++ err = xfrm_state_check_space(x, skb);
++out:
++ return err;
++}
+diff -Nru a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_policy.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,1259 @@
++/*
++ * xfrm_policy.c
++ *
++ * Changes:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * IPv6 support
++ * Kazunori MIYAZAWA @USAGI
++ * YOSHIFUJI Hideaki
++ * Split up af-specific portion
++ * Derek Atkins <derek at ihtfp.com> Add the post_input processor
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/slab.h>
++#include <linux/kmod.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/tqueue.h>
++#include <linux/notifier.h>
++#include <linux/netdevice.h>
++#include <net/xfrm.h>
++#include <net/ip.h>
++
++DECLARE_MUTEX(xfrm_cfg_sem);
++
++static rwlock_t xfrm_policy_lock = RW_LOCK_UNLOCKED;
++
++struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
++
++static rwlock_t xfrm_policy_afinfo_lock = RW_LOCK_UNLOCKED;
++static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
++
++kmem_cache_t *xfrm_dst_cache;
++
++static struct tq_struct xfrm_policy_gc_work;
++static struct list_head xfrm_policy_gc_list =
++ LIST_HEAD_INIT(xfrm_policy_gc_list);
++static spinlock_t xfrm_policy_gc_lock = SPIN_LOCK_UNLOCKED;
++
++static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
++static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
++
++int xfrm_register_type(struct xfrm_type *type, unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++ struct xfrm_type_map *typemap;
++ int err = 0;
++
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++ typemap = afinfo->type_map;
++
++ write_lock(&typemap->lock);
++ if (likely(typemap->map[type->proto] == NULL))
++ typemap->map[type->proto] = type;
++ else
++ err = -EEXIST;
++ write_unlock(&typemap->lock);
++ xfrm_policy_put_afinfo(afinfo);
++ return err;
++}
++
++int xfrm_unregister_type(struct xfrm_type *type, unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++ struct xfrm_type_map *typemap;
++ int err = 0;
++
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++ typemap = afinfo->type_map;
++
++ write_lock(&typemap->lock);
++ if (unlikely(typemap->map[type->proto] != type))
++ err = -ENOENT;
++ else
++ typemap->map[type->proto] = NULL;
++ write_unlock(&typemap->lock);
++ xfrm_policy_put_afinfo(afinfo);
++ return err;
++}
++
++struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo;
++ struct xfrm_type_map *typemap;
++ struct xfrm_type *type;
++ int modload_attempted = 0;
++
++retry:
++ afinfo = xfrm_policy_get_afinfo(family);
++ if (unlikely(afinfo == NULL))
++ return NULL;
++ typemap = afinfo->type_map;
++
++ read_lock(&typemap->lock);
++ type = typemap->map[proto];
++ if (type && type->owner)
++ __MOD_INC_USE_COUNT(type->owner);
++ read_unlock(&typemap->lock);
++ if (!type && !modload_attempted) {
++ char module_name[36];
++
++ xfrm_policy_put_afinfo(afinfo);
++ sprintf(module_name, "xfrm-type-%d-%d",
++ (int) family, (int) proto);
++ request_module(module_name);
++ modload_attempted = 1;
++ goto retry;
++ }
++
++ xfrm_policy_put_afinfo(afinfo);
++ return type;
++}
++
++int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
++ unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++ int err = 0;
++
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++
++ if (likely(afinfo->dst_lookup != NULL))
++ err = afinfo->dst_lookup(dst, fl);
++ else
++ err = -EINVAL;
++ xfrm_policy_put_afinfo(afinfo);
++ return err;
++}
++
++void xfrm_put_type(struct xfrm_type *type)
++{
++ if (type->owner)
++ __MOD_DEC_USE_COUNT(type->owner);
++}
++
++static inline unsigned long make_jiffies(long secs)
++{
++ if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
++ return MAX_SCHEDULE_TIMEOUT-1;
++ else
++ return secs*HZ;
++}
++
++static void xfrm_policy_timer(unsigned long data)
++{
++ struct xfrm_policy *xp = (struct xfrm_policy*)data;
++ unsigned long now = (unsigned long)xtime.tv_sec;
++ long next = LONG_MAX;
++ int warn = 0;
++ int dir;
++
++ read_lock(&xp->lock);
++
++ if (xp->dead)
++ goto out;
++
++ dir = xp->index & 7;
++
++ if (xp->lft.hard_add_expires_seconds) {
++ long tmo = xp->lft.hard_add_expires_seconds +
++ xp->curlft.add_time - now;
++ if (tmo <= 0)
++ goto expired;
++ if (tmo < next)
++ next = tmo;
++ }
++ if (xp->lft.hard_use_expires_seconds) {
++ long tmo = xp->lft.hard_use_expires_seconds +
++ (xp->curlft.use_time ? : xp->curlft.add_time) - now;
++ if (tmo <= 0)
++ goto expired;
++ if (tmo < next)
++ next = tmo;
++ }
++ if (xp->lft.soft_add_expires_seconds) {
++ long tmo = xp->lft.soft_add_expires_seconds +
++ xp->curlft.add_time - now;
++ if (tmo <= 0) {
++ warn = 1;
++ tmo = XFRM_KM_TIMEOUT;
++ }
++ if (tmo < next)
++ next = tmo;
++ }
++ if (xp->lft.soft_use_expires_seconds) {
++ long tmo = xp->lft.soft_use_expires_seconds +
++ (xp->curlft.use_time ? : xp->curlft.add_time) - now;
++ if (tmo <= 0) {
++ warn = 1;
++ tmo = XFRM_KM_TIMEOUT;
++ }
++ if (tmo < next)
++ next = tmo;
++ }
++
++ if (warn)
++ km_policy_expired(xp, dir, 0);
++ if (next != LONG_MAX &&
++ !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
++ xfrm_pol_hold(xp);
++
++out:
++ read_unlock(&xp->lock);
++ xfrm_pol_put(xp);
++ return;
++
++expired:
++ read_unlock(&xp->lock);
++ km_policy_expired(xp, dir, 1);
++ xfrm_policy_delete(xp, dir);
++ xfrm_pol_put(xp);
++}
++
++
++/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
++ * SPD calls.
++ */
++
++struct xfrm_policy *xfrm_policy_alloc(int gfp)
++{
++ struct xfrm_policy *policy;
++
++ policy = kmalloc(sizeof(struct xfrm_policy), gfp);
++
++ if (policy) {
++ memset(policy, 0, sizeof(struct xfrm_policy));
++ atomic_set(&policy->refcnt, 1);
++ policy->lock = RW_LOCK_UNLOCKED;
++ init_timer(&policy->timer);
++ policy->timer.data = (unsigned long)policy;
++ policy->timer.function = xfrm_policy_timer;
++ }
++ return policy;
++}
++
++/* Destroy xfrm_policy: descendant resources must be released to this moment. */
++
++void __xfrm_policy_destroy(struct xfrm_policy *policy)
++{
++ if (!policy->dead)
++ BUG();
++
++ if (policy->bundles)
++ BUG();
++
++ if (del_timer(&policy->timer))
++ BUG();
++
++ kfree(policy);
++}
++
++static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
++{
++ struct dst_entry *dst;
++
++ while ((dst = policy->bundles) != NULL) {
++ policy->bundles = dst->next;
++ dst_free(dst);
++ }
++
++ if (del_timer(&policy->timer))
++ atomic_dec(&policy->refcnt);
++
++ if (atomic_read(&policy->refcnt) > 1)
++ flow_cache_flush();
++
++ xfrm_pol_put(policy);
++}
++
++static void xfrm_policy_gc_task(void *data)
++{
++ struct xfrm_policy *policy;
++ struct list_head *entry, *tmp;
++ struct list_head gc_list = LIST_HEAD_INIT(gc_list);
++
++ spin_lock_bh(&xfrm_policy_gc_lock);
++ list_splice_init(&xfrm_policy_gc_list, &gc_list);
++ spin_unlock_bh(&xfrm_policy_gc_lock);
++
++ list_for_each_safe(entry, tmp, &gc_list) {
++ policy = list_entry(entry, struct xfrm_policy, list);
++ xfrm_policy_gc_kill(policy);
++ }
++}
++
++/* Rule must be locked. Release descentant resources, announce
++ * entry dead. The rule must be unlinked from lists to the moment.
++ */
++
++static void xfrm_policy_kill(struct xfrm_policy *policy)
++{
++ write_lock_bh(&policy->lock);
++ if (policy->dead)
++ goto out;
++
++ policy->dead = 1;
++
++ spin_lock(&xfrm_policy_gc_lock);
++ list_add(&policy->list, &xfrm_policy_gc_list);
++ spin_unlock(&xfrm_policy_gc_lock);
++ schedule_task(&xfrm_policy_gc_work);
++
++out:
++ write_unlock_bh(&policy->lock);
++}
++
++/* Generate new index... KAME seems to generate them ordered by cost
++ * of an absolute inpredictability of ordering of rules. This will not pass. */
++static u32 xfrm_gen_index(int dir)
++{
++ u32 idx;
++ struct xfrm_policy *p;
++ static u32 idx_generator;
++
++ for (;;) {
++ idx = (idx_generator | dir);
++ idx_generator += 8;
++ if (idx == 0)
++ idx = 8;
++ for (p = xfrm_policy_list[dir]; p; p = p->next) {
++ if (p->index == idx)
++ break;
++ }
++ if (!p)
++ return idx;
++ }
++}
++
++int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
++{
++ struct xfrm_policy *pol, **p;
++ struct xfrm_policy *delpol = NULL;
++ struct xfrm_policy **newpos = NULL;
++
++ write_lock_bh(&xfrm_policy_lock);
++ for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
++ if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
++ if (excl) {
++ write_unlock_bh(&xfrm_policy_lock);
++ return -EEXIST;
++ }
++ *p = pol->next;
++ delpol = pol;
++ if (policy->priority > pol->priority)
++ continue;
++ } else if (policy->priority >= pol->priority) {
++ p = &pol->next;
++ continue;
++ }
++ if (!newpos)
++ newpos = p;
++ if (delpol)
++ break;
++ p = &pol->next;
++ }
++ if (newpos)
++ p = newpos;
++ xfrm_pol_hold(policy);
++ policy->next = *p;
++ *p = policy;
++ atomic_inc(&flow_cache_genid);
++ policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
++ policy->curlft.add_time = (unsigned long)xtime.tv_sec;
++ policy->curlft.use_time = 0;
++ if (!mod_timer(&policy->timer, jiffies + HZ))
++ xfrm_pol_hold(policy);
++ write_unlock_bh(&xfrm_policy_lock);
++
++ if (delpol) {
++ xfrm_policy_kill(delpol);
++ }
++ wake_up(&km_waitq);
++ return 0;
++}
++
++struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
++ int delete)
++{
++ struct xfrm_policy *pol, **p;
++
++ write_lock_bh(&xfrm_policy_lock);
++ for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
++ if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
++ xfrm_pol_hold(pol);
++ if (delete)
++ *p = pol->next;
++ break;
++ }
++ }
++ write_unlock_bh(&xfrm_policy_lock);
++
++ if (pol && delete) {
++ atomic_inc(&flow_cache_genid);
++ xfrm_policy_kill(pol);
++ wake_up(&km_waitq);
++ }
++ return pol;
++}
++
++struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
++{
++ struct xfrm_policy *pol, **p;
++
++ write_lock_bh(&xfrm_policy_lock);
++ for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) {
++ if (pol->index == id) {
++ xfrm_pol_hold(pol);
++ if (delete)
++ *p = pol->next;
++ break;
++ }
++ }
++ write_unlock_bh(&xfrm_policy_lock);
++
++ if (pol && delete) {
++ atomic_inc(&flow_cache_genid);
++ xfrm_policy_kill(pol);
++ wake_up(&km_waitq);
++ }
++ return pol;
++}
++
++void xfrm_policy_flush()
++{
++ struct xfrm_policy *xp;
++ int dir;
++
++ write_lock_bh(&xfrm_policy_lock);
++ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
++ while ((xp = xfrm_policy_list[dir]) != NULL) {
++ xfrm_policy_list[dir] = xp->next;
++ write_unlock_bh(&xfrm_policy_lock);
++
++ xfrm_policy_kill(xp);
++
++ write_lock_bh(&xfrm_policy_lock);
++ }
++ }
++ atomic_inc(&flow_cache_genid);
++ write_unlock_bh(&xfrm_policy_lock);
++ wake_up(&km_waitq);
++}
++
++int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
++ void *data)
++{
++ struct xfrm_policy *xp;
++ int dir;
++ int count = 0;
++ int error = 0;
++
++ read_lock_bh(&xfrm_policy_lock);
++ for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
++ for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
++ count++;
++ }
++
++ if (count == 0) {
++ error = -ENOENT;
++ goto out;
++ }
++
++ for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
++ for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
++ error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
++ if (error)
++ goto out;
++ }
++ }
++
++out:
++ read_unlock_bh(&xfrm_policy_lock);
++ return error;
++}
++
++
++/* Find policy to apply to this flow. */
++
++static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
++ void **objp, atomic_t **obj_refp)
++{
++ struct xfrm_policy *pol;
++
++ read_lock_bh(&xfrm_policy_lock);
++ for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
++ struct xfrm_selector *sel = &pol->selector;
++ int match;
++
++ if (pol->family != family)
++ continue;
++
++ match = xfrm_selector_match(sel, fl, family);
++ if (match) {
++ xfrm_pol_hold(pol);
++ break;
++ }
++ }
++ read_unlock_bh(&xfrm_policy_lock);
++ if ((*objp = (void *) pol) != NULL)
++ *obj_refp = &pol->refcnt;
++}
++
++struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
++{
++ struct xfrm_policy *pol;
++
++ read_lock_bh(&xfrm_policy_lock);
++ if ((pol = sk->policy[dir]) != NULL) {
++ int match;
++
++ match = xfrm_selector_match(&pol->selector, fl, sk->family);
++ if (match)
++ xfrm_pol_hold(pol);
++ else
++ pol = NULL;
++ }
++ read_unlock_bh(&xfrm_policy_lock);
++ return pol;
++}
++
++static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
++{
++ pol->next = xfrm_policy_list[dir];
++ xfrm_policy_list[dir] = pol;
++ xfrm_pol_hold(pol);
++}
++
++static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
++ int dir)
++{
++ struct xfrm_policy **polp;
++
++ for (polp = &xfrm_policy_list[dir];
++ *polp != NULL; polp = &(*polp)->next) {
++ if (*polp == pol) {
++ *polp = pol->next;
++ return pol;
++ }
++ }
++ return NULL;
++}
++
++void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
++{
++ write_lock_bh(&xfrm_policy_lock);
++ pol = __xfrm_policy_unlink(pol, dir);
++ write_unlock_bh(&xfrm_policy_lock);
++ if (pol) {
++ if (dir < XFRM_POLICY_MAX)
++ atomic_inc(&flow_cache_genid);
++ xfrm_policy_kill(pol);
++ }
++}
++
++int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
++{
++ struct xfrm_policy *old_pol;
++
++ write_lock_bh(&xfrm_policy_lock);
++ old_pol = sk->policy[dir];
++ sk->policy[dir] = pol;
++ if (pol) {
++ pol->curlft.add_time = (unsigned long)xtime.tv_sec;
++ pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
++ __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
++ }
++ if (old_pol)
++ __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
++ write_unlock_bh(&xfrm_policy_lock);
++
++ if (old_pol) {
++ xfrm_policy_kill(old_pol);
++ }
++ wake_up(&km_waitq);
++ return 0;
++}
++
++static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
++{
++ struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC);
++
++ if (newp) {
++ newp->selector = old->selector;
++ newp->lft = old->lft;
++ newp->curlft = old->curlft;
++ newp->action = old->action;
++ newp->flags = old->flags;
++ newp->xfrm_nr = old->xfrm_nr;
++ newp->index = old->index;
++ memcpy(newp->xfrm_vec, old->xfrm_vec,
++ newp->xfrm_nr*sizeof(struct xfrm_tmpl));
++ write_lock_bh(&xfrm_policy_lock);
++ __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
++ write_unlock_bh(&xfrm_policy_lock);
++ xfrm_pol_put(newp);
++ }
++ return newp;
++}
++
++int __xfrm_sk_clone_policy(struct sock *sk)
++{
++ struct xfrm_policy *p0, *p1;
++ p0 = sk->policy[0];
++ p1 = sk->policy[1];
++ sk->policy[0] = NULL;
++ sk->policy[1] = NULL;
++ if (p0 && (sk->policy[0] = clone_policy(p0, 0)) == NULL)
++ return -ENOMEM;
++ if (p1 && (sk->policy[1] = clone_policy(p1, 1)) == NULL)
++ return -ENOMEM;
++ return 0;
++}
++
++/* Resolve list of templates for the flow, given policy. */
++
++static int
++xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
++ struct xfrm_state **xfrm,
++ unsigned short family)
++{
++ int nx;
++ int i, error;
++ xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
++ xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
++
++ for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
++ struct xfrm_state *x;
++ xfrm_address_t *remote = daddr;
++ xfrm_address_t *local = saddr;
++ struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
++
++ if (tmpl->mode) {
++ remote = &tmpl->id.daddr;
++ local = &tmpl->saddr;
++ }
++
++ x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
++
++ if (x && x->km.state == XFRM_STATE_VALID) {
++ xfrm[nx++] = x;
++ daddr = remote;
++ saddr = local;
++ continue;
++ }
++ if (x) {
++ error = (x->km.state == XFRM_STATE_ERROR ?
++ -EINVAL : -EAGAIN);
++ xfrm_state_put(x);
++ }
++
++ if (!tmpl->optional)
++ goto fail;
++ }
++ return nx;
++
++fail:
++ for (nx--; nx>=0; nx--)
++ xfrm_state_put(xfrm[nx]);
++ return error;
++}
++
++/* Check that the bundle accepts the flow and its components are
++ * still valid.
++ */
++
++static struct dst_entry *
++xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
++{
++ struct dst_entry *x;
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++ if (unlikely(afinfo == NULL))
++ return ERR_PTR(-EINVAL);
++ x = afinfo->find_bundle(fl, policy);
++ xfrm_policy_put_afinfo(afinfo);
++ return x;
++}
++
++/* Allocate chain of dst_entry's, attach known xfrm's, calculate
++ * all the metrics... Shortly, bundle a bundle.
++ */
++
++static int
++xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
++ struct flowi *fl, struct dst_entry **dst_p,
++ unsigned short family)
++{
++ int err;
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++ if (unlikely(afinfo == NULL))
++ return -EINVAL;
++ err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p);
++ xfrm_policy_put_afinfo(afinfo);
++ return err;
++}
++
++static inline int policy_to_flow_dir(int dir)
++{
++ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
++ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
++ XFRM_POLICY_FWD == FLOW_DIR_FWD)
++ return dir;
++ switch (dir) {
++ default:
++ case XFRM_POLICY_IN:
++ return FLOW_DIR_IN;
++ case XFRM_POLICY_OUT:
++ return FLOW_DIR_OUT;
++ case XFRM_POLICY_FWD:
++ return FLOW_DIR_FWD;
++ };
++}
++
++static int stale_bundle(struct dst_entry *dst);
++
++/* Main function: finds/creates a bundle for given flow.
++ *
++ * At the moment we eat a raw IP route. Mostly to speed up lookups
++ * on interfaces with disabled IPsec.
++ */
++int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
++ struct sock *sk, int flags)
++{
++ struct xfrm_policy *policy;
++ struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
++ struct dst_entry *dst, *dst_orig = *dst_p;
++ int nx = 0;
++ int err;
++ u32 genid;
++ u16 family = dst_orig->ops->family;
++restart:
++ genid = atomic_read(&flow_cache_genid);
++ policy = NULL;
++ if (sk && sk->policy[1])
++ policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
++
++ if (!policy) {
++ /* To accelerate a bit... */
++ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
++ return 0;
++
++ policy = flow_cache_lookup(fl, family,
++ policy_to_flow_dir(XFRM_POLICY_OUT),
++ xfrm_policy_lookup);
++ }
++
++ if (!policy)
++ return 0;
++
++ policy->curlft.use_time = (unsigned long)xtime.tv_sec;
++
++ switch (policy->action) {
++ case XFRM_POLICY_BLOCK:
++ /* Prohibit the flow */
++ xfrm_pol_put(policy);
++ return -EPERM;
++
++ case XFRM_POLICY_ALLOW:
++ if (policy->xfrm_nr == 0) {
++ /* Flow passes not transformed. */
++ xfrm_pol_put(policy);
++ return 0;
++ }
++
++ /* Try to find matching bundle.
++ *
++ * LATER: help from flow cache. It is optional, this
++ * is required only for output policy.
++ */
++ dst = xfrm_find_bundle(fl, policy, family);
++ if (IS_ERR(dst)) {
++ xfrm_pol_put(policy);
++ return PTR_ERR(dst);
++ }
++
++ if (dst)
++ break;
++
++ nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);
++
++ if (unlikely(nx<0)) {
++ err = nx;
++ if (err == -EAGAIN && flags) {
++ DECLARE_WAITQUEUE(wait, current);
++
++ add_wait_queue(&km_waitq, &wait);
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule();
++ set_current_state(TASK_RUNNING);
++ remove_wait_queue(&km_waitq, &wait);
++
++ nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);
++
++ if (nx == -EAGAIN && signal_pending(current)) {
++ err = -ERESTART;
++ goto error;
++ }
++ if (nx == -EAGAIN ||
++ genid != atomic_read(&flow_cache_genid)) {
++ xfrm_pol_put(policy);
++ goto restart;
++ }
++ err = nx;
++ }
++ if (err < 0)
++ goto error;
++ }
++ if (nx == 0) {
++ /* Flow passes not transformed. */
++ xfrm_pol_put(policy);
++ return 0;
++ }
++
++ dst = dst_orig;
++ err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family);
++
++ if (unlikely(err)) {
++ int i;
++ for (i=0; i<nx; i++)
++ xfrm_state_put(xfrm[i]);
++ goto error;
++ }
++
++ write_lock_bh(&policy->lock);
++ if (unlikely(policy->dead || stale_bundle(dst))) {
++ /* Wow! While we worked on resolving, this
++ * policy has gone. Retry. It is not paranoia,
++ * we just cannot enlist new bundle to dead object.
++ * We can't enlist stable bundles either.
++ */
++ write_unlock_bh(&policy->lock);
++
++ xfrm_pol_put(policy);
++ if (dst)
++ dst_free(dst);
++ goto restart;
++ }
++ dst->next = policy->bundles;
++ policy->bundles = dst;
++ dst_hold(dst);
++ write_unlock_bh(&policy->lock);
++ }
++ *dst_p = dst;
++ dst_release(dst_orig);
++ xfrm_pol_put(policy);
++ return 0;
++
++error:
++ dst_release(dst_orig);
++ xfrm_pol_put(policy);
++ *dst_p = NULL;
++ return err;
++}
++
++/* When skb is transformed back to its "native" form, we have to
++ * check policy restrictions. At the moment we make this in maximally
++ * stupid way. Shame on me. :-) Of course, connected sockets must
++ * have policy cached at them.
++ */
++
++static inline int
++xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
++ unsigned short family)
++{
++ if (xfrm_state_kern(x))
++ return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family);
++ return x->id.proto == tmpl->id.proto &&
++ (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
++ (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
++ x->props.mode == tmpl->mode &&
++ (tmpl->aalgos & (1<<x->props.aalgo)) &&
++ !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
++}
++
++static inline int
++xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
++ unsigned short family)
++{
++ int idx = start;
++
++ if (tmpl->optional) {
++ if (!tmpl->mode)
++ return start;
++ } else
++ start = -1;
++ for (; idx < sp->len; idx++) {
++ if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family))
++ return ++idx;
++ if (sp->x[idx].xvec->props.mode)
++ break;
++ }
++ return start;
++}
++
++static int
++_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
++
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++
++ afinfo->decode_session(skb, fl);
++ xfrm_policy_put_afinfo(afinfo);
++ return 0;
++}
++
++static inline int secpath_has_tunnel(struct sec_path *sp, int k)
++{
++ for (; k < sp->len; k++) {
++ if (sp->x[k].xvec->props.mode)
++ return 1;
++ }
++
++ return 0;
++}
++
++int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
++ unsigned short family)
++{
++ struct xfrm_policy *pol;
++ struct flowi fl;
++
++ if (_decode_session(skb, &fl, family) < 0)
++ return 0;
++
++ /* First, check used SA against their selectors. */
++ if (skb->sp) {
++ int i;
++
++ for (i=skb->sp->len-1; i>=0; i--) {
++ struct sec_decap_state *xvec = &(skb->sp->x[i]);
++ if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
++ return 0;
++
++ /* If there is a post_input processor, try running it */
++ if (xvec->xvec->type->post_input &&
++ (xvec->xvec->type->post_input)(xvec->xvec,
++ &(xvec->decap),
++ skb) != 0)
++ return 0;
++ }
++ }
++
++ pol = NULL;
++ if (sk && sk->policy[dir])
++ pol = xfrm_sk_policy_lookup(sk, dir, &fl);
++
++ if (!pol)
++ pol = flow_cache_lookup(&fl, family,
++ policy_to_flow_dir(dir),
++ xfrm_policy_lookup);
++
++ if (!pol)
++ return !skb->sp || !secpath_has_tunnel(skb->sp, 0);
++
++ pol->curlft.use_time = (unsigned long)xtime.tv_sec;
++
++ if (pol->action == XFRM_POLICY_ALLOW) {
++ struct sec_path *sp;
++ static struct sec_path dummy;
++ int i, k;
++
++ if ((sp = skb->sp) == NULL)
++ sp = &dummy;
++
++ /* For each tunnel xfrm, find the first matching tmpl.
++ * For each tmpl before that, find corresponding xfrm.
++ * Order is _important_. Later we will implement
++ * some barriers, but at the moment barriers
++ * are implied between each two transformations.
++ */
++ for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
++ k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
++ if (k < 0)
++ goto reject;
++ }
++
++ if (secpath_has_tunnel(sp, k))
++ goto reject;
++
++ xfrm_pol_put(pol);
++ return 1;
++ }
++
++reject:
++ xfrm_pol_put(pol);
++ return 0;
++}
++
++int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
++{
++ struct flowi fl;
++
++ if (_decode_session(skb, &fl, family) < 0)
++ return 0;
++
++ return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;
++}
++
++/* Optimize later using cookies and generation ids. */
++
++static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
++{
++ if (!stale_bundle(dst))
++ return dst;
++
++ dst_release(dst);
++ return NULL;
++}
++
++static int stale_bundle(struct dst_entry *dst)
++{
++ struct dst_entry *child = dst;
++
++ while (child) {
++ if (child->obsolete > 0 ||
++ (child->dev && !netif_running(child->dev)) ||
++ (child->xfrm && child->xfrm->km.state != XFRM_STATE_VALID)) {
++ return 1;
++ }
++ child = child->child;
++ }
++
++ return 0;
++}
++
++static void xfrm_dst_destroy(struct dst_entry *dst)
++{
++ if (!dst->xfrm)
++ return;
++ xfrm_state_put(dst->xfrm);
++ dst->xfrm = NULL;
++}
++
++static void xfrm_link_failure(struct sk_buff *skb)
++{
++ /* Impossible. Such dst must be popped before reaches point of failure. */
++ return;
++}
++
++static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
++{
++ if (dst) {
++ if (dst->obsolete) {
++ dst_release(dst);
++ dst = NULL;
++ }
++ }
++ return dst;
++}
++
++static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
++{
++ int i;
++ struct xfrm_policy *pol;
++ struct dst_entry *dst, **dstp, *gc_list = NULL;
++
++ read_lock_bh(&xfrm_policy_lock);
++ for (i=0; i<2*XFRM_POLICY_MAX; i++) {
++ for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
++ write_lock(&pol->lock);
++ dstp = &pol->bundles;
++ while ((dst=*dstp) != NULL) {
++ if (func(dst)) {
++ *dstp = dst->next;
++ dst->next = gc_list;
++ gc_list = dst;
++ } else {
++ dstp = &dst->next;
++ }
++ }
++ write_unlock(&pol->lock);
++ }
++ }
++ read_unlock_bh(&xfrm_policy_lock);
++
++ while (gc_list) {
++ dst = gc_list;
++ gc_list = dst->next;
++ dst_free(dst);
++ }
++}
++
++static int unused_bundle(struct dst_entry *dst)
++{
++ return !atomic_read(&dst->__refcnt);
++}
++
++static void __xfrm_garbage_collect(void)
++{
++ xfrm_prune_bundles(unused_bundle);
++}
++
++int xfrm_flush_bundles(void)
++{
++ xfrm_prune_bundles(stale_bundle);
++ return 0;
++}
++
++/* Well... that's _TASK_. We need to scan through transformation
++ * list and figure out what mss tcp should generate in order to
++ * final datagram fit to mtu. Mama mia... :-)
++ *
++ * Apparently, some easy way exists, but we used to choose the most
++ * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta.
++ *
++ * Consider this function as something like dark humour. :-)
++ */
++static int xfrm_get_mss(struct dst_entry *dst, u32 mtu)
++{
++ int res = mtu - dst->header_len;
++
++ for (;;) {
++ struct dst_entry *d = dst;
++ int m = res;
++
++ do {
++ struct xfrm_state *x = d->xfrm;
++ if (x) {
++ spin_lock_bh(&x->lock);
++ if (x->km.state == XFRM_STATE_VALID &&
++ x->type && x->type->get_max_size)
++ m = x->type->get_max_size(d->xfrm, m);
++ else
++ m += x->props.header_len;
++ spin_unlock_bh(&x->lock);
++ }
++ } while ((d = d->child) != NULL);
++
++ if (m <= mtu)
++ break;
++ res -= (m - mtu);
++ if (res < 88)
++ return mtu;
++ }
++
++ return res + dst->header_len;
++}
++
++int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
++{
++ int err = 0;
++ if (unlikely(afinfo == NULL))
++ return -EINVAL;
++ if (unlikely(afinfo->family >= NPROTO))
++ return -EAFNOSUPPORT;
++ write_lock(&xfrm_policy_afinfo_lock);
++ if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
++ err = -ENOBUFS;
++ else {
++ struct dst_ops *dst_ops = afinfo->dst_ops;
++ if (likely(dst_ops->kmem_cachep == NULL))
++ dst_ops->kmem_cachep = xfrm_dst_cache;
++ if (likely(dst_ops->check == NULL))
++ dst_ops->check = xfrm_dst_check;
++ if (likely(dst_ops->destroy == NULL))
++ dst_ops->destroy = xfrm_dst_destroy;
++ if (likely(dst_ops->negative_advice == NULL))
++ dst_ops->negative_advice = xfrm_negative_advice;
++ if (likely(dst_ops->link_failure == NULL))
++ dst_ops->link_failure = xfrm_link_failure;
++ if (likely(dst_ops->get_mss == NULL))
++ dst_ops->get_mss = xfrm_get_mss;
++ if (likely(afinfo->garbage_collect == NULL))
++ afinfo->garbage_collect = __xfrm_garbage_collect;
++ xfrm_policy_afinfo[afinfo->family] = afinfo;
++ }
++ write_unlock(&xfrm_policy_afinfo_lock);
++ return err;
++}
++
++int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
++{
++ int err = 0;
++ if (unlikely(afinfo == NULL))
++ return -EINVAL;
++ if (unlikely(afinfo->family >= NPROTO))
++ return -EAFNOSUPPORT;
++ write_lock(&xfrm_policy_afinfo_lock);
++ if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
++ if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
++ err = -EINVAL;
++ else {
++ struct dst_ops *dst_ops = afinfo->dst_ops;
++ xfrm_policy_afinfo[afinfo->family] = NULL;
++ dst_ops->kmem_cachep = NULL;
++ dst_ops->check = NULL;
++ dst_ops->destroy = NULL;
++ dst_ops->negative_advice = NULL;
++ dst_ops->link_failure = NULL;
++ dst_ops->get_mss = NULL;
++ afinfo->garbage_collect = NULL;
++ }
++ }
++ write_unlock(&xfrm_policy_afinfo_lock);
++ return err;
++}
++
++static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
++{
++ struct xfrm_policy_afinfo *afinfo;
++ if (unlikely(family >= NPROTO))
++ return NULL;
++ read_lock(&xfrm_policy_afinfo_lock);
++ afinfo = xfrm_policy_afinfo[family];
++ if (likely(afinfo != NULL))
++ read_lock(&afinfo->lock);
++ read_unlock(&xfrm_policy_afinfo_lock);
++ return afinfo;
++}
++
++static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
++{
++ if (unlikely(afinfo == NULL))
++ return;
++ read_unlock(&afinfo->lock);
++}
++
++static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
++{
++ switch (event) {
++ case NETDEV_DOWN:
++ xfrm_flush_bundles();
++ }
++ return NOTIFY_DONE;
++}
++
++struct notifier_block xfrm_dev_notifier = {
++ xfrm_dev_event,
++ NULL,
++ 0
++};
++
++void __init xfrm_policy_init(void)
++{
++ xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
++ sizeof(struct xfrm_dst),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++ if (!xfrm_dst_cache)
++ panic("XFRM: failed to allocate xfrm_dst_cache\n");
++
++ INIT_TQUEUE(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
++ register_netdevice_notifier(&xfrm_dev_notifier);
++}
++
++void __init xfrm_init(void)
++{
++ xfrm_state_init();
++ xfrm_policy_init();
++ xfrm_input_init();
++}
++
+diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_state.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,950 @@
++/*
++ * xfrm_state.c
++ *
++ * Changes:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * IPv6 support
++ * YOSHIFUJI Hideaki @USAGI
++ * Split up af-specific functions
++ * Derek Atkins <derek at ihtfp.com>
++ * Add UDP Encapsulation
++ *
++ */
++
++#include <net/xfrm.h>
++#include <linux/pfkeyv2.h>
++#include <linux/ipsec.h>
++#include <asm/uaccess.h>
++#include <linux/tqueue.h>
++
++/* Each xfrm_state may be linked to two tables:
++
++ 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
++ 2. Hash table by daddr to find what SAs exist for given
++ destination/tunnel endpoint. (output)
++ */
++
++static spinlock_t xfrm_state_lock = SPIN_LOCK_UNLOCKED;
++
++/* Hash table to find appropriate SA towards given target (endpoint
++ * of tunnel or destination of transport mode) allowed by selector.
++ *
++ * Main use is finding SA after policy selected tunnel or transport mode.
++ * Also, it can be used by ah/esp icmp error handler to find offending SA.
++ */
++static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE];
++static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE];
++
++DECLARE_WAIT_QUEUE_HEAD(km_waitq);
++
++static rwlock_t xfrm_state_afinfo_lock = RW_LOCK_UNLOCKED;
++static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
++
++static struct tq_struct xfrm_state_gc_work;
++static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list);
++static spinlock_t xfrm_state_gc_lock = SPIN_LOCK_UNLOCKED;
++
++static void __xfrm_state_delete(struct xfrm_state *x);
++
++static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
++static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
++
++static void xfrm_state_gc_destroy(struct xfrm_state *x)
++{
++ if (del_timer(&x->timer))
++ BUG();
++ if (x->aalg)
++ kfree(x->aalg);
++ if (x->ealg)
++ kfree(x->ealg);
++ if (x->calg)
++ kfree(x->calg);
++ if (x->encap)
++ kfree(x->encap);
++ if (x->type) {
++ x->type->destructor(x);
++ xfrm_put_type(x->type);
++ }
++ kfree(x);
++}
++
++static void xfrm_state_gc_task(void *data)
++{
++ struct xfrm_state *x;
++ struct list_head *entry, *tmp;
++ struct list_head gc_list = LIST_HEAD_INIT(gc_list);
++
++ spin_lock_bh(&xfrm_state_gc_lock);
++ list_splice_init(&xfrm_state_gc_list, &gc_list);
++ spin_unlock_bh(&xfrm_state_gc_lock);
++
++ list_for_each_safe(entry, tmp, &gc_list) {
++ x = list_entry(entry, struct xfrm_state, bydst);
++ xfrm_state_gc_destroy(x);
++ }
++ wake_up(&km_waitq);
++}
++
++static inline unsigned long make_jiffies(long secs)
++{
++ if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
++ return MAX_SCHEDULE_TIMEOUT-1;
++ else
++ return secs*HZ;
++}
++
++static void xfrm_timer_handler(unsigned long data)
++{
++ struct xfrm_state *x = (struct xfrm_state*)data;
++ unsigned long now = (unsigned long)xtime.tv_sec;
++ long next = LONG_MAX;
++ int warn = 0;
++
++ spin_lock(&x->lock);
++ if (x->km.state == XFRM_STATE_DEAD)
++ goto out;
++ if (x->km.state == XFRM_STATE_EXPIRED)
++ goto expired;
++ if (x->lft.hard_add_expires_seconds) {
++ long tmo = x->lft.hard_add_expires_seconds +
++ x->curlft.add_time - now;
++ if (tmo <= 0)
++ goto expired;
++ if (tmo < next)
++ next = tmo;
++ }
++ if (x->lft.hard_use_expires_seconds) {
++ long tmo = x->lft.hard_use_expires_seconds +
++ (x->curlft.use_time ? : now) - now;
++ if (tmo <= 0)
++ goto expired;
++ if (tmo < next)
++ next = tmo;
++ }
++ if (x->km.dying)
++ goto resched;
++ if (x->lft.soft_add_expires_seconds) {
++ long tmo = x->lft.soft_add_expires_seconds +
++ x->curlft.add_time - now;
++ if (tmo <= 0)
++ warn = 1;
++ else if (tmo < next)
++ next = tmo;
++ }
++ if (x->lft.soft_use_expires_seconds) {
++ long tmo = x->lft.soft_use_expires_seconds +
++ (x->curlft.use_time ? : now) - now;
++ if (tmo <= 0)
++ warn = 1;
++ else if (tmo < next)
++ next = tmo;
++ }
++
++ if (warn)
++ km_state_expired(x, 0);
++resched:
++ if (next != LONG_MAX &&
++ !mod_timer(&x->timer, jiffies + make_jiffies(next)))
++ xfrm_state_hold(x);
++ goto out;
++
++expired:
++ if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) {
++ x->km.state = XFRM_STATE_EXPIRED;
++ wake_up(&km_waitq);
++ next = 2;
++ goto resched;
++ }
++ if (x->id.spi != 0)
++ km_state_expired(x, 1);
++ __xfrm_state_delete(x);
++
++out:
++ spin_unlock(&x->lock);
++ xfrm_state_put(x);
++}
++
++struct xfrm_state *xfrm_state_alloc(void)
++{
++ struct xfrm_state *x;
++
++ x = kmalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
++
++ if (x) {
++ memset(x, 0, sizeof(struct xfrm_state));
++ atomic_set(&x->refcnt, 1);
++ atomic_set(&x->tunnel_users, 0);
++ INIT_LIST_HEAD(&x->bydst);
++ INIT_LIST_HEAD(&x->byspi);
++ init_timer(&x->timer);
++ x->timer.function = xfrm_timer_handler;
++ x->timer.data = (unsigned long)x;
++ x->curlft.add_time = (unsigned long)xtime.tv_sec;
++ x->lft.soft_byte_limit = XFRM_INF;
++ x->lft.soft_packet_limit = XFRM_INF;
++ x->lft.hard_byte_limit = XFRM_INF;
++ x->lft.hard_packet_limit = XFRM_INF;
++ x->lock = SPIN_LOCK_UNLOCKED;
++ }
++ return x;
++}
++
++void __xfrm_state_destroy(struct xfrm_state *x)
++{
++ BUG_TRAP(x->km.state == XFRM_STATE_DEAD);
++
++ spin_lock_bh(&xfrm_state_gc_lock);
++ list_add(&x->bydst, &xfrm_state_gc_list);
++ spin_unlock_bh(&xfrm_state_gc_lock);
++ schedule_task(&xfrm_state_gc_work);
++}
++
++static void __xfrm_state_delete(struct xfrm_state *x)
++{
++ if (x->km.state != XFRM_STATE_DEAD) {
++ x->km.state = XFRM_STATE_DEAD;
++ spin_lock(&xfrm_state_lock);
++ list_del(&x->bydst);
++ atomic_dec(&x->refcnt);
++ if (x->id.spi) {
++ list_del(&x->byspi);
++ atomic_dec(&x->refcnt);
++ }
++ spin_unlock(&xfrm_state_lock);
++ if (del_timer(&x->timer))
++ atomic_dec(&x->refcnt);
++
++ /* The number two in this test is the reference
++ * mentioned in the comment below plus the reference
++ * our caller holds. A larger value means that
++ * there are DSTs attached to this xfrm_state.
++ */
++ if (atomic_read(&x->refcnt) > 2)
++ xfrm_flush_bundles();
++
++ /* All xfrm_state objects are created by xfrm_state_alloc.
++ * The xfrm_state_alloc call gives a reference, and that
++ * is what we are dropping here.
++ */
++ atomic_dec(&x->refcnt);
++ }
++}
++
++void xfrm_state_delete(struct xfrm_state *x)
++{
++ spin_lock_bh(&x->lock);
++ __xfrm_state_delete(x);
++ spin_unlock_bh(&x->lock);
++}
++
++void xfrm_state_flush(u8 proto)
++{
++ int i;
++ struct xfrm_state *x;
++
++ spin_lock_bh(&xfrm_state_lock);
++ for (i = 0; i < XFRM_DST_HSIZE; i++) {
++restart:
++ list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
++ if (!xfrm_state_kern(x) &&
++ (proto == IPSEC_PROTO_ANY || x->id.proto == proto)) {
++ xfrm_state_hold(x);
++ spin_unlock_bh(&xfrm_state_lock);
++
++ xfrm_state_delete(x);
++ xfrm_state_put(x);
++
++ spin_lock_bh(&xfrm_state_lock);
++ goto restart;
++ }
++ }
++ }
++ spin_unlock_bh(&xfrm_state_lock);
++ wake_up(&km_waitq);
++}
++
++static int
++xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
++ struct xfrm_tmpl *tmpl,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ unsigned short family)
++{
++ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
++ if (!afinfo)
++ return -1;
++ afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
++ xfrm_state_put_afinfo(afinfo);
++ return 0;
++}
++
++struct xfrm_state *
++xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
++ struct flowi *fl, struct xfrm_tmpl *tmpl,
++ struct xfrm_policy *pol, int *err,
++ unsigned short family)
++{
++ unsigned h = xfrm_dst_hash(daddr, family);
++ struct xfrm_state *x;
++ int acquire_in_progress = 0;
++ int error = 0;
++ struct xfrm_state *best = NULL;
++
++ spin_lock_bh(&xfrm_state_lock);
++ list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
++ if (x->props.family == family &&
++ x->props.reqid == tmpl->reqid &&
++ xfrm_state_addr_check(x, daddr, saddr, family) &&
++ tmpl->mode == x->props.mode &&
++ tmpl->id.proto == x->id.proto) {
++ /* Resolution logic:
++ 1. There is a valid state with matching selector.
++ Done.
++ 2. Valid state with inappropriate selector. Skip.
++
++ Entering area of "sysdeps".
++
++ 3. If state is not valid, selector is temporary,
++ it selects only session which triggered
++ previous resolution. Key manager will do
++ something to install a state with proper
++ selector.
++ */
++ if (x->km.state == XFRM_STATE_VALID) {
++ if (!xfrm_selector_match(&x->sel, fl, family))
++ continue;
++ if (!best ||
++ best->km.dying > x->km.dying ||
++ (best->km.dying == x->km.dying &&
++ best->curlft.add_time < x->curlft.add_time))
++ best = x;
++ } else if (x->km.state == XFRM_STATE_ACQ) {
++ acquire_in_progress = 1;
++ } else if (x->km.state == XFRM_STATE_ERROR ||
++ x->km.state == XFRM_STATE_EXPIRED) {
++ if (xfrm_selector_match(&x->sel, fl, family))
++ error = 1;
++ }
++ }
++ }
++
++ x = best;
++ if (!x && !error && !acquire_in_progress &&
++ ((x = xfrm_state_alloc()) != NULL)) {
++ /* Initialize temporary selector matching only
++ * to current session. */
++ xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
++
++ if (km_query(x, tmpl, pol) == 0) {
++ x->km.state = XFRM_STATE_ACQ;
++ list_add_tail(&x->bydst, xfrm_state_bydst+h);
++ xfrm_state_hold(x);
++ if (x->id.spi) {
++ h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family);
++ list_add(&x->byspi, xfrm_state_byspi+h);
++ xfrm_state_hold(x);
++ }
++ x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
++ xfrm_state_hold(x);
++ x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
++ add_timer(&x->timer);
++ } else {
++ x->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(x);
++ x = NULL;
++ error = 1;
++ }
++ }
++ if (x)
++ xfrm_state_hold(x);
++ else
++ *err = acquire_in_progress ? -EAGAIN :
++ (error ? -ESRCH : -ENOMEM);
++ spin_unlock_bh(&xfrm_state_lock);
++ return x;
++}
++
++static void __xfrm_state_insert(struct xfrm_state *x)
++{
++ unsigned h = xfrm_dst_hash(&x->id.daddr, x->props.family);
++
++ list_add(&x->bydst, xfrm_state_bydst+h);
++ xfrm_state_hold(x);
++
++ h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family);
++
++ list_add(&x->byspi, xfrm_state_byspi+h);
++ xfrm_state_hold(x);
++
++ if (!mod_timer(&x->timer, jiffies + HZ))
++ xfrm_state_hold(x);
++
++ wake_up(&km_waitq);
++}
++
++void xfrm_state_insert(struct xfrm_state *x)
++{
++ spin_lock_bh(&xfrm_state_lock);
++ __xfrm_state_insert(x);
++ spin_unlock_bh(&xfrm_state_lock);
++}
++
++static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq);
++
++int xfrm_state_add(struct xfrm_state *x)
++{
++ struct xfrm_state_afinfo *afinfo;
++ struct xfrm_state *x1;
++ int family;
++ int err;
++
++ family = x->props.family;
++ afinfo = xfrm_state_get_afinfo(family);
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++
++ spin_lock_bh(&xfrm_state_lock);
++
++ x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
++ if (x1) {
++ xfrm_state_put(x1);
++ x1 = NULL;
++ err = -EEXIST;
++ goto out;
++ }
++
++ if (x->km.seq) {
++ x1 = __xfrm_find_acq_byseq(x->km.seq);
++ if (x1 && xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family)) {
++ xfrm_state_put(x1);
++ x1 = NULL;
++ }
++ }
++
++ if (!x1)
++ x1 = afinfo->find_acq(
++ x->props.mode, x->props.reqid, x->id.proto,
++ &x->id.daddr, &x->props.saddr, 0);
++
++ __xfrm_state_insert(x);
++ err = 0;
++
++out:
++ spin_unlock_bh(&xfrm_state_lock);
++ xfrm_state_put_afinfo(afinfo);
++
++ if (x1) {
++ xfrm_state_delete(x1);
++ xfrm_state_put(x1);
++ }
++
++ return err;
++}
++
++int xfrm_state_update(struct xfrm_state *x)
++{
++ struct xfrm_state_afinfo *afinfo;
++ struct xfrm_state *x1;
++ int err;
++
++ afinfo = xfrm_state_get_afinfo(x->props.family);
++ if (unlikely(afinfo == NULL))
++ return -EAFNOSUPPORT;
++
++ spin_lock_bh(&xfrm_state_lock);
++ x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
++
++ err = -ESRCH;
++ if (!x1)
++ goto out;
++
++ if (xfrm_state_kern(x1)) {
++ xfrm_state_put(x1);
++ err = -EEXIST;
++ goto out;
++ }
++
++ if (x1->km.state == XFRM_STATE_ACQ) {
++ __xfrm_state_insert(x);
++ x = NULL;
++ }
++ err = 0;
++
++out:
++ spin_unlock_bh(&xfrm_state_lock);
++ xfrm_state_put_afinfo(afinfo);
++
++ if (err)
++ return err;
++
++ if (!x) {
++ xfrm_state_delete(x1);
++ xfrm_state_put(x1);
++ return 0;
++ }
++
++ err = -EINVAL;
++ spin_lock_bh(&x1->lock);
++ if (likely(x1->km.state == XFRM_STATE_VALID)) {
++ if (x->encap && x1->encap)
++ memcpy(x1->encap, x->encap, sizeof(*x1->encap));
++ memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
++ x1->km.dying = 0;
++
++ if (!mod_timer(&x1->timer, jiffies + HZ))
++ xfrm_state_hold(x1);
++ if (x1->curlft.use_time)
++ xfrm_state_check_expire(x1);
++
++ err = 0;
++ }
++ spin_unlock_bh(&x1->lock);
++
++ xfrm_state_put(x1);
++
++ return err;
++}
++
++int xfrm_state_check_expire(struct xfrm_state *x)
++{
++ if (!x->curlft.use_time)
++ x->curlft.use_time = (unsigned long)xtime.tv_sec;
++
++ if (x->km.state != XFRM_STATE_VALID)
++ return -EINVAL;
++
++ if (x->curlft.bytes >= x->lft.hard_byte_limit ||
++ x->curlft.packets >= x->lft.hard_packet_limit) {
++ km_state_expired(x, 1);
++ if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ))
++ xfrm_state_hold(x);
++ return -EINVAL;
++ }
++
++ if (!x->km.dying &&
++ (x->curlft.bytes >= x->lft.soft_byte_limit ||
++ x->curlft.packets >= x->lft.soft_packet_limit))
++ km_state_expired(x, 0);
++ return 0;
++}
++
++static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb)
++{
++ int nhead = x->props.header_len + LL_RESERVED_SPACE(skb->dst->dev)
++ - skb_headroom(skb);
++
++ if (nhead > 0)
++ return pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
++
++ /* Check tail too... */
++ return 0;
++}
++
++int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb)
++{
++ int err = xfrm_state_check_expire(x);
++ if (err < 0)
++ goto err;
++ err = xfrm_state_check_space(x, skb);
++err:
++ return err;
++}
++
++struct xfrm_state *
++xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto,
++ unsigned short family)
++{
++ struct xfrm_state *x;
++ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
++ if (!afinfo)
++ return NULL;
++
++ spin_lock_bh(&xfrm_state_lock);
++ x = afinfo->state_lookup(daddr, spi, proto);
++ spin_unlock_bh(&xfrm_state_lock);
++ xfrm_state_put_afinfo(afinfo);
++ return x;
++}
++
++struct xfrm_state *
++xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
++ xfrm_address_t *daddr, xfrm_address_t *saddr,
++ int create, unsigned short family)
++{
++ struct xfrm_state *x;
++ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
++ if (!afinfo)
++ return NULL;
++
++ spin_lock_bh(&xfrm_state_lock);
++ x = afinfo->find_acq(mode, reqid, proto, daddr, saddr, create);
++ spin_unlock_bh(&xfrm_state_lock);
++ xfrm_state_put_afinfo(afinfo);
++ return x;
++}
++
++/* Silly enough, but I'm lazy to build resolution list */
++
++static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq)
++{
++ int i;
++ struct xfrm_state *x;
++
++ for (i = 0; i < XFRM_DST_HSIZE; i++) {
++ list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
++ if (x->km.seq == seq) {
++ xfrm_state_hold(x);
++ return x;
++ }
++ }
++ }
++ return NULL;
++}
++
++struct xfrm_state *xfrm_find_acq_byseq(u32 seq)
++{
++ struct xfrm_state *x;
++
++ spin_lock_bh(&xfrm_state_lock);
++ x = __xfrm_find_acq_byseq(seq);
++ spin_unlock_bh(&xfrm_state_lock);
++ return x;
++}
++
++u32 xfrm_get_acqseq(void)
++{
++ u32 res;
++ static u32 acqseq;
++ static spinlock_t acqseq_lock = SPIN_LOCK_UNLOCKED;
++
++ spin_lock_bh(&acqseq_lock);
++ res = (++acqseq ? : ++acqseq);
++ spin_unlock_bh(&acqseq_lock);
++ return res;
++}
++
++void
++xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
++{
++ u32 h;
++ struct xfrm_state *x0;
++
++ if (x->id.spi)
++ return;
++
++ if (minspi == maxspi) {
++ x0 = xfrm_state_lookup(&x->id.daddr, minspi, x->id.proto, x->props.family);
++ if (x0) {
++ xfrm_state_put(x0);
++ return;
++ }
++ x->id.spi = minspi;
++ } else {
++ u32 spi = 0;
++ minspi = ntohl(minspi);
++ maxspi = ntohl(maxspi);
++ for (h=0; h<maxspi-minspi+1; h++) {
++ spi = minspi + net_random()%(maxspi-minspi+1);
++ x0 = xfrm_state_lookup(&x->id.daddr, htonl(spi), x->id.proto, x->props.family);
++ if (x0 == NULL) {
++ x->id.spi = htonl(spi);
++ break;
++ }
++ xfrm_state_put(x0);
++ }
++ }
++ if (x->id.spi) {
++ spin_lock_bh(&xfrm_state_lock);
++ h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family);
++ list_add(&x->byspi, xfrm_state_byspi+h);
++ xfrm_state_hold(x);
++ spin_unlock_bh(&xfrm_state_lock);
++ wake_up(&km_waitq);
++ }
++}
++
++int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
++ void *data)
++{
++ int i;
++ struct xfrm_state *x;
++ int count = 0;
++ int err = 0;
++
++ spin_lock_bh(&xfrm_state_lock);
++ for (i = 0; i < XFRM_DST_HSIZE; i++) {
++ list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
++ if (proto == IPSEC_PROTO_ANY || x->id.proto == proto)
++ count++;
++ }
++ }
++ if (count == 0) {
++ err = -ENOENT;
++ goto out;
++ }
++
++ for (i = 0; i < XFRM_DST_HSIZE; i++) {
++ list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
++ if (proto != IPSEC_PROTO_ANY && x->id.proto != proto)
++ continue;
++ err = func(x, --count, data);
++ if (err)
++ goto out;
++ }
++ }
++out:
++ spin_unlock_bh(&xfrm_state_lock);
++ return err;
++}
++
++
++int xfrm_replay_check(struct xfrm_state *x, u32 seq)
++{
++ u32 diff;
++
++ seq = ntohl(seq);
++
++ if (unlikely(seq == 0))
++ return -EINVAL;
++
++ if (likely(seq > x->replay.seq))
++ return 0;
++
++ diff = x->replay.seq - seq;
++ if (diff >= x->props.replay_window) {
++ x->stats.replay_window++;
++ return -EINVAL;
++ }
++
++ if (x->replay.bitmap & (1U << diff)) {
++ x->stats.replay++;
++ return -EINVAL;
++ }
++ return 0;
++}
++
++void xfrm_replay_advance(struct xfrm_state *x, u32 seq)
++{
++ u32 diff;
++
++ seq = ntohl(seq);
++
++ if (seq > x->replay.seq) {
++ diff = seq - x->replay.seq;
++ if (diff < x->props.replay_window)
++ x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
++ else
++ x->replay.bitmap = 1;
++ x->replay.seq = seq;
++ } else {
++ diff = x->replay.seq - seq;
++ x->replay.bitmap |= (1U << diff);
++ }
++}
++
++static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list);
++static rwlock_t xfrm_km_lock = RW_LOCK_UNLOCKED;
++
++void km_state_expired(struct xfrm_state *x, int hard)
++{
++ struct xfrm_mgr *km;
++
++ if (hard)
++ x->km.state = XFRM_STATE_EXPIRED;
++ else
++ x->km.dying = 1;
++
++ read_lock(&xfrm_km_lock);
++ list_for_each_entry(km, &xfrm_km_list, list)
++ km->notify(x, hard);
++ read_unlock(&xfrm_km_lock);
++
++ if (hard)
++ wake_up(&km_waitq);
++}
++
++int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
++{
++ int err = -EINVAL;
++ struct xfrm_mgr *km;
++
++ read_lock(&xfrm_km_lock);
++ list_for_each_entry(km, &xfrm_km_list, list) {
++ err = km->acquire(x, t, pol, XFRM_POLICY_OUT);
++ if (!err)
++ break;
++ }
++ read_unlock(&xfrm_km_lock);
++ return err;
++}
++
++int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport)
++{
++ int err = -EINVAL;
++ struct xfrm_mgr *km;
++
++ read_lock(&xfrm_km_lock);
++ list_for_each_entry(km, &xfrm_km_list, list) {
++ if (km->new_mapping)
++ err = km->new_mapping(x, ipaddr, sport);
++ if (!err)
++ break;
++ }
++ read_unlock(&xfrm_km_lock);
++ return err;
++}
++
++void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
++{
++ struct xfrm_mgr *km;
++
++ read_lock(&xfrm_km_lock);
++ list_for_each_entry(km, &xfrm_km_list, list)
++ if (km->notify_policy)
++ km->notify_policy(pol, dir, hard);
++ read_unlock(&xfrm_km_lock);
++
++ if (hard)
++ wake_up(&km_waitq);
++}
++
++int xfrm_user_policy(struct sock *sk, int optname, u8 *optval, int optlen)
++{
++ int err;
++ u8 *data;
++ struct xfrm_mgr *km;
++ struct xfrm_policy *pol = NULL;
++
++ if (optlen <= 0 || optlen > PAGE_SIZE)
++ return -EMSGSIZE;
++
++ data = kmalloc(optlen, GFP_KERNEL);
++ if (!data)
++ return -ENOMEM;
++
++ err = -EFAULT;
++ if (copy_from_user(data, optval, optlen))
++ goto out;
++
++ err = -EINVAL;
++ read_lock(&xfrm_km_lock);
++ list_for_each_entry(km, &xfrm_km_list, list) {
++ pol = km->compile_policy(sk->family, optname, data, optlen, &err);
++ if (err >= 0)
++ break;
++ }
++ read_unlock(&xfrm_km_lock);
++
++ if (err >= 0) {
++ xfrm_sk_policy_insert(sk, err, pol);
++ xfrm_pol_put(pol);
++ err = 0;
++ }
++
++out:
++ kfree(data);
++ return err;
++}
++
++int xfrm_register_km(struct xfrm_mgr *km)
++{
++ write_lock_bh(&xfrm_km_lock);
++ list_add_tail(&km->list, &xfrm_km_list);
++ write_unlock_bh(&xfrm_km_lock);
++ return 0;
++}
++
++int xfrm_unregister_km(struct xfrm_mgr *km)
++{
++ write_lock_bh(&xfrm_km_lock);
++ list_del(&km->list);
++ write_unlock_bh(&xfrm_km_lock);
++ return 0;
++}
++
++int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
++{
++ int err = 0;
++ if (unlikely(afinfo == NULL))
++ return -EINVAL;
++ if (unlikely(afinfo->family >= NPROTO))
++ return -EAFNOSUPPORT;
++ write_lock(&xfrm_state_afinfo_lock);
++ if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
++ err = -ENOBUFS;
++ else {
++ afinfo->state_bydst = xfrm_state_bydst;
++ afinfo->state_byspi = xfrm_state_byspi;
++ xfrm_state_afinfo[afinfo->family] = afinfo;
++ }
++ write_unlock(&xfrm_state_afinfo_lock);
++ return err;
++}
++
++int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
++{
++ int err = 0;
++ if (unlikely(afinfo == NULL))
++ return -EINVAL;
++ if (unlikely(afinfo->family >= NPROTO))
++ return -EAFNOSUPPORT;
++ write_lock(&xfrm_state_afinfo_lock);
++ if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
++ if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
++ err = -EINVAL;
++ else {
++ xfrm_state_afinfo[afinfo->family] = NULL;
++ afinfo->state_byspi = NULL;
++ afinfo->state_bydst = NULL;
++ }
++ }
++ write_unlock(&xfrm_state_afinfo_lock);
++ return err;
++}
++
++static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
++{
++ struct xfrm_state_afinfo *afinfo;
++ if (unlikely(family >= NPROTO))
++ return NULL;
++ read_lock(&xfrm_state_afinfo_lock);
++ afinfo = xfrm_state_afinfo[family];
++ if (likely(afinfo != NULL))
++ read_lock(&afinfo->lock);
++ read_unlock(&xfrm_state_afinfo_lock);
++ return afinfo;
++}
++
++static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
++{
++ if (unlikely(afinfo == NULL))
++ return;
++ read_unlock(&afinfo->lock);
++}
++
++/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
++void xfrm_state_delete_tunnel(struct xfrm_state *x)
++{
++ if (x->tunnel) {
++ struct xfrm_state *t = x->tunnel;
++
++ if (atomic_read(&t->tunnel_users) == 2)
++ xfrm_state_delete(t);
++ atomic_dec(&t->tunnel_users);
++ xfrm_state_put(t);
++ x->tunnel = NULL;
++ }
++}
++
++void __init xfrm_state_init(void)
++{
++ int i;
++
++ for (i=0; i<XFRM_DST_HSIZE; i++) {
++ INIT_LIST_HEAD(&xfrm_state_bydst[i]);
++ INIT_LIST_HEAD(&xfrm_state_byspi[i]);
++ }
++ INIT_TQUEUE(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
++}
++
+diff -Nru a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
+--- /dev/null Wed Dec 31 16:00:00 196900
++++ b/net/xfrm/xfrm_user.c 2005-02-13 21:25:10 +11:00
+@@ -0,0 +1,1253 @@
++/* xfrm_user.c: User interface to configure xfrm engine.
++ *
++ * Copyright (C) 2002 David S. Miller (davem at redhat.com)
++ *
++ * Changes:
++ * Mitsuru KANDA @USAGI
++ * Kazunori MIYAZAWA @USAGI
++ * Kunihiro Ishiguro <kunihiro at ipinfusion.com>
++ * IPv6 support
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/socket.h>
++#include <linux/string.h>
++#include <linux/net.h>
++#include <linux/skbuff.h>
++#include <linux/netlink.h>
++#include <linux/rtnetlink.h>
++#include <linux/pfkeyv2.h>
++#include <linux/ipsec.h>
++#include <linux/init.h>
++#include <net/sock.h>
++#include <net/xfrm.h>
++#include <asm/uaccess.h>
++
++static struct sock *xfrm_nl;
++
++static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type)
++{
++ struct rtattr *rt = xfrma[type - 1];
++ struct xfrm_algo *algp;
++
++ if (!rt)
++ return 0;
++
++ if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp))
++ return -EINVAL;
++
++ algp = RTA_DATA(rt);
++ switch (type) {
++ case XFRMA_ALG_AUTH:
++ if (!algp->alg_key_len &&
++ strcmp(algp->alg_name, "digest_null") != 0)
++ return -EINVAL;
++ break;
++
++ case XFRMA_ALG_CRYPT:
++ if (!algp->alg_key_len &&
++ strcmp(algp->alg_name, "cipher_null") != 0)
++ return -EINVAL;
++ break;
++
++ case XFRMA_ALG_COMP:
++ /* Zero length keys are legal. */
++ break;
++
++ default:
++ return -EINVAL;
++ };
++
++ algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
++ return 0;
++}
++
++static int verify_encap_tmpl(struct rtattr **xfrma)
++{
++ struct rtattr *rt = xfrma[XFRMA_ENCAP - 1];
++ struct xfrm_encap_tmpl *encap;
++
++ if (!rt)
++ return 0;
++
++ if ((rt->rta_len - sizeof(*rt)) < sizeof(*encap))
++ return -EINVAL;
++
++ return 0;
++}
++
++static int verify_newsa_info(struct xfrm_usersa_info *p,
++ struct rtattr **xfrma)
++{
++ int err;
++
++ err = -EINVAL;
++ switch (p->family) {
++ case AF_INET:
++ break;
++
++ case AF_INET6:
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ break;
++#else
++ err = -EAFNOSUPPORT;
++ goto out;
++#endif
++
++ default:
++ goto out;
++ };
++
++ err = -EINVAL;
++ switch (p->id.proto) {
++ case IPPROTO_AH:
++ if (!xfrma[XFRMA_ALG_AUTH-1] ||
++ xfrma[XFRMA_ALG_CRYPT-1] ||
++ xfrma[XFRMA_ALG_COMP-1])
++ goto out;
++ break;
++
++ case IPPROTO_ESP:
++ if ((!xfrma[XFRMA_ALG_AUTH-1] &&
++ !xfrma[XFRMA_ALG_CRYPT-1]) ||
++ xfrma[XFRMA_ALG_COMP-1])
++ goto out;
++ break;
++
++ case IPPROTO_COMP:
++ if (!xfrma[XFRMA_ALG_COMP-1] ||
++ xfrma[XFRMA_ALG_AUTH-1] ||
++ xfrma[XFRMA_ALG_CRYPT-1])
++ goto out;
++ break;
++
++ default:
++ goto out;
++ };
++
++ if ((err = verify_one_alg(xfrma, XFRMA_ALG_AUTH)))
++ goto out;
++ if ((err = verify_one_alg(xfrma, XFRMA_ALG_CRYPT)))
++ goto out;
++ if ((err = verify_one_alg(xfrma, XFRMA_ALG_COMP)))
++ goto out;
++ if ((err = verify_encap_tmpl(xfrma)))
++ goto out;
++
++ err = -EINVAL;
++ switch (p->mode) {
++ case 0:
++ case 1:
++ break;
++
++ default:
++ goto out;
++ };
++
++ err = 0;
++
++out:
++ return err;
++}
++
++static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
++ struct xfrm_algo_desc *(*get_byname)(char *),
++ struct rtattr *u_arg)
++{
++ struct rtattr *rta = u_arg;
++ struct xfrm_algo *p, *ualg;
++ struct xfrm_algo_desc *algo;
++
++ if (!rta)
++ return 0;
++
++ ualg = RTA_DATA(rta);
++
++ algo = get_byname(ualg->alg_name);
++ if (!algo)
++ return -ENOSYS;
++ *props = algo->desc.sadb_alg_id;
++
++ p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
++
++ memcpy(p, ualg, sizeof(*ualg) + ualg->alg_key_len);
++ *algpp = p;
++ return 0;
++}
++
++static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_arg)
++{
++ struct rtattr *rta = u_arg;
++ struct xfrm_encap_tmpl *p, *uencap;
++
++ if (!rta)
++ return 0;
++
++ uencap = RTA_DATA(rta);
++ p = kmalloc(sizeof(*p), GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
++
++ memcpy(p, uencap, sizeof(*p));
++ *encapp = p;
++ return 0;
++}
++
++static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
++{
++ memcpy(&x->id, &p->id, sizeof(x->id));
++ memcpy(&x->sel, &p->sel, sizeof(x->sel));
++ memcpy(&x->lft, &p->lft, sizeof(x->lft));
++ x->props.mode = p->mode;
++ x->props.replay_window = p->replay_window;
++ x->props.reqid = p->reqid;
++ x->props.family = p->family;
++ x->props.saddr = p->saddr;
++ x->props.flags = p->flags;
++}
++
++static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
++ struct rtattr **xfrma,
++ int *errp)
++{
++ struct xfrm_state *x = xfrm_state_alloc();
++ int err = -ENOMEM;
++
++ if (!x)
++ goto error_no_put;
++
++ copy_from_user_state(x, p);
++
++ if ((err = attach_one_algo(&x->aalg, &x->props.aalgo,
++ xfrm_aalg_get_byname,
++ xfrma[XFRMA_ALG_AUTH-1])))
++ goto error;
++ if ((err = attach_one_algo(&x->ealg, &x->props.ealgo,
++ xfrm_ealg_get_byname,
++ xfrma[XFRMA_ALG_CRYPT-1])))
++ goto error;
++ if ((err = attach_one_algo(&x->calg, &x->props.calgo,
++ xfrm_calg_get_byname,
++ xfrma[XFRMA_ALG_COMP-1])))
++ goto error;
++ if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
++ goto error;
++
++ err = -ENOENT;
++ x->type = xfrm_get_type(x->id.proto, x->props.family);
++ if (x->type == NULL)
++ goto error;
++
++ err = x->type->init_state(x, NULL);
++ if (err)
++ goto error;
++
++ x->curlft.add_time = (unsigned long) xtime.tv_sec;
++ x->km.state = XFRM_STATE_VALID;
++ x->km.seq = p->seq;
++
++ return x;
++
++error:
++ x->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(x);
++error_no_put:
++ *errp = err;
++ return NULL;
++}
++
++static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_usersa_info *p = NLMSG_DATA(nlh);
++ struct xfrm_state *x;
++ int err;
++
++ err = verify_newsa_info(p, (struct rtattr **) xfrma);
++ if (err)
++ return err;
++
++ xfrm_probe_algs();
++
++ x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err);
++ if (!x)
++ return err;
++
++ if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
++ err = xfrm_state_add(x);
++ else
++ err = xfrm_state_update(x);
++
++ if (err < 0) {
++ x->km.state = XFRM_STATE_DEAD;
++ xfrm_state_put(x);
++ }
++
++ return err;
++}
++
++static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_state *x;
++ struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
++
++ x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
++ if (x == NULL)
++ return -ESRCH;
++
++ if (xfrm_state_kern(x)) {
++ xfrm_state_put(x);
++ return -EPERM;
++ }
++
++ xfrm_state_delete(x);
++ xfrm_state_put(x);
++
++ return 0;
++}
++
++static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
++{
++ memcpy(&p->id, &x->id, sizeof(p->id));
++ memcpy(&p->sel, &x->sel, sizeof(p->sel));
++ memcpy(&p->lft, &x->lft, sizeof(p->lft));
++ memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
++ memcpy(&p->stats, &x->stats, sizeof(p->stats));
++ p->saddr = x->props.saddr;
++ p->mode = x->props.mode;
++ p->replay_window = x->props.replay_window;
++ p->reqid = x->props.reqid;
++ p->family = x->props.family;
++ p->flags = x->props.flags;
++ p->seq = x->km.seq;
++}
++
++struct xfrm_dump_info {
++ struct sk_buff *in_skb;
++ struct sk_buff *out_skb;
++ u32 nlmsg_seq;
++ u16 nlmsg_flags;
++ int start_idx;
++ int this_idx;
++};
++
++static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
++{
++ struct xfrm_dump_info *sp = ptr;
++ struct sk_buff *in_skb = sp->in_skb;
++ struct sk_buff *skb = sp->out_skb;
++ struct xfrm_usersa_info *p;
++ struct nlmsghdr *nlh;
++ unsigned char *b = skb->tail;
++
++ if (sp->this_idx < sp->start_idx)
++ goto out;
++
++ nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid,
++ sp->nlmsg_seq,
++ XFRM_MSG_NEWSA, sizeof(*p));
++ nlh->nlmsg_flags = sp->nlmsg_flags;
++
++ p = NLMSG_DATA(nlh);
++ copy_to_user_state(x, p);
++
++ if (x->aalg)
++ RTA_PUT(skb, XFRMA_ALG_AUTH,
++ sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg);
++ if (x->ealg)
++ RTA_PUT(skb, XFRMA_ALG_CRYPT,
++ sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg);
++ if (x->calg)
++ RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
++
++ if (x->encap)
++ RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
++
++ nlh->nlmsg_len = skb->tail - b;
++out:
++ sp->this_idx++;
++ return 0;
++
++nlmsg_failure:
++rtattr_failure:
++ skb_trim(skb, b - skb->data);
++ return -1;
++}
++
++static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
++{
++ struct xfrm_dump_info info;
++
++ info.in_skb = cb->skb;
++ info.out_skb = skb;
++ info.nlmsg_seq = cb->nlh->nlmsg_seq;
++ info.nlmsg_flags = NLM_F_MULTI;
++ info.this_idx = 0;
++ info.start_idx = cb->args[0];
++ (void) xfrm_state_walk(IPSEC_PROTO_ANY, dump_one_state, &info);
++ cb->args[0] = info.this_idx;
++
++ return skb->len;
++}
++
++static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
++ struct xfrm_state *x, u32 seq)
++{
++ struct xfrm_dump_info info;
++ struct sk_buff *skb;
++
++ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
++ if (!skb)
++ return ERR_PTR(-ENOMEM);
++
++ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
++ info.in_skb = in_skb;
++ info.out_skb = skb;
++ info.nlmsg_seq = seq;
++ info.nlmsg_flags = 0;
++ info.this_idx = info.start_idx = 0;
++
++ if (dump_one_state(x, 0, &info)) {
++ kfree_skb(skb);
++ return NULL;
++ }
++
++ return skb;
++}
++
++static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
++ struct xfrm_state *x;
++ struct sk_buff *resp_skb;
++ int err;
++
++ x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
++ err = -ESRCH;
++ if (x == NULL)
++ goto out_noput;
++
++ resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
++ if (IS_ERR(resp_skb)) {
++ err = PTR_ERR(resp_skb);
++ } else {
++ err = netlink_unicast(xfrm_nl, resp_skb,
++ NETLINK_CB(skb).pid, MSG_DONTWAIT);
++ }
++ xfrm_state_put(x);
++out_noput:
++ return err;
++}
++
++static int verify_userspi_info(struct xfrm_userspi_info *p)
++{
++ switch (p->info.id.proto) {
++ case IPPROTO_AH:
++ case IPPROTO_ESP:
++ break;
++
++ case IPPROTO_COMP:
++ /* IPCOMP spi is 16-bits. */
++ if (p->max >= 0x10000)
++ return -EINVAL;
++ break;
++
++ default:
++ return -EINVAL;
++ };
++
++ if (p->min > p->max)
++ return -EINVAL;
++
++ return 0;
++}
++
++static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_state *x;
++ struct xfrm_userspi_info *p;
++ struct sk_buff *resp_skb;
++ xfrm_address_t *daddr;
++ int family;
++ int err;
++
++ p = NLMSG_DATA(nlh);
++ err = verify_userspi_info(p);
++ if (err)
++ goto out_noput;
++
++ family = p->info.family;
++ daddr = &p->info.id.daddr;
++
++ x = NULL;
++ if (p->info.seq) {
++ x = xfrm_find_acq_byseq(p->info.seq);
++ if (x && xfrm_addr_cmp(&x->id.daddr, daddr, family)) {
++ xfrm_state_put(x);
++ x = NULL;
++ }
++ }
++
++ if (!x)
++ x = xfrm_find_acq(p->info.mode, p->info.reqid,
++ p->info.id.proto, daddr,
++ &p->info.saddr, 1,
++ family);
++ err = -ENOENT;
++ if (x == NULL)
++ goto out_noput;
++
++ resp_skb = ERR_PTR(-ENOENT);
++
++ spin_lock_bh(&x->lock);
++ if (x->km.state != XFRM_STATE_DEAD) {
++ xfrm_alloc_spi(x, htonl(p->min), htonl(p->max));
++ if (x->id.spi)
++ resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
++ }
++ spin_unlock_bh(&x->lock);
++
++ if (IS_ERR(resp_skb)) {
++ err = PTR_ERR(resp_skb);
++ goto out;
++ }
++
++ err = netlink_unicast(xfrm_nl, resp_skb,
++ NETLINK_CB(skb).pid, MSG_DONTWAIT);
++
++out:
++ xfrm_state_put(x);
++out_noput:
++ return err;
++}
++
++static int verify_policy_dir(__u8 dir)
++{
++ switch (dir) {
++ case XFRM_POLICY_IN:
++ case XFRM_POLICY_OUT:
++ case XFRM_POLICY_FWD:
++ break;
++
++ default:
++ return -EINVAL;
++ };
++
++ return 0;
++}
++
++static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
++{
++ switch (p->share) {
++ case XFRM_SHARE_ANY:
++ case XFRM_SHARE_SESSION:
++ case XFRM_SHARE_USER:
++ case XFRM_SHARE_UNIQUE:
++ break;
++
++ default:
++ return -EINVAL;
++ };
++
++ switch (p->action) {
++ case XFRM_POLICY_ALLOW:
++ case XFRM_POLICY_BLOCK:
++ break;
++
++ default:
++ return -EINVAL;
++ };
++
++ switch (p->sel.family) {
++ case AF_INET:
++ break;
++
++ case AF_INET6:
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ break;
++#else
++ return -EAFNOSUPPORT;
++#endif
++
++ default:
++ return -EINVAL;
++ };
++
++ return verify_policy_dir(p->dir);
++}
++
++static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
++ int nr)
++{
++ int i;
++
++ xp->xfrm_nr = nr;
++ for (i = 0; i < nr; i++, ut++) {
++ struct xfrm_tmpl *t = &xp->xfrm_vec[i];
++
++ memcpy(&t->id, &ut->id, sizeof(struct xfrm_id));
++ memcpy(&t->saddr, &ut->saddr,
++ sizeof(xfrm_address_t));
++ t->reqid = ut->reqid;
++ t->mode = ut->mode;
++ t->share = ut->share;
++ t->optional = ut->optional;
++ t->aalgos = ut->aalgos;
++ t->ealgos = ut->ealgos;
++ t->calgos = ut->calgos;
++ }
++}
++
++static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma)
++{
++ struct rtattr *rt = xfrma[XFRMA_TMPL-1];
++ struct xfrm_user_tmpl *utmpl;
++ int nr;
++
++ if (!rt) {
++ pol->xfrm_nr = 0;
++ } else {
++ nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl);
++
++ if (nr > XFRM_MAX_DEPTH)
++ return -EINVAL;
++
++ copy_templates(pol, RTA_DATA(rt), nr);
++ }
++ return 0;
++}
++
++static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
++{
++ xp->priority = p->priority;
++ xp->index = p->index;
++ memcpy(&xp->selector, &p->sel, sizeof(xp->selector));
++ memcpy(&xp->lft, &p->lft, sizeof(xp->lft));
++ xp->action = p->action;
++ xp->flags = p->flags;
++ xp->family = p->sel.family;
++ /* XXX xp->share = p->share; */
++}
++
++static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir)
++{
++ memcpy(&p->sel, &xp->selector, sizeof(p->sel));
++ memcpy(&p->lft, &xp->lft, sizeof(p->lft));
++ memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));
++ p->priority = xp->priority;
++ p->index = xp->index;
++ p->sel.family = xp->family;
++ p->dir = dir;
++ p->action = xp->action;
++ p->flags = xp->flags;
++ p->share = XFRM_SHARE_ANY; /* XXX xp->share */
++}
++
++static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, struct rtattr **xfrma, int *errp)
++{
++ struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL);
++ int err;
++
++ if (!xp) {
++ *errp = -ENOMEM;
++ return NULL;
++ }
++
++ copy_from_user_policy(xp, p);
++ err = copy_from_user_tmpl(xp, xfrma);
++ if (err) {
++ *errp = err;
++ kfree(xp);
++ xp = NULL;
++ }
++
++ return xp;
++}
++
++static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
++ struct xfrm_policy *xp;
++ int err;
++ int excl;
++
++ err = verify_newpolicy_info(p);
++ if (err)
++ return err;
++
++ xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
++ if (!xp)
++ return err;
++
++ excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
++ err = xfrm_policy_insert(p->dir, xp, excl);
++ if (err) {
++ kfree(xp);
++ return err;
++ }
++
++ xfrm_pol_put(xp);
++
++ return 0;
++}
++
++static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
++{
++ struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
++ int i;
++
++ if (xp->xfrm_nr == 0)
++ return 0;
++
++ for (i = 0; i < xp->xfrm_nr; i++) {
++ struct xfrm_user_tmpl *up = &vec[i];
++ struct xfrm_tmpl *kp = &xp->xfrm_vec[i];
++
++ memcpy(&up->id, &kp->id, sizeof(up->id));
++ up->family = xp->family;
++ memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr));
++ up->reqid = kp->reqid;
++ up->mode = kp->mode;
++ up->share = kp->share;
++ up->optional = kp->optional;
++ up->aalgos = kp->aalgos;
++ up->ealgos = kp->ealgos;
++ up->calgos = kp->calgos;
++ }
++ RTA_PUT(skb, XFRMA_TMPL,
++ (sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr),
++ vec);
++
++ return 0;
++
++rtattr_failure:
++ return -1;
++}
++
++static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
++{
++ struct xfrm_dump_info *sp = ptr;
++ struct xfrm_userpolicy_info *p;
++ struct sk_buff *in_skb = sp->in_skb;
++ struct sk_buff *skb = sp->out_skb;
++ struct nlmsghdr *nlh;
++ unsigned char *b = skb->tail;
++
++ if (sp->this_idx < sp->start_idx)
++ goto out;
++
++ nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid,
++ sp->nlmsg_seq,
++ XFRM_MSG_NEWPOLICY, sizeof(*p));
++ p = NLMSG_DATA(nlh);
++ nlh->nlmsg_flags = sp->nlmsg_flags;
++
++ copy_to_user_policy(xp, p, dir);
++ if (copy_to_user_tmpl(xp, skb) < 0)
++ goto nlmsg_failure;
++
++ nlh->nlmsg_len = skb->tail - b;
++out:
++ sp->this_idx++;
++ return 0;
++
++nlmsg_failure:
++ skb_trim(skb, b - skb->data);
++ return -1;
++}
++
++static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
++{
++ struct xfrm_dump_info info;
++
++ info.in_skb = cb->skb;
++ info.out_skb = skb;
++ info.nlmsg_seq = cb->nlh->nlmsg_seq;
++ info.nlmsg_flags = NLM_F_MULTI;
++ info.this_idx = 0;
++ info.start_idx = cb->args[0];
++ (void) xfrm_policy_walk(dump_one_policy, &info);
++ cb->args[0] = info.this_idx;
++
++ return skb->len;
++}
++
++static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
++ struct xfrm_policy *xp,
++ int dir, u32 seq)
++{
++ struct xfrm_dump_info info;
++ struct sk_buff *skb;
++
++ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
++ if (!skb)
++ return ERR_PTR(-ENOMEM);
++
++ NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
++ info.in_skb = in_skb;
++ info.out_skb = skb;
++ info.nlmsg_seq = seq;
++ info.nlmsg_flags = 0;
++ info.this_idx = info.start_idx = 0;
++
++ if (dump_one_policy(xp, dir, 0, &info) < 0) {
++ kfree_skb(skb);
++ return NULL;
++ }
++
++ return skb;
++}
++
++static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_policy *xp;
++ struct xfrm_userpolicy_id *p;
++ int err;
++ int delete;
++
++ p = NLMSG_DATA(nlh);
++ delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
++
++ err = verify_policy_dir(p->dir);
++ if (err)
++ return err;
++
++ if (p->index)
++ xp = xfrm_policy_byid(p->dir, p->index, delete);
++ else
++ xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
++ if (xp == NULL)
++ return -ENOENT;
++
++ if (!delete) {
++ struct sk_buff *resp_skb;
++
++ resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
++ if (IS_ERR(resp_skb)) {
++ err = PTR_ERR(resp_skb);
++ } else {
++ err = netlink_unicast(xfrm_nl, resp_skb,
++ NETLINK_CB(skb).pid,
++ MSG_DONTWAIT);
++ }
++ }
++
++ xfrm_pol_put(xp);
++
++ return err;
++}
++
++static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
++
++ xfrm_state_flush(p->proto);
++ return 0;
++}
++
++static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
++{
++ xfrm_policy_flush();
++ return 0;
++}
++
++static const int xfrm_msg_min[(XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)] = {
++ NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* NEW SA */
++ NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* DEL SA */
++ NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)), /* GET SA */
++ NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* NEW POLICY */
++ NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* DEL POLICY */
++ NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)), /* GET POLICY */
++ NLMSG_LENGTH(sizeof(struct xfrm_userspi_info)), /* ALLOC SPI */
++ NLMSG_LENGTH(sizeof(struct xfrm_user_acquire)), /* ACQUIRE */
++ NLMSG_LENGTH(sizeof(struct xfrm_user_expire)), /* EXPIRE */
++ NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* UPD POLICY */
++ NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)), /* UPD SA */
++ NLMSG_LENGTH(sizeof(struct xfrm_user_polexpire)), /* POLEXPIRE */
++ NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush)), /* FLUSH SA */
++ NLMSG_LENGTH(0), /* FLUSH POLICY */
++};
++
++static struct xfrm_link {
++ int (*doit)(struct sk_buff *, struct nlmsghdr *, void **);
++ int (*dump)(struct sk_buff *, struct netlink_callback *);
++} xfrm_dispatch[] = {
++ { .doit = xfrm_add_sa, },
++ { .doit = xfrm_del_sa, },
++ {
++ .doit = xfrm_get_sa,
++ .dump = xfrm_dump_sa,
++ },
++ { .doit = xfrm_add_policy },
++ { .doit = xfrm_get_policy },
++ {
++ .doit = xfrm_get_policy,
++ .dump = xfrm_dump_policy,
++ },
++ { .doit = xfrm_alloc_userspi },
++ {},
++ {},
++ { .doit = xfrm_add_policy },
++ { .doit = xfrm_add_sa, },
++ {},
++ { .doit = xfrm_flush_sa },
++ { .doit = xfrm_flush_policy },
++};
++
++static int xfrm_done(struct netlink_callback *cb)
++{
++ return 0;
++}
++
++static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
++{
++ struct rtattr *xfrma[XFRMA_MAX];
++ struct xfrm_link *link;
++ int type, min_len;
++
++ if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
++ return 0;
++
++ type = nlh->nlmsg_type;
++
++ /* A control message: ignore them */
++ if (type < XFRM_MSG_BASE)
++ return 0;
++
++ /* Unknown message: reply with EINVAL */
++ if (type > XFRM_MSG_MAX)
++ goto err_einval;
++
++ type -= XFRM_MSG_BASE;
++ link = &xfrm_dispatch[type];
++
++ /* All operations require privileges, even GET */
++ if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
++ *errp = -EPERM;
++ return -1;
++ }
++
++ if ((type == 2 || type == 5) && (nlh->nlmsg_flags & NLM_F_DUMP)) {
++ u32 rlen;
++
++ if (link->dump == NULL)
++ goto err_einval;
++
++ if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh,
++ link->dump,
++ xfrm_done)) != 0) {
++ return -1;
++ }
++ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
++ if (rlen > skb->len)
++ rlen = skb->len;
++ skb_pull(skb, rlen);
++ return -1;
++ }
++
++ memset(xfrma, 0, sizeof(xfrma));
++
++ if (nlh->nlmsg_len < (min_len = xfrm_msg_min[type]))
++ goto err_einval;
++
++ if (nlh->nlmsg_len > min_len) {
++ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
++ struct rtattr *attr = (void *) nlh + NLMSG_ALIGN(min_len);
++
++ while (RTA_OK(attr, attrlen)) {
++ unsigned short flavor = attr->rta_type;
++ if (flavor) {
++ if (flavor > XFRMA_MAX)
++ goto err_einval;
++ xfrma[flavor - 1] = attr;
++ }
++ attr = RTA_NEXT(attr, attrlen);
++ }
++ }
++
++ if (link->doit == NULL)
++ goto err_einval;
++ *errp = link->doit(skb, nlh, (void **) &xfrma);
++
++ return *errp;
++
++err_einval:
++ *errp = -EINVAL;
++ return -1;
++}
++
++static int xfrm_user_rcv_skb(struct sk_buff *skb)
++{
++ int err;
++ struct nlmsghdr *nlh;
++
++ while (skb->len >= NLMSG_SPACE(0)) {
++ u32 rlen;
++
++ nlh = (struct nlmsghdr *) skb->data;
++ if (nlh->nlmsg_len < sizeof(*nlh) ||
++ skb->len < nlh->nlmsg_len)
++ return 0;
++ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
++ if (rlen > skb->len)
++ rlen = skb->len;
++ if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) {
++ if (err == 0)
++ return -1;
++ netlink_ack(skb, nlh, err);
++ } else if (nlh->nlmsg_flags & NLM_F_ACK)
++ netlink_ack(skb, nlh, 0);
++ skb_pull(skb, rlen);
++ }
++
++ return 0;
++}
++
++static void xfrm_netlink_rcv(struct sock *sk, int len)
++{
++ do {
++ struct sk_buff *skb;
++
++ down(&xfrm_cfg_sem);
++
++ while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
++ if (xfrm_user_rcv_skb(skb)) {
++ if (skb->len)
++ skb_queue_head(&sk->receive_queue, skb);
++ else
++ kfree_skb(skb);
++ break;
++ }
++ kfree_skb(skb);
++ }
++
++ up(&xfrm_cfg_sem);
++
++ } while (xfrm_nl && xfrm_nl->receive_queue.qlen);
++}
++
++static int build_expire(struct sk_buff *skb, struct xfrm_state *x, int hard)
++{
++ struct xfrm_user_expire *ue;
++ struct nlmsghdr *nlh;
++ unsigned char *b = skb->tail;
++
++ nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_EXPIRE,
++ sizeof(*ue));
++ ue = NLMSG_DATA(nlh);
++ nlh->nlmsg_flags = 0;
++
++ copy_to_user_state(x, &ue->state);
++ ue->hard = (hard != 0) ? 1 : 0;
++
++ nlh->nlmsg_len = skb->tail - b;
++ return skb->len;
++
++nlmsg_failure:
++ skb_trim(skb, b - skb->data);
++ return -1;
++}
++
++static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
++{
++ struct sk_buff *skb;
++
++ skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
++ if (skb == NULL)
++ return -ENOMEM;
++
++ if (build_expire(skb, x, hard) < 0)
++ BUG();
++
++ NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
++
++ return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
++}
++
++static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
++ struct xfrm_tmpl *xt, struct xfrm_policy *xp,
++ int dir)
++{
++ struct xfrm_user_acquire *ua;
++ struct nlmsghdr *nlh;
++ unsigned char *b = skb->tail;
++ __u32 seq = xfrm_get_acqseq();
++
++ nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_ACQUIRE,
++ sizeof(*ua));
++ ua = NLMSG_DATA(nlh);
++ nlh->nlmsg_flags = 0;
++
++ memcpy(&ua->id, &x->id, sizeof(ua->id));
++ memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr));
++ memcpy(&ua->sel, &x->sel, sizeof(ua->sel));
++ copy_to_user_policy(xp, &ua->policy, dir);
++ ua->aalgos = xt->aalgos;
++ ua->ealgos = xt->ealgos;
++ ua->calgos = xt->calgos;
++ ua->seq = x->km.seq = seq;
++
++ if (copy_to_user_tmpl(xp, skb) < 0)
++ goto nlmsg_failure;
++
++ nlh->nlmsg_len = skb->tail - b;
++ return skb->len;
++
++nlmsg_failure:
++ skb_trim(skb, b - skb->data);
++ return -1;
++}
++
++static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
++ struct xfrm_policy *xp, int dir)
++{
++ struct sk_buff *skb;
++ size_t len;
++
++ len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
++ len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
++ skb = alloc_skb(len, GFP_ATOMIC);
++ if (skb == NULL)
++ return -ENOMEM;
++
++ if (build_acquire(skb, x, xt, xp, dir) < 0)
++ BUG();
++
++ NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
++
++ return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
++}
++
++/* User gives us xfrm_user_policy_info followed by an array of 0
++ * or more templates.
++ */
++struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
++ u8 *data, int len, int *dir)
++{
++ struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data;
++ struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1);
++ struct xfrm_policy *xp;
++ int nr;
++
++ switch (family) {
++ case AF_INET:
++ if (opt != IP_XFRM_POLICY) {
++ *dir = -EOPNOTSUPP;
++ return NULL;
++ }
++ break;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ case AF_INET6:
++ if (opt != IPV6_XFRM_POLICY) {
++ *dir = -EOPNOTSUPP;
++ return NULL;
++ }
++ break;
++#endif
++ default:
++ *dir = -EINVAL;
++ return NULL;
++ }
++
++ *dir = -EINVAL;
++
++ if (len < sizeof(*p) ||
++ verify_newpolicy_info(p))
++ return NULL;
++
++ nr = ((len - sizeof(*p)) / sizeof(*ut));
++ if (nr > XFRM_MAX_DEPTH)
++ return NULL;
++
++ xp = xfrm_policy_alloc(GFP_KERNEL);
++ if (xp == NULL) {
++ *dir = -ENOBUFS;
++ return NULL;
++ }
++
++ copy_from_user_policy(xp, p);
++ copy_templates(xp, ut, nr);
++
++ *dir = p->dir;
++
++ return xp;
++}
++
++static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
++ int dir, int hard)
++{
++ struct xfrm_user_polexpire *upe;
++ struct nlmsghdr *nlh;
++ unsigned char *b = skb->tail;
++
++ nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe));
++ upe = NLMSG_DATA(nlh);
++ nlh->nlmsg_flags = 0;
++
++ copy_to_user_policy(xp, &upe->pol, dir);
++ if (copy_to_user_tmpl(xp, skb) < 0)
++ goto nlmsg_failure;
++ upe->hard = !!hard;
++
++ nlh->nlmsg_len = skb->tail - b;
++ return skb->len;
++
++nlmsg_failure:
++ skb_trim(skb, b - skb->data);
++ return -1;
++}
++
++static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
++{
++ struct sk_buff *skb;
++ size_t len;
++
++ len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
++ len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
++ skb = alloc_skb(len, GFP_ATOMIC);
++ if (skb == NULL)
++ return -ENOMEM;
++
++ if (build_polexpire(skb, xp, dir, hard) < 0)
++ BUG();
++
++ NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
++
++ return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
++}
++
++static struct xfrm_mgr netlink_mgr = {
++ .id = "netlink",
++ .notify = xfrm_send_state_notify,
++ .acquire = xfrm_send_acquire,
++ .compile_policy = xfrm_compile_policy,
++ .notify_policy = xfrm_send_policy_notify,
++};
++
++static int __init xfrm_user_init(void)
++{
++ printk(KERN_INFO "Initializing IPsec netlink socket\n");
++
++ xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
++ if (xfrm_nl == NULL)
++ return -ENOMEM;
++
++ xfrm_register_km(&netlink_mgr);
++
++ return 0;
++}
++
++static void __exit xfrm_user_exit(void)
++{
++ xfrm_unregister_km(&netlink_mgr);
++ sock_release(xfrm_nl->socket);
++}
++
++module_init(xfrm_user_init);
++module_exit(xfrm_user_exit);
++MODULE_LICENSE("GPL");
+diff -Nru a/scripts/tkgen.c b/scripts/tkgen.c
+--- a/scripts/tkgen.c 2005-02-13 21:25:09 +11:00
++++ b/scripts/tkgen.c 2005-02-13 21:25:09 +11:00
+@@ -546,7 +546,7 @@
+ printf( "set %s [expr $%s&15]",
+ vartable[cfg->nameindex].name, vartable[cfg->nameindex].name );
+ printf( "} else {");
+- printf( "set %s [expr $%s|16]}\n",
++ printf( "set %s [expr $%s]}\n",
+ vartable[cfg->nameindex].name, vartable[cfg->nameindex].name );
+ break;
+
+@@ -612,7 +612,7 @@
+ /*
+ * Clear the disable bit to enable the correct radiobutton.
+ */
+- printf( "set %s [expr $%s|16]}\n",
++ printf( "set %s [expr $%s]}\n",
+ vartable[cfg->nameindex].name, vartable[cfg->nameindex].name );
+ break;
+
Modified: trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/series/2.4.29-1
===================================================================
--- trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/series/2.4.29-1 2005-02-14 17:51:54 UTC (rev 2486)
+++ trunk/kernel-2.4/source/kernel-source-2.4.29-2.4.29/debian/patches/series/2.4.29-1 2005-02-15 00:36:04 UTC (rev 2487)
@@ -89,3 +89,4 @@
+ 092_sparc64_hme_lockup.diff
+ 095_sparc32_initrd_memcpy.diff
+ 096_megaraid2_proc_name.diff
++ 097_ipsec.diff
More information about the Kernel-svn-changes
mailing list