[kernel] r13721 - in dists/trunk/linux-2.6/debian: . patches/features/arm patches/series
Martin Michlmayr
tbm at alioth.debian.org
Wed Jun 3 20:58:26 UTC 2009
Author: tbm
Date: Wed Jun 3 20:58:24 2009
New Revision: 13721
Log:
Add copy_to_user/clear_user optimization patches from Marvell
Added:
dists/trunk/linux-2.6/debian/patches/features/arm/copy_to_user-better_threshold.patch
dists/trunk/linux-2.6/debian/patches/features/arm/lower_overhead_with_alternative.patch
Modified:
dists/trunk/linux-2.6/debian/changelog
dists/trunk/linux-2.6/debian/patches/series/base
Modified: dists/trunk/linux-2.6/debian/changelog
==============================================================================
--- dists/trunk/linux-2.6/debian/changelog Wed Jun 3 20:53:42 2009 (r13720)
+++ dists/trunk/linux-2.6/debian/changelog Wed Jun 3 20:58:24 2009 (r13721)
@@ -45,6 +45,9 @@
these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD.
* [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and
BROADCOM_PHY.
+ * Add patches from git.marvell.com:
+ - alternative copy_to_user: more precise fallback threshold
+ - lower overhead with alternative copy_to_user for small copies
[ Aurelien Jarno ]
* [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed
Added: dists/trunk/linux-2.6/debian/patches/features/arm/copy_to_user-better_threshold.patch
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ dists/trunk/linux-2.6/debian/patches/features/arm/copy_to_user-better_threshold.patch Wed Jun 3 20:58:24 2009 (r13721)
@@ -0,0 +1,121 @@
+From: Nicolas Pitre <nico at cam.org>
+Date: Sat, 30 May 2009 01:55:50 +0000 (-0400)
+Subject: [ARM] alternative copy_to_user: more precise fallback threshold
+X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55
+
+[ARM] alternative copy_to_user: more precise fallback threshold
+
+Previous size thresholds were guessed from various user space benchmarks
+using a kernel with and without the alternative uaccess option. This
+is however not as precise as a kernel based test to measure the real
+speed of each method.
+
+This adds a simple test bench to show the time needed for each method.
+With this, the optimal size treshold for the alternative implementation
+can be determined with more confidence. It appears that the optimal
+threshold for both copy_to_user and clear_user is around 64 bytes. This
+is not a surprise knowing that the memcpy and memset implementations
+need at least 64 bytes to achieve maximum throughput.
+
+One might suggest that such test be used to determine the optimal
+threshold at run time instead, but results are near enough to 64 on
+tested targets concerned by this alternative copy_to_user implementation,
+so adding some overhead associated with a variable threshold is probably
+not worth it for now.
+
+Signed-off-by: Nicolas Pitre <nico at marvell.com>
+---
+
+diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
+index 92838e7..6b967ff 100644
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
+ * With frame pointer disabled, tail call optimization kicks in
+ * as well making this test almost invisible.
+ */
+- if (n < 1024)
++ if (n < 64)
+ return __copy_to_user_std(to, from, n);
+ return __copy_to_user_memcpy(to, from, n);
+ }
+@@ -151,7 +151,78 @@ out:
+ unsigned long __clear_user(void __user *addr, unsigned long n)
+ {
+ /* See rational for this in __copy_to_user() above. */
+- if (n < 256)
++ if (n < 64)
+ return __clear_user_std(addr, n);
+ return __clear_user_memset(addr, n);
+ }
++
++#if 0
++
++/*
++ * This code is disabled by default, but kept around in case the chosen
++ * thresholds need to be revalidated. Some overhead (small but still)
++ * would be implied by a runtime determined variable threshold, and
++ * so far the measurement on concerned targets didn't show a worthwhile
++ * variation.
++ *
++ * Note that a fairly precise sched_clock() implementation is needed
++ * for results to make some sense.
++ */
++
++#include <linux/vmalloc.h>
++
++static int __init test_size_treshold(void)
++{
++ struct page *src_page, *dst_page;
++ void *user_ptr, *kernel_ptr;
++ unsigned long long t0, t1, t2;
++ int size, ret;
++
++ ret = -ENOMEM;
++ src_page = alloc_page(GFP_KERNEL);
++ if (!src_page)
++ goto no_src;
++ dst_page = alloc_page(GFP_KERNEL);
++ if (!dst_page)
++ goto no_dst;
++ kernel_ptr = page_address(src_page);
++ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
++ if (!user_ptr)
++ goto no_vmap;
++
++ /* warm up the src page dcache */
++ ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
++
++ for (size = PAGE_SIZE; size >= 4; size /= 2) {
++ t0 = sched_clock();
++ ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
++ t1 = sched_clock();
++ ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
++ t2 = sched_clock();
++ printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
++ }
++
++ for (size = PAGE_SIZE; size >= 4; size /= 2) {
++ t0 = sched_clock();
++ ret |= __clear_user_memset(user_ptr, size);
++ t1 = sched_clock();
++ ret |= __clear_user_std(user_ptr, size);
++ t2 = sched_clock();
++ printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
++ }
++
++ if (ret)
++ ret = -EFAULT;
++
++ vunmap(user_ptr);
++no_vmap:
++ put_page(dst_page);
++no_dst:
++ put_page(src_page);
++no_src:
++ return ret;
++}
++
++subsys_initcall(test_size_treshold);
++
++#endif
Added: dists/trunk/linux-2.6/debian/patches/features/arm/lower_overhead_with_alternative.patch
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ dists/trunk/linux-2.6/debian/patches/features/arm/lower_overhead_with_alternative.patch Wed Jun 3 20:58:24 2009 (r13721)
@@ -0,0 +1,88 @@
+From: Nicolas Pitre <nico at cam.org>
+Date: Fri, 22 May 2009 02:17:17 +0000 (-0400)
+Subject: [ARM] lower overhead with alternative copy_to_user for small copies
+X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047
+
+[ARM] lower overhead with alternative copy_to_user for small copies
+
+Because the alternate copy_to_user implementation has a higher setup cost
+than the standard implementation, the size of the memory area to copy
+is tested and the standard implementation invoked instead when that size
+is too small. Still, that test is made after the processor has preserved
+a bunch of registers on the stack which have to be reloaded right away
+needlessly in that case, causing a measurable performance regression
+compared to plain usage of the standard implementation only.
+
+To make the size test overhead negligible, let's factorize it out of
+the alternate copy_to_user function where it is clear to the compiler
+that no stack frame is needed. Thanks to CONFIG_ARM_UNWIND allowing
+for frame pointers to be disabled and tail call optimization to kick in,
+the overhead in the small copy case becomes only 3 assembly instructions.
+
+A similar trick is applied to clear_user as well.
+
+Signed-off-by: Nicolas Pitre <nico at marvell.com>
+---
+
+diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
+index bf987b4..92838e7 100644
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ return 1;
+ }
+
+-unsigned long
+-__copy_to_user(void __user *to, const void *from, unsigned long n)
++static unsigned long noinline
++__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ int atomic;
+
+- if (n < 1024)
+- return __copy_to_user_std(to, from, n);
+-
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+ memcpy((void *)to, from, n);
+ return 0;
+@@ -99,11 +96,24 @@ out:
+ return n;
+ }
+
+-unsigned long __clear_user(void __user *addr, unsigned long n)
++unsigned long
++__copy_to_user(void __user *to, const void *from, unsigned long n)
++{
++ /*
++ * This test is stubbed out of the main function above to keep
++ * the overhead for small copies low by avoiding a large
++ * register dump on the stack just to reload them right away.
++ * With frame pointer disabled, tail call optimization kicks in
++ * as well making this test almost invisible.
++ */
++ if (n < 1024)
++ return __copy_to_user_std(to, from, n);
++ return __copy_to_user_memcpy(to, from, n);
++}
++
++static unsigned long noinline
++__clear_user_memset(void __user *addr, unsigned long n)
+ {
+- if (n < 256)
+- return __clear_user_std(addr, n);
+-
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+ memset((void *)addr, 0, n);
+ return 0;
+@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n)
+ out:
+ return n;
+ }
++
++unsigned long __clear_user(void __user *addr, unsigned long n)
++{
++ /* See rational for this in __copy_to_user() above. */
++ if (n < 256)
++ return __clear_user_std(addr, n);
++ return __clear_user_memset(addr, n);
++}
Modified: dists/trunk/linux-2.6/debian/patches/series/base
==============================================================================
--- dists/trunk/linux-2.6/debian/patches/series/base Wed Jun 3 20:53:42 2009 (r13720)
+++ dists/trunk/linux-2.6/debian/patches/series/base Wed Jun 3 20:58:24 2009 (r13721)
@@ -27,6 +27,8 @@
#+ features/sparc/video-sunxvr500-intergraph.patch
+ features/arm/allow-alternative-copy-user.patch
+ features/arm/alternative-copy-user.patch
++ features/arm/lower_overhead_with_alternative.patch
++ features/arm/copy_to_user-better_threshold.patch
+ bugfix/all/mvsdio-platform.patch
+ bugfix/all/mvsdio-ignore-high-speed.patch
+ bugfix/all/mvsdio-config-failure.patch
More information about the Kernel-svn-changes
mailing list