[Pkg-lustre-svn-commit] updated: [9047010] Rediffing patches to make them work on 1.6.7:
Patrick Winnertz
winnie@debian.org
Thu Mar 12 10:27:49 UTC 2009
The following commit has been merged in the master branch:
commit 90470105bc6a388bd2e517713a84f183d61f6b30
Author: Patrick Winnertz <winnie@debian.org>
Date: Thu Mar 12 10:34:21 2009 +0100
Rediffing patches to make them work on 1.6.7:
- posix_acl.dpatch
- remove-set_tunables.dpatch
- enable-quota.dpatch
- removed the never-used patchless_support patches in the subdirectory
- used the new patchless-support patch from Bugzilla
Signed-off-by: Patrick Winnertz <winnie@debian.org>
diff --git a/debian/patches/00list b/debian/patches/00list
index d9f5cdc..4318389 100644
--- a/debian/patches/00list
+++ b/debian/patches/00list
@@ -10,22 +10,7 @@ no-darwin.dpatch
remove-set_tunables.dpatch
libsysio.dpatch
bug12769-ql-fix.dpatch
-#patchless_support/configure_for_HEAD.dpatch
-#patchless_support/fix_configure_RO_cache.dpatch
-#patchless_support/fix_nfs_fid_type.dpatch
-#patchless_support/fix_request_module_calls.dpatch
-#patchless_support/lustre_loop_devices_adaption.dpatch
-#patchless_support/nfs_changes_new_API.dpatch
-#patchless_support/sysctl_update.dpatch
-#patchless_support/configure_tests_2.6.27.dpatch
-#patchless_support/fix_mmap.dpatch
-#patchless_support/fix_path_API_changes.dpatch
-#patchless_support/lprocfs_changes.dpatch
-#patchless_support/new_page_fault_method.dpatch
-#patchless_support/splice_read_support.dpatch
-#patchless_support_2.6.24_lnet_part.dpatch
-#patchless_support_2.6.24_configure_part.dpatch
-patchless_support_2.6.24.dpatch
+patchless_support_2.6.26.dpatch
#server_support_2.6.27.dpatch
# Debian patches
bash_completion.dpatch
diff --git a/debian/patches/enable-quota.dpatch b/debian/patches/enable-quota.dpatch
index ddcf06a..9ed249a 100755
--- a/debian/patches/enable-quota.dpatch
+++ b/debian/patches/enable-quota.dpatch
@@ -4,9 +4,9 @@
## DP: --enable-quota check was only run when building modules.
@DPATCH@
-diff -urNad lustre-1.6.6~/configure.ac lustre-1.6.6/configure.ac
---- lustre-1.6.6~/configure.ac 2008-11-26 13:32:11.000000000 +0100
-+++ lustre-1.6.6/configure.ac 2008-11-26 13:37:27.000000000 +0100
+diff -urNad lustre~/configure.ac lustre/configure.ac
+--- lustre~/configure.ac 2009-03-12 10:32:27.000000000 +0100
++++ lustre/configure.ac 2009-03-12 11:19:53.000000000 +0100
@@ -8,6 +8,7 @@
LB_CHECK_VERSION
@@ -15,35 +15,10 @@ diff -urNad lustre-1.6.6~/configure.ac lustre-1.6.6/configure.ac
AC_CANONICAL_SYSTEM
-diff -urNad lustre-1.6.6~/lustre/autoconf/lustre-core.m4 lustre-1.6.6/lustre/autoconf/lustre-core.m4
---- lustre-1.6.6~/lustre/autoconf/lustre-core.m4 2008-11-26 13:37:22.000000000 +0100
-+++ lustre-1.6.6/lustre/autoconf/lustre-core.m4 2008-11-26 13:38:08.000000000 +0100
-@@ -1676,24 +1676,9 @@
- AC_HELP_STRING([--enable-quota],
- [enable quota support]),
- [],[enable_quota='default'])
--if test x$linux25 != xyes; then
-- enable_quota='no'
--fi
- LB_LINUX_CONFIG([QUOTA],[
-- if test x$enable_quota = xdefault; then
- enable_quota='yes'
-- fi
- ],[
-- if test x$enable_quota = xdefault; then
-- enable_quota='no'
-- AC_MSG_WARN([quota is not enabled because the kernel lacks quota
-- support])
-- else
-- if test x$enable_quota = xyes; then
-- AC_MSG_ERROR([cannot enable quota because the kernel lac
--ks quota support])
-- fi
-- fi
- ])
- if test x$enable_quota != xno; then
- AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
-@@ -1715,6 +1700,7 @@
+diff -urNad lustre~/lustre/autoconf/lustre-core.m4 lustre/lustre/autoconf/lustre-core.m4
+--- lustre~/lustre/autoconf/lustre-core.m4 2009-03-12 11:19:52.000000000 +0100
++++ lustre/lustre/autoconf/lustre-core.m4 2009-03-12 11:19:53.000000000 +0100
+@@ -1813,6 +1813,7 @@
])
])
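
For context, the AC_DEFINE in the hunk above lands in the autoconf-generated config header, and the quota code paths are gated on it at compile time. A minimal sketch of that consumer side (the function name is hypothetical, and HAVE_QUOTA_SUPPORT is assumed to arrive via the generated config header):

    #include <linux/errno.h>

    static int my_quota_setup(void)
    {
    #ifdef HAVE_QUOTA_SUPPORT
            /* configure found kernel quota support: wire up quota hooks */
            return 0;
    #else
            /* quota was disabled at configure time */
            return -EOPNOTSUPP;
    #endif
    }
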
diff --git a/debian/patches/patchless_support/configure_for_HEAD.dpatch b/debian/patches/patchless_support/configure_for_HEAD.dpatch
deleted file mode 100755
index d4eae95..0000000
--- a/debian/patches/patchless_support/configure_for_HEAD.dpatch
+++ /dev/null
@@ -1,329 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-diff -urNad lustre~/lustre/autoconf/lustre-core.m4 lustre/lustre/autoconf/lustre-core.m4
---- lustre~/lustre/autoconf/lustre-core.m4 2008-11-25 13:59:37.000000000 +0100
-+++ lustre/lustre/autoconf/lustre-core.m4 2008-12-22 10:00:57.000000000 +0100
-@@ -1109,15 +1109,20 @@
- AC_DEFUN([LC_PAGE_CHECKED],
- [AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked])
- LB_LINUX_TRY_COMPILE([
-- #include <linux/mm.h>
-- #include <linux/page-flags.h>
-+ #include <linux/autoconf.h>
-+#ifdef HAVE_LINUX_MMTYPES_H
-+ #include <linux/mm_types.h>
-+#endif
-+ #include <linux/page-flags.h>
- ],[
-- #ifndef PageChecked
-- #error PageChecked not defined in kernel
-- #endif
-- #ifndef SetPageChecked
-- #error SetPageChecked not defined in kernel
-- #endif
-+ struct page *p;
-+
-+ /* before 2.6.26 this define*/
-+ #ifndef PageChecked
-+ /* 2.6.26 use function instead of define for it */
-+ SetPageChecked(p);
-+ PageChecked(p);
-+ #endif
- ],[
- AC_MSG_RESULT(yes)
- AC_DEFINE(HAVE_PAGE_CHECKED, 1,
-@@ -1271,11 +1276,149 @@
-
- # 2.6.23 extract nfs export related data into exportfs.h
- AC_DEFUN([LC_HAVE_EXPORTFS_H],
--[
--tmpfl="$CFLAGS"
--CFLAGS="$CFLAGS -I$LINUX_OBJ/include"
--AC_CHECK_HEADERS([linux/exportfs.h])
--CFLAGS="$tmpfl"
-+[LB_CHECK_FILE([$LINUX/include/linux/exportfs.h], [
-+ AC_DEFINE(HAVE_LINUX_EXPORTFS_H, 1,
-+ [kernel has include/exportfs.h])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.23 have new page fault handling API
-+AC_DEFUN([LC_VM_OP_FAULT],
-+[AC_MSG_CHECKING([if kernel has .fault in vm_operation_struct])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/mm.h>
-+],[
-+ struct vm_operations_struct op;
-+
-+ op.fault = NULL;
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_VM_OP_FAULT, 1,
-+ [if kernel has .fault in vm_operation_struct])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+#2.6.23 has new shrinker API
-+AC_DEFUN([LC_REGISTER_SHRINKER],
-+[AC_MSG_CHECKING([if kernel has register_shrinker])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/mm.h>
-+],[
-+ register_shrinker(NULL);
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_REGISTER_SHRINKER, 1,
-+ [if kernel has register_shrinker])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.24 has bio_endio with 2 args
-+AC_DEFUN([LC_BIO_ENDIO_2ARG],
-+[AC_MSG_CHECKING([if kernel has bio_endio with 2 args])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/bio.h>
-+],[
-+ bio_endio(NULL, 0);
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_BIO_ENDIO_2ARG, 1,
-+ [if kernel has bio_endio with 2 args])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.24 has new members in exports struct.
-+AC_DEFUN([LC_FH_TO_DENTRY],
-+[AC_MSG_CHECKING([if kernel has .fh_to_dentry member in export_operations struct])
-+LB_LINUX_TRY_COMPILE([
-+#ifdef HAVE_LINUX_EXPORTFS_H
-+ #include <linux/exportfs.h>
-+#else
-+ #include <linux/fs.h>
-+#endif
-+],[
-+ struct export_operations exp;
-+
-+ exp.fh_to_dentry = NULL;
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_FH_TO_DENTRY, 1,
-+ [kernel has .fh_to_dentry member in export_operations struct])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.24 need linux/mm_types.h included
-+AC_DEFUN([LC_HAVE_MMTYPES_H],
-+[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [
-+ AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1,
-+ [kernel has include/mm_types.h])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.24 remove long aged procfs entry -> deleted member
-+AC_DEFUN([LC_PROCFS_DELETED],
-+[AC_MSG_CHECKING([if kernel has deleted member in procfs entry struct])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/proc_fs.h>
-+],[
-+ struct proc_dir_entry pde;
-+
-+ pde.deleted = NULL;
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_PROCFS_DELETED, 1,
-+ [kernel has deleted member in procfs entry struct])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.26 isn't export set_fs_pwd and change paramter in fs struct
-+AC_DEFUN([LC_FS_STRUCT_USE_PATH],
-+[AC_MSG_CHECKING([fs_struct use path structure])
-+LB_LINUX_TRY_COMPILE([
-+ #include <asm/atomic.h>
-+ #include <linux/spinlock.h>
-+ #include <linux/fs_struct.h>
-+],[
-+ struct path path;
-+ struct fs_struct fs;
-+
-+ fs.pwd = path;
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_FS_STRUCT_USE_PATH, 1,
-+ [fs_struct use path structure])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.26 remove path_release and use path_put instead
-+AC_DEFUN([LC_PATH_RELEASE],
-+[AC_MSG_CHECKING([if path_release exist])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/dcache.h>
-+ #include <linux/namei.h>
-+],[
-+ path_release(NULL);
-+],[
-+ AC_DEFINE(HAVE_PATH_RELEASE, 1, [path_release exist])
-+ AC_MSG_RESULT([yes])
-+],[
-+ AC_MSG_RESULT([no])
-+])
- ])
-
- #
-@@ -1370,13 +1513,27 @@
- # raid5-zerocopy patch
- LC_PAGE_CONSTANT
-
-- # 2.6.22
-+ # 2.6.22
- LC_INVALIDATE_BDEV_2ARG
- LC_FS_RENAME_DOES_D_MOVE
-- # 2.6.23
-+ # 2.6.23
- LC_UNREGISTER_BLKDEV_RETURN_INT
- LC_KERNEL_SPLICE_READ
- LC_HAVE_EXPORTFS_H
-+ LC_VM_OP_FAULT
-+ LC_REGISTER_SHRINKER
-+
-+ # 2.6.24
-+ LC_HAVE_MMTYPES_H
-+ LC_BIO_ENDIO_2ARG
-+ LC_FH_TO_DENTRY
-+ LC_PROCFS_DELETED
-+
-+ # 2.6.26
-+ LC_FS_STRUCT_USE_PATH
-+ LC_RCU_LIST_SAFE
-+ LC_PATH_RELEASE
-+
- ])
-
- #
-@@ -1609,6 +1766,7 @@
- ],[
- AC_MSG_RESULT([no])
- ])
-+
- ],[
- AC_MSG_RESULT([no])
- ])
-diff -urNad lustre~/lustre/include/linux/lustre_compat25.h lustre/lustre/include/linux/lustre_compat25.h
---- lustre~/lustre/include/linux/lustre_compat25.h 2008-11-25 13:59:37.000000000 +0100
-+++ lustre/lustre/include/linux/lustre_compat25.h 2008-12-22 10:02:32.000000000 +0100
-@@ -57,6 +57,28 @@
- #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) */
-
- #ifndef HAVE_SET_FS_PWD
-+
-+#ifdef HAVE_FS_STRUCT_USE_PATH
-+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
-+ struct dentry *dentry)
-+{
-+ struct path path;
-+ struct path old_pwd;
-+
-+ path.mnt = mnt;
-+ path.dentry = dentry;
-+ write_lock(&fs->lock);
-+ old_pwd = fs->pwd;
-+ path_get(&path);
-+ fs->pwd = path;
-+ write_unlock(&fs->lock);
-+
-+ if (old_pwd.dentry)
-+ path_put(&old_pwd);
-+}
-+
-+#else
-+
- static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
- struct dentry *dentry)
- {
-@@ -75,6 +97,7 @@
- mntput(old_pwdmnt);
- }
- }
-+#endif
- #else
- #define ll_set_fs_pwd set_fs_pwd
- #endif /* HAVE_SET_FS_PWD */
-@@ -590,5 +613,56 @@
- vfs_rename(old,old_dir,new,new_dir)
- #endif
-
-+#ifdef HAVE_REGISTER_SHRINKER
-+typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask);
-+
-+static inline
-+struct shrinker *set_shrinker(int seek, shrinker_t func)
-+{
-+ struct shrinker *s;
-+
-+ s = kmalloc(sizeof(*s), GFP_KERNEL);
-+ if (s == NULL)
-+ return (NULL);
-+
-+ s->shrink = func;
-+ s->seeks = seek;
-+
-+ register_shrinker(s);
-+
-+ return s;
-+}
-+
-+static inline
-+void remove_shrinker(struct shrinker *shrinker)
-+{
-+ if (shrinker == NULL)
-+ return;
-+
-+ unregister_shrinker(shrinker);
-+ kfree(shrinker);
-+}
-+#endif
-+
-+#ifdef HAVE_BIO_ENDIO_2ARG
-+#define cfs_bio_io_error(a,b) bio_io_error((a))
-+#define cfs_bio_endio(a,b,c) bio_endio((a),(c))
-+#else
-+#define cfs_bio_io_error(a,b) bio_io_error((a),(b))
-+#define cfs_bio_endio(a,b,c) bio_endio((a),(b),(c))
-+#endif
-+
-+#ifdef HAVE_FS_STRUCT_USE_PATH
-+#define cfs_fs_pwd(fs) ((fs)->pwd.dentry)
-+#define cfs_fs_mnt(fs) ((fs)->pwd.mnt)
-+#else
-+#define cfs_fs_pwd(fs) ((fs)->pwd)
-+#define cfs_fs_mnt(fs) ((fs)->pwdmnt)
-+#endif
-+
-+#ifndef list_for_each_safe_rcu
-+#define list_for_each_safe_rcu(a,b,c) list_for_each_rcu(b, c)
-+#endif
-+
- #endif /* __KERNEL__ */
- #endif /* _COMPAT25_H */
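
The deleted file above did two things: added 2.6.23-2.6.26 feature probes to lustre-core.m4, and grew lustre_compat25.h compat shims (a path-based ll_set_fs_pwd(), set_shrinker()/remove_shrinker() emulation on top of the 2.6.23 register_shrinker() API, and the cfs_bio_endio()/cfs_fs_pwd() macros). A minimal caller sketch for the shrinker shim, assuming the HAVE_* defines from those probes; the my_* names are hypothetical:

    #include <linux/errno.h>
    #include <linux/mm.h>                   /* DEFAULT_SEEKS */
    #include <linux/lustre_compat25.h>      /* set_shrinker()/remove_shrinker() */

    /* old-style shrink callback: scan/report cached objects */
    static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
    {
            return 0;                       /* nothing cached in this sketch */
    }

    static struct shrinker *my_shrinker;

    static int my_cache_init(void)
    {
            /* on 2.6.23+ the wrapper kmallocs a struct shrinker and calls
             * register_shrinker(); on older kernels set_shrinker() is native */
            my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
            return my_shrinker != NULL ? 0 : -ENOMEM;
    }

    static void my_cache_fini(void)
    {
            remove_shrinker(my_shrinker);   /* unregister and kfree */
    }
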
diff --git a/debian/patches/patchless_support/configure_tests_2.6.27.dpatch b/debian/patches/patchless_support/configure_tests_2.6.27.dpatch
deleted file mode 100755
index ab89906..0000000
--- a/debian/patches/patchless_support/configure_tests_2.6.27.dpatch
+++ /dev/null
@@ -1,461 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/libcfs/autoconf/lustre-libcfs.m4
-===================================================================
---- HEAD.orig/libcfs/autoconf/lustre-libcfs.m4 2008-11-20 14:33:51.000000000 +0200
-+++ HEAD/libcfs/autoconf/lustre-libcfs.m4 2008-12-17 17:15:38.000000000 +0200
-@@ -364,7 +364,7 @@
- ])
-
- # 2.6.24 request not use real numbers for ctl_name
--AC_DEFUN([LN_SYSCTL_UNNUMBERED],
-+AC_DEFUN([LIBCFS_SYSCTL_UNNUMBERED],
- [AC_MSG_CHECKING([for CTL_UNNUMBERED])
- LB_LINUX_TRY_COMPILE([
- #include <linux/sysctl.h>
-@@ -382,7 +382,7 @@
- ])
-
- # 2.6.24 lost scatterlist->page
--AC_DEFUN([LN_SCATTERLIST_SETPAGE],
-+AC_DEFUN([LIBCFS_SCATTERLIST_SETPAGE],
- [AC_MSG_CHECKING([for exist sg_set_page])
- LB_LINUX_TRY_COMPILE([
- #include <linux/scatterlist.h>
-@@ -398,7 +398,7 @@
- ])
-
- # 2.6.26 use int instead of atomic for sem.count
--AC_DEFUN([LN_SEM_COUNT],
-+AC_DEFUN([LIBCFS_SEM_COUNT],
- [AC_MSG_CHECKING([atomic sem.count])
- LB_LINUX_TRY_COMPILE([
- #include <asm/semaphore.h>
-@@ -415,6 +415,21 @@
- ])
- ])
-
-+# 2.6.27 have second argument to sock_map_fd
-+AC_DEFUN([LIBCFS_SOCK_MAP_FD_2ARG],
-+[AC_MSG_CHECKING([sock_map_fd have second argument])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/net.h>
-+],[
-+ sock_map_fd(NULL, 0);
-+],[
-+ AC_MSG_RESULT(yes)
-+ AC_DEFINE(HAVE_SOCK_MAP_FD_2ARG, 1,
-+ [sock_map_fd have second argument])
-+],[
-+ AC_MSG_RESULT(NO)
-+])
-+])
-
- #
- # LIBCFS_PROG_LINUX
-@@ -447,10 +462,12 @@
- # 2.6.23
- LIBCFS_KMEM_CACHE_CREATE_DTOR
- # 2.6.24
--LN_SYSCTL_UNNUMBERED
--LN_SCATTERLIST_SETPAGE
-+LIBCFS_SYSCTL_UNNUMBERED
-+LIBCFS_SCATTERLIST_SETPAGE
- # 2.6.26
--LN_SEM_COUNT
-+LIBCFS_SEM_COUNT
-+# 2.6.27
-+LIBCFS_SOCK_MAP_FD_2ARG
- ])
-
- #
-Index: HEAD/libcfs/libcfs/linux/linux-tcpip.c
-===================================================================
---- HEAD.orig/libcfs/libcfs/linux/linux-tcpip.c 2008-08-07 20:22:50.000000000 +0300
-+++ HEAD/libcfs/libcfs/linux/linux-tcpip.c 2008-12-17 17:15:38.000000000 +0200
-@@ -63,7 +63,11 @@
- return rc;
- }
-
-+#ifdef HAVE_SOCK_MAP_FD_2ARG
-+ fd = sock_map_fd(sock,0);
-+#else
- fd = sock_map_fd(sock);
-+#endif
- if (fd < 0) {
- rc = fd;
- sock_release(sock);
-Index: HEAD/lustre/autoconf/lustre-core.m4
-===================================================================
---- HEAD.orig/lustre/autoconf/lustre-core.m4 2008-12-17 17:15:38.000000000 +0200
-+++ HEAD/lustre/autoconf/lustre-core.m4 2008-12-17 17:15:38.000000000 +0200
-@@ -1727,6 +1727,56 @@
- ])
- ])
-
-+#2.6.27
-+AC_DEFUN([LC_INODE_PERMISION_2ARGS],
-+[AC_MSG_CHECKING([inode_operations->permission have two args])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/fs.h>
-+],[
-+ struct inode *inode;
-+
-+ inode->i_op->permission(NULL,0);
-+],[
-+ AC_DEFINE(HAVE_INODE_PERMISION_2ARGS, 1,
-+ [inode_operations->permission have two args])
-+ AC_MSG_RESULT([yes])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.27 have file_remove_suid instead of remove_suid
-+AC_DEFUN([LC_FILE_REMOVE_SUID],
-+[AC_MSG_CHECKING([kernel have file_remove_suid])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/fs.h>
-+],[
-+ file_remove_suid(NULL);
-+],[
-+ AC_DEFINE(HAVE_FILE_REMOVE_SUID, 1,
-+ [kernel have file_remove_suid])
-+ AC_MSG_RESULT([yes])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.27 have new page locking API
-+AC_DEFUN([LC_TRYLOCKPAGE],
-+[AC_MSG_CHECKING([kernel use trylock_page for page lock])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/pagemap.h>
-+],[
-+ trylock_page(NULL);
-+],[
-+ AC_DEFINE(HAVE_TRYLOCK_PAGE, 1,
-+ [kernel use trylock_page for page lock])
-+ AC_MSG_RESULT([yes])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
- #
- # LC_PROG_LINUX
- #
-@@ -1818,9 +1868,9 @@
- LC_INVALIDATEPAGE_RETURN_INT
- LC_UMOUNTBEGIN_HAS_VFSMOUNT
- LC_SEQ_LOCK
-+ LC_EXPORT_FILEMAP_FDATAWRITE_RANGE
- if test x$enable_server = xyes ; then
-- LC_EXPORT_INVALIDATE_MAPPING_PAGES
-- LC_EXPORT_FILEMAP_FDATAWRITE_RANGE
-+ LC_EXPORT_INVALIDATE_MAPPING_PAGES
- fi
-
- #2.6.18 + RHEL5 (fc6)
-@@ -1863,6 +1913,11 @@
- LC_FS_STRUCT_USE_PATH
- LC_RCU_LIST_SAFE
- LC_PATH_RELEASE
-+
-+ # 2.6.27
-+ LC_INODE_PERMISION_2ARGS
-+ LC_FILE_REMOVE_SUID
-+ LC_TRYLOCKPAGE
- ])
-
- #
-Index: HEAD/lustre/obdclass/lustre_handles.c
-===================================================================
---- HEAD.orig/lustre/obdclass/lustre_handles.c 2008-08-07 20:23:44.000000000 +0300
-+++ HEAD/lustre/obdclass/lustre_handles.c 2008-12-17 17:15:38.000000000 +0200
-@@ -246,7 +246,7 @@
- int i;
-
- for (i = 0; i < HANDLE_HASH_SIZE; i++) {
-- struct list_head *tmp, *pos;
-+ struct list_head *tmp = NULL , *pos;
- spin_lock(&handle_hash[i].lock);
- list_for_each_safe_rcu(tmp, pos, &(handle_hash[i].head)) {
- struct portals_handle *h;
-Index: HEAD/lustre/obdclass/capa.c
-===================================================================
---- HEAD.orig/lustre/obdclass/capa.c 2008-11-17 11:36:44.000000000 +0200
-+++ HEAD/lustre/obdclass/capa.c 2008-12-17 17:15:38.000000000 +0200
-@@ -246,11 +246,7 @@
- struct ll_crypto_hash *tfm;
- struct capa_hmac_alg *alg;
- int keylen;
-- struct scatterlist sl = {
-- .page = virt_to_page(capa),
-- .offset = (unsigned long)(capa) % CFS_PAGE_SIZE,
-- .length = offsetof(struct lustre_capa, lc_hmac),
-- };
-+ struct scatterlist sl;
-
- if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
- CERROR("unknown capability hmac algorithm!\n");
-@@ -267,6 +263,10 @@
- }
- keylen = alg->ha_keylen;
-
-+ sg_set_page(&sl, virt_to_page(capa),
-+ offsetof(struct lustre_capa, lc_hmac),
-+ (unsigned long)(capa) % CFS_PAGE_SIZE);
-+
- ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
- ll_crypto_free_hash(tfm);
-
-@@ -276,16 +276,8 @@
- int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
- {
- struct ll_crypto_cipher *tfm;
-- struct scatterlist sd = {
-- .page = virt_to_page(d),
-- .offset = (unsigned long)(d) % CFS_PAGE_SIZE,
-- .length = 16,
-- };
-- struct scatterlist ss = {
-- .page = virt_to_page(s),
-- .offset = (unsigned long)(s) % CFS_PAGE_SIZE,
-- .length = 16,
-- };
-+ struct scatterlist sd;
-+ struct scatterlist ss;
- struct blkcipher_desc desc;
- unsigned int min;
- int rc;
-@@ -309,6 +301,11 @@
- GOTO(out, rc);
- }
-
-+ sg_set_page(&sd, virt_to_page(d), 16,
-+ (unsigned long)(d) % CFS_PAGE_SIZE);
-+
-+ sg_set_page(&ss, virt_to_page(s), 16,
-+ (unsigned long)(s) % CFS_PAGE_SIZE);
- desc.tfm = tfm;
- desc.info = NULL;
- desc.flags = 0;
-@@ -328,16 +325,8 @@
- int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
- {
- struct ll_crypto_cipher *tfm;
-- struct scatterlist sd = {
-- .page = virt_to_page(d),
-- .offset = (unsigned long)(d) % CFS_PAGE_SIZE,
-- .length = 16,
-- };
-- struct scatterlist ss = {
-- .page = virt_to_page(s),
-- .offset = (unsigned long)(s) % CFS_PAGE_SIZE,
-- .length = 16,
-- };
-+ struct scatterlist sd;
-+ struct scatterlist ss;
- struct blkcipher_desc desc;
- unsigned int min;
- int rc;
-@@ -361,6 +350,12 @@
- GOTO(out, rc);
- }
-
-+ sg_set_page(&sd, virt_to_page(d), 16,
-+ (unsigned long)(d) % CFS_PAGE_SIZE);
-+
-+ sg_set_page(&ss, virt_to_page(s), 16,
-+ (unsigned long)(s) % CFS_PAGE_SIZE);
-+
- desc.tfm = tfm;
- desc.info = NULL;
- desc.flags = 0;
-Index: HEAD/lustre/ptlrpc/sec_bulk.c
-===================================================================
---- HEAD.orig/lustre/ptlrpc/sec_bulk.c 2008-08-14 23:55:36.000000000 +0300
-+++ HEAD/lustre/ptlrpc/sec_bulk.c 2008-12-17 17:15:38.000000000 +0200
-@@ -992,9 +992,9 @@
- }
-
- for (i = 0; i < desc->bd_iov_count; i++) {
-- sl[i].page = desc->bd_iov[i].kiov_page;
-- sl[i].offset = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
-- sl[i].length = desc->bd_iov[i].kiov_len;
-+ sg_set_page(&sl[i], desc->bd_iov[i].kiov_page,
-+ desc->bd_iov[i].kiov_len,
-+ desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK);
- bytes += desc->bd_iov[i].kiov_len;
- }
-
-Index: HEAD/lustre/ptlrpc/sec_config.c
-===================================================================
---- HEAD.orig/lustre/ptlrpc/sec_config.c 2008-12-03 05:47:20.000000000 +0200
-+++ HEAD/lustre/ptlrpc/sec_config.c 2008-12-15 17:19:13.000000000 +0200
-@@ -1170,7 +1170,7 @@
-
- push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
-- dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd,
-+ dentry = lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
- strlen(MOUNT_CONFIGS_DIR));
- if (IS_ERR(dentry)) {
- rc = PTR_ERR(dentry);
-Index: HEAD/lustre/include/linux/lustre_patchless_compat.h
-===================================================================
---- HEAD.orig/lustre/include/linux/lustre_patchless_compat.h 2008-08-07 20:23:18.000000000 +0300
-+++ HEAD/lustre/include/linux/lustre_patchless_compat.h 2008-12-17 17:15:38.000000000 +0200
-@@ -52,7 +52,7 @@
-
- BUG_ON(!PageLocked(page));
-
--#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15))
-+#ifdef HAVE_RW_TREE_LOCK
- write_lock_irq(&mapping->tree_lock);
- #else
- spin_lock_irq(&mapping->tree_lock);
-@@ -66,7 +66,7 @@
- __dec_zone_page_state(page, NR_FILE_PAGES);
- #endif
-
--#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15))
-+#ifdef HAVE_RW_TREE_LOCK
- write_unlock_irq(&mapping->tree_lock);
- #else
- spin_unlock_irq(&mapping->tree_lock);
-Index: HEAD/lustre/include/linux/lustre_compat25.h
-===================================================================
---- HEAD.orig/lustre/include/linux/lustre_compat25.h 2008-12-17 17:15:38.000000000 +0200
-+++ HEAD/lustre/include/linux/lustre_compat25.h 2008-12-17 17:15:38.000000000 +0200
-@@ -170,7 +170,12 @@
- #endif
-
- /* XXX our code should be using the 2.6 calls, not the other way around */
-+#ifndef HAVE_TRYLOCK_PAGE
- #define TryLockPage(page) TestSetPageLocked(page)
-+#else
-+#define TryLockPage(page) (!trylock_page(page))
-+#endif
-+
- #define Page_Uptodate(page) PageUptodate(page)
- #define ll_redirty_page(page) set_page_dirty(page)
-
-@@ -623,8 +628,17 @@
- #define ll_crypto_free_blkcipher(tfm) crypto_free_tfm(tfm)
- #endif /* HAVE_ASYNC_BLOCK_CIPHER */
-
-+#ifdef HAVE_FILE_REMOVE_SUID
-+#define ll_remove_suid(file, mnt) file_remove_suid(file)
-+#else
-+ #ifdef HAVE_SECURITY_PLUG
-+ #define ll_remove_suid(file,mnt) remove_suid(file->f_dentry,mnt)
-+ #else
-+ #define ll_remove_suid(file,mnt) remove_suid(file->f_dentry)
-+ #endif
-+#endif
-+
- #ifdef HAVE_SECURITY_PLUG
--#define ll_remove_suid(inode,mnt) remove_suid(inode,mnt)
- #define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry,mnt)
- #define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mnt,mode)
- #define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,mnt,dir,new,mnt1)
-@@ -636,7 +650,6 @@
- #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
- vfs_rename(old,old_dir,mnt,new,new_dir,mnt1)
- #else
--#define ll_remove_suid(inode,mnt) remove_suid(inode)
- #define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry)
- #define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode)
- #define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new)
-Index: HEAD/lustre/include/linux/lustre_lib.h
-===================================================================
---- HEAD.orig/lustre/include/linux/lustre_lib.h 2008-08-07 20:23:18.000000000 +0300
-+++ HEAD/lustre/include/linux/lustre_lib.h 2008-12-17 17:15:38.000000000 +0200
-@@ -49,7 +49,6 @@
- # include <string.h>
- # include <sys/types.h>
- #else
--# include <asm/semaphore.h>
- # include <linux/rwsem.h>
- # include <linux/sched.h>
- # include <linux/signal.h>
-Index: HEAD/lustre/llite/llite_internal.h
-===================================================================
---- HEAD.orig/lustre/llite/llite_internal.h 2008-12-17 17:15:38.000000000 +0200
-+++ HEAD/lustre/llite/llite_internal.h 2008-12-17 17:15:38.000000000 +0200
-@@ -661,7 +661,11 @@
- struct lookup_intent *it, struct kstat *stat);
- int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
- struct ll_file_data *ll_file_data_get(void);
-+#ifndef HAVE_INODE_PERMISION_2ARGS
- int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
-+#else
-+int ll_inode_permission(struct inode *inode, int mask);
-+#endif
- int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
- int flags, struct lov_user_md *lum,
- int lum_size);
-Index: HEAD/lustre/llite/file.c
-===================================================================
---- HEAD.orig/lustre/llite/file.c 2008-11-20 14:34:31.000000000 +0200
-+++ HEAD/lustre/llite/file.c 2008-12-17 17:15:38.000000000 +0200
-@@ -2305,7 +2305,11 @@
- }
-
- #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
-+#ifndef HAVE_INODE_PERMISION_2ARGS
- int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
-+#else
-+int ll_inode_permission(struct inode *inode, int mask)
-+#endif
- {
- CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
- inode->i_ino, inode->i_generation, inode, mask);
-Index: HEAD/lustre/llite/vvp_page.c
-===================================================================
---- HEAD.orig/lustre/llite/vvp_page.c 2008-11-10 20:27:54.000000000 +0200
-+++ HEAD/lustre/llite/vvp_page.c 2008-12-17 17:15:38.000000000 +0200
-@@ -341,7 +341,7 @@
-
- result = -EAGAIN;
- /* we're trying to write, but the page is locked.. come back later */
-- if (!TestSetPageLocked(vmpage)) {
-+ if (!TryLockPage(vmpage)) {
- if (pg->cp_state == CPS_CACHED) {
- /*
- * We can cancel IO if page wasn't dirty after all.
-Index: HEAD/lnet/autoconf/lustre-lnet.m4
-===================================================================
---- HEAD.orig/lnet/autoconf/lustre-lnet.m4 2008-09-25 07:44:45.000000000 +0300
-+++ HEAD/lnet/autoconf/lustre-lnet.m4 2008-12-17 17:15:38.000000000 +0200
-@@ -1098,6 +1098,22 @@
- AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd")
- ])
-
-+# 2.6.27 have second argument to sock_map_fd
-+AC_DEFUN([LN_SOCK_MAP_FD_2ARG],
-+[AC_MSG_CHECKING([sock_map_fd have second argument])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/net.h>
-+],[
-+ sock_map_fd(NULL, 0);
-+],[
-+ AC_MSG_RESULT(yes)
-+ AC_DEFINE(HAVE_SOCK_MAP_FD_2ARG, 1,
-+ [sock_map_fd have second argument])
-+],[
-+ AC_MSG_RESULT(NO)
-+])
-+])
-+
- #
- # LN_CONFIG_FILES
- #
diff --git a/debian/patches/patchless_support/fix_configure_RO_cache.dpatch b/debian/patches/patchless_support/fix_configure_RO_cache.dpatch
deleted file mode 100755
index 0c5664b..0000000
--- a/debian/patches/patchless_support/fix_configure_RO_cache.dpatch
+++ /dev/null
@@ -1,57 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-diff -urNad lustre~/lustre/autoconf/lustre-core.m4 lustre/lustre/autoconf/lustre-core.m4
---- lustre~/lustre/autoconf/lustre-core.m4 2008-12-22 10:08:29.000000000 +0100
-+++ lustre/lustre/autoconf/lustre-core.m4 2008-12-22 10:08:29.000000000 +0100
-@@ -1240,6 +1240,9 @@
- ])
- ])
-
-+# 2.6.18
-+
-+
- # 2.6.23 have return type 'void' for unregister_blkdev
- AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT],
- [AC_MSG_CHECKING([if unregister_blkdev return int])
-@@ -1384,6 +1387,26 @@
- ])
- ])
-
-+# 2.6.25 change define to inline
-+AC_DEFUN([LC_MAPPING_CAP_WRITEBACK_DIRTY],
-+[AC_MSG_CHECKING([if kernel have mapping_cap_writeback_dirty])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/backing-dev.h>
-+],[
-+ #ifndef mapping_cap_writeback_dirty
-+ mapping_cap_writeback_dirty(NULL);
-+ #endif
-+],[
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_MAPPING_CAP_WRITEBACK_DIRTY, 1,
-+ [kernel have mapping_cap_writeback_dirty])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+
-+
- # 2.6.26 isn't export set_fs_pwd and change paramter in fs struct
- AC_DEFUN([LC_FS_STRUCT_USE_PATH],
- [AC_MSG_CHECKING([fs_struct use path structure])
-@@ -1528,6 +1551,9 @@
- LC_BIO_ENDIO_2ARG
- LC_FH_TO_DENTRY
- LC_PROCFS_DELETED
-+
-+ #2.6.25
-+ LC_MAPPING_CAP_WRITEBACK_DIRTY
-
- # 2.6.26
- LC_FS_STRUCT_USE_PATH
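
The LC_MAPPING_CAP_WRITEBACK_DIRTY probe above uses a trick worth spelling out: 2.6.25 turned the macro into an inline function, so the test only emits the call when the name is not a macro. That way the probe compiles when the symbol exists in either form and fails only when it is missing entirely. The probe body in isolation:

    #include <linux/backing-dev.h>

    void probe(void)
    {
    #ifndef mapping_cap_writeback_dirty
            /* not a macro (2.6.25+): must resolve as a real function */
            mapping_cap_writeback_dirty(NULL);
    #endif
            /* if it is a macro (pre-2.6.25), the preprocessor removes the
             * call and the probe still compiles */
    }
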
diff --git a/debian/patches/patchless_support/fix_mmap.dpatch b/debian/patches/patchless_support/fix_mmap.dpatch
deleted file mode 100755
index 4b05ac7..0000000
--- a/debian/patches/patchless_support/fix_mmap.dpatch
+++ /dev/null
@@ -1,308 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: b1_8_gate/lustre/llite/llite_mmap.c
-===================================================================
---- b1_8_gate.orig/lustre/llite/llite_mmap.c 2008-11-11 18:23:11.000000000 +0300
-+++ b1_8_gate/lustre/llite/llite_mmap.c 2008-12-03 13:25:37.000000000 +0300
-@@ -81,8 +81,7 @@
- int lt_get_mmap_locks(struct ll_lock_tree *tree,
- unsigned long addr, size_t count);
-
--struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
-- int *type);
-+static struct vm_operations_struct ll_file_vm_ops;
-
- struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
- __u64 end, ldlm_mode_t mode)
-@@ -285,9 +284,19 @@
- return LCK_PR;
- }
-
-+static void policy_from_vma_pgoff(ldlm_policy_data_t *policy,
-+ struct vm_area_struct *vma,
-+ __u64 pgoff, size_t count)
-+{
-+ policy->l_extent.start = pgoff << CFS_PAGE_SHIFT;
-+ policy->l_extent.end = (policy->l_extent.start + count - 1) |
-+ ~CFS_PAGE_MASK;
-+}
-+
- static void policy_from_vma(ldlm_policy_data_t *policy,
- struct vm_area_struct *vma, unsigned long addr,
- size_t count)
-+
- {
- policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
- ((__u64)vma->vm_pgoff << CFS_PAGE_SHIFT);
-@@ -308,7 +317,7 @@
- spin_lock(&mm->page_table_lock);
- for(vma = find_vma(mm, addr);
- vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
-- if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage &&
-+ if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
- vma->vm_flags & VM_SHARED) {
- ret = vma;
- break;
-@@ -360,44 +369,30 @@
- }
- RETURN(0);
- }
--/**
-- * Page fault handler.
-- *
-- * \param vma - is virtiual area struct related to page fault
-- * \param address - address when hit fault
-- * \param type - of fault
-- *
-- * \return allocated and filled page for address
-- * \retval NOPAGE_SIGBUS if page not exist on this address
-- * \retval NOPAGE_OOM not have memory for allocate new page
-- */
--struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
-- int *type)
-+
-+static int ll_get_extent_lock(struct vm_area_struct *vma, unsigned long pgoff,
-+ int *save_flags, struct lustre_handle *lockh)
- {
- struct file *filp = vma->vm_file;
- struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
- struct inode *inode = filp->f_dentry->d_inode;
-- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
- ldlm_mode_t mode;
-- struct page *page = NULL;
- struct ll_inode_info *lli = ll_i2info(inode);
-- struct lov_stripe_md *lsm;
- struct ost_lvb lvb;
- __u64 kms, old_mtime;
-- unsigned long pgoff, size, rand_read, seq_read;
-- int rc = 0;
-+ unsigned long size;
- ENTRY;
-
- if (lli->lli_smd == NULL) {
- CERROR("No lsm on fault?\n");
-- RETURN(NOPAGE_SIGBUS);
-+ RETURN(0);
- }
-
- ll_clear_file_contended(inode);
-
- /* start and end the lock on the first and last bytes in the page */
-- policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);
-+ policy_from_vma_pgoff(&policy, vma, pgoff, CFS_PAGE_SIZE);
-
- CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
- vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);
-@@ -405,26 +400,28 @@
- mode = mode_from_vma(vma);
- old_mtime = LTIME_S(inode->i_mtime);
-
-- lsm = lli->lli_smd;
-- rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
-- &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
-- if (rc != 0)
-- RETURN(NOPAGE_SIGBUS);
-+ if(ll_extent_lock(fd, inode, lli->lli_smd, mode, &policy,
-+ lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU) != 0)
-+ RETURN(0);
-
- if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
- CWARN("binary changed. inode %lu\n", inode->i_ino);
-
-- lov_stripe_lock(lsm);
-+ lov_stripe_lock(lli->lli_smd);
- inode_init_lvb(inode, &lvb);
-- obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
-+ if(obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 1)) {
-+ lov_stripe_unlock(lli->lli_smd);
-+ RETURN(0);
-+ }
- kms = lvb.lvb_size;
-
-- pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
- size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-+ CDEBUG(D_INFO, "Kms %lu - %lu\n", size, pgoff);
-
- if (pgoff >= size) {
-- lov_stripe_unlock(lsm);
-+ lov_stripe_unlock(lli->lli_smd);
- ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
-+ lov_stripe_lock(lli->lli_smd);
- } else {
- /* XXX change inode size without ll_inode_size_lock() held!
- * there is a race condition with truncate path. (see
-@@ -446,29 +443,69 @@
- CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
- inode->i_ino, i_size_read(inode));
- }
-- lov_stripe_unlock(lsm);
- }
-
- /* If mapping is writeable, adjust kms to cover this page,
- * but do not extend kms beyond actual file size.
- * policy.l_extent.end is set to the end of the page by policy_from_vma
- * bug 10919 */
-- lov_stripe_lock(lsm);
- if (mode == LCK_PW)
-- obd_adjust_kms(ll_i2obdexp(inode), lsm,
-+ obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd,
- min_t(loff_t, policy.l_extent.end + 1,
- i_size_read(inode)), 0);
-- lov_stripe_unlock(lsm);
-+ lov_stripe_unlock(lli->lli_smd);
-
- /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
- * the kernel will not read other pages not covered by ldlm in
- * filemap_nopage. we do our readahead in ll_readpage.
- */
-- rand_read = vma->vm_flags & VM_RAND_READ;
-- seq_read = vma->vm_flags & VM_SEQ_READ;
-+ *save_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
- vma->vm_flags &= ~ VM_SEQ_READ;
- vma->vm_flags |= VM_RAND_READ;
-
-+ return 1;
-+}
-+
-+static void ll_put_extent_lock(struct vm_area_struct *vma, int save_flags,
-+ struct lustre_handle *lockh)
-+{
-+ struct file *filp = vma->vm_file;
-+ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
-+ struct inode *inode = filp->f_dentry->d_inode;
-+ ldlm_mode_t mode;
-+
-+ mode = mode_from_vma(vma);
-+ vma->vm_flags &= ~(VM_RAND_READ | VM_SEQ_READ);
-+ vma->vm_flags |= save_flags;
-+
-+ ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, lockh);
-+}
-+
-+#ifndef HAVE_VM_OP_FAULT
-+/**
-+ * Page fault handler.
-+ *
-+ * \param vma - is virtiual area struct related to page fault
-+ * \param address - address when hit fault
-+ * \param type - of fault
-+ *
-+ * \return allocated and filled page for address
-+ * \retval NOPAGE_SIGBUS if page not exist on this address
-+ * \retval NOPAGE_OOM not have memory for allocate new page
-+ */
-+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
-+ int *type)
-+{
-+ struct lustre_handle lockh = { 0 };
-+ int save_fags = 0;
-+ unsigned long pgoff;
-+ struct page *page;
-+ ENTRY;
-+
-+ pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
-+ if(!ll_get_extent_lock(vma, pgoff, &save_fags, &lockh))
-+ RETURN(NOPAGE_SIGBUS);
-+
- page = filemap_nopage(vma, address, type);
- if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM)
- LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
-@@ -477,13 +514,48 @@
- CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", address,
- (long)type);
-
-- vma->vm_flags &= ~VM_RAND_READ;
-- vma->vm_flags |= (rand_read | seq_read);
-+ ll_put_extent_lock(vma, save_fags, &lockh);
-
-- ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
- RETURN(page);
- }
-
-+#else
-+/* New fault() API*/
-+/**
-+ * Page fault handler.
-+ *
-+ * \param vma - is virtiual area struct related to page fault
-+ * \param address - address when hit fault
-+ * \param type - of fault
-+ *
-+ * \return allocated and filled page for address
-+ * \retval NOPAGE_SIGBUS if page not exist on this address
-+ * \retval NOPAGE_OOM not have memory for allocate new page
-+ */
-+int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-+{
-+ struct lustre_handle lockh = { 0 };
-+ int save_fags = 0;
-+ int rc;
-+ ENTRY;
-+
-+ if(!ll_get_extent_lock(vma, vmf->pgoff, &save_fags, &lockh))
-+ RETURN(VM_FAULT_SIGBUS);
-+
-+ rc = filemap_fault(vma, vmf);
-+ if (vmf->page)
-+ LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
-+ vmf->virtual_address);
-+ else
-+ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n",
-+ vmf->virtual_address);
-+
-+ ll_put_extent_lock(vma, save_fags, &lockh);
-+
-+ RETURN(rc);
-+}
-+#endif
-+
- /* To avoid cancel the locks covering mmapped region for lock cache pressure,
- * we track the mapped vma count by lli_mmap_cnt.
- * ll_vm_open(): when first vma is linked, split locks from lru.
-@@ -548,6 +620,7 @@
- }
- }
-
-+#ifndef HAVE_VM_OP_FAULT
- #ifndef HAVE_FILEMAP_POPULATE
- static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
- #endif
-@@ -562,6 +635,7 @@
- rc = filemap_populate(area, address, len, prot, pgoff, 1);
- RETURN(rc);
- }
-+#endif
-
- /* return the user space pointer that maps to a file offset via a vma */
- static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
-@@ -588,10 +662,14 @@
- }
-
- static struct vm_operations_struct ll_file_vm_ops = {
-- .nopage = ll_nopage,
- .open = ll_vm_open,
- .close = ll_vm_close,
-+#ifdef HAVE_VM_OP_FAULT
-+ .fault = ll_fault,
-+#else
-+ .nopage = ll_nopage,
- .populate = ll_populate,
-+#endif
- };
-
- int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
-@@ -602,7 +680,7 @@
- ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
- rc = generic_file_mmap(file, vma);
- if (rc == 0) {
--#ifndef HAVE_FILEMAP_POPULATE
-+#if !defined(HAVE_FILEMAP_POPULATE) && !defined(HAVE_VM_OP_FAULT)
- if (!filemap_populate)
- filemap_populate = vma->vm_ops->populate;
- #endif
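
The mmap rework above boils down to splitting the lock handling out of ll_nopage() (into ll_get_extent_lock()/ll_put_extent_lock()) so the same logic can back either page-fault entry point: .nopage on older kernels, .fault on 2.6.23+. A skeleton of that dual wiring, with hypothetical my_* names:

    #include <linux/mm.h>

    #ifdef HAVE_VM_OP_FAULT
    /* 2.6.23+ API: offset arrives in vmf->pgoff; return VM_FAULT_* flags,
     * with the page stored in vmf->page */
    static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            return filemap_fault(vma, vmf);
    }
    #else
    /* legacy API: offset is derived from the faulting address; return the
     * page itself, or NOPAGE_SIGBUS/NOPAGE_OOM */
    static struct page *my_nopage(struct vm_area_struct *vma,
                                  unsigned long address, int *type)
    {
            return filemap_nopage(vma, address, type);
    }
    #endif

    static struct vm_operations_struct my_vm_ops = {
    #ifdef HAVE_VM_OP_FAULT
            .fault  = my_fault,
    #else
            .nopage = my_nopage,
    #endif
    };
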
diff --git a/debian/patches/patchless_support/fix_nfs_fid_type.dpatch b/debian/patches/patchless_support/fix_nfs_fid_type.dpatch
deleted file mode 100755
index 222d358..0000000
--- a/debian/patches/patchless_support/fix_nfs_fid_type.dpatch
+++ /dev/null
@@ -1,352 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-diff -urNad lustre~/lustre/llite/llite_internal.h lustre/lustre/llite/llite_internal.h
---- lustre~/lustre/llite/llite_internal.h 2008-11-25 13:59:37.000000000 +0100
-+++ lustre/lustre/llite/llite_internal.h 2008-12-22 10:13:32.000000000 +0100
-@@ -748,9 +748,6 @@
- /* llite/llite_nfs.c */
- extern struct export_operations lustre_export_operations;
- __u32 get_uuid2int(const char *name, int len);
--struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
-- int fhtype, int parent);
--int ll_dentry_to_fh(struct dentry *, __u32 *datap, int *lenp, int need_parent);
-
- /* llite/special.c */
- extern struct inode_operations ll_special_inode_operations;
-diff -urNad lustre~/lustre/llite/llite_nfs.c lustre/lustre/llite/llite_nfs.c
---- lustre~/lustre/llite/llite_nfs.c 2008-11-25 13:59:37.000000000 +0100
-+++ lustre/lustre/llite/llite_nfs.c 2008-12-22 10:23:59.000000000 +0100
-@@ -57,11 +57,7 @@
- return (key0 << 1);
- }
-
--#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
--static int ll_nfs_test_inode(struct inode *inode, unsigned long ino, void *opaque)
--#else
- static int ll_nfs_test_inode(struct inode *inode, void *opaque)
--#endif
- {
- struct ll_fid *iid = opaque;
-
-@@ -72,36 +68,30 @@
- }
-
- static struct inode * search_inode_for_lustre(struct super_block *sb,
-- unsigned long ino,
-- unsigned long generation,
-- int mode)
-+ struct ll_fid *iid)
- {
- struct ptlrpc_request *req = NULL;
- struct ll_sb_info *sbi = ll_s2sbi(sb);
-- struct ll_fid fid;
- unsigned long valid = 0;
- int eadatalen = 0, rc;
- struct inode *inode = NULL;
-- struct ll_fid iid = { .id = ino, .generation = generation };
- ENTRY;
-
-- inode = ILOOKUP(sb, ino, ll_nfs_test_inode, &iid);
-+ inode = ILOOKUP(sb, iid->id, ll_nfs_test_inode, iid);
-
- if (inode)
- RETURN(inode);
-- if (S_ISREG(mode)) {
-- rc = ll_get_max_mdsize(sbi, &eadatalen);
-- if (rc)
-- RETURN(ERR_PTR(rc));
-- valid |= OBD_MD_FLEASIZE;
-- }
-- fid.id = (__u64)ino;
-- fid.generation = generation;
-- fid.f_type = mode;
-
-- rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, eadatalen, &req);
-+ rc = ll_get_max_mdsize(sbi, &eadatalen);
-+ if (rc)
-+ RETURN(ERR_PTR(rc));
-+
-+ valid |= OBD_MD_FLEASIZE;
-+
-+ /* mds_fid2dentry is ignore f_type */
-+ rc = mdc_getattr(sbi->ll_mdc_exp, iid, valid, eadatalen, &req);
- if (rc) {
-- CERROR("failure %d inode %lu\n", rc, ino);
-+ CERROR("failure %d inode "LPU64"\n", rc, iid->id);
- RETURN(ERR_PTR(rc));
- }
-
-@@ -115,67 +105,35 @@
- RETURN(inode);
- }
-
--static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
-- __u32 generation, umode_t mode)
-+static struct dentry *ll_iget_for_nfs(struct super_block *sb,
-+ struct ll_fid *iid)
- {
- struct inode *inode;
- struct dentry *result;
--#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-- struct list_head *lp;
--#endif
- ENTRY;
-
-- if (ino == 0)
-+ if (iid->id == 0)
- RETURN(ERR_PTR(-ESTALE));
-
-- inode = search_inode_for_lustre(sb, ino, generation, mode);
-- if (IS_ERR(inode)) {
-+ inode = search_inode_for_lustre(sb, iid);
-+ if (IS_ERR(inode))
- RETURN(ERR_PTR(PTR_ERR(inode)));
-- }
- if (is_bad_inode(inode) ||
-- (generation && inode->i_generation != generation)){
-+ (iid->generation && inode->i_generation != iid->generation)) {
- /* we didn't find the right inode.. */
- CERROR("Inode %lu, Bad count: %lu %d or version %u %u\n",
- inode->i_ino, (unsigned long)inode->i_nlink,
- atomic_read(&inode->i_count), inode->i_generation,
-- generation);
-+ iid->generation);
- iput(inode);
- RETURN(ERR_PTR(-ESTALE));
- }
-
--#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- result = d_alloc_anon(inode);
- if (!result) {
- iput(inode);
- RETURN(ERR_PTR(-ENOMEM));
- }
--#else
-- /* now to find a dentry.
-- * If possible, get a well-connected one
-- */
-- spin_lock(&dcache_lock);
-- for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) {
-- result = list_entry(lp,struct dentry, d_alias);
-- lock_dentry(result);
-- if (!(result->d_flags & DCACHE_DISCONNECTED)) {
-- dget_locked(result);
-- ll_set_dflags(result, DCACHE_REFERENCED);
-- unlock_dentry(result);
-- spin_unlock(&dcache_lock);
-- iput(inode);
-- RETURN(result);
-- }
-- unlock_dentry(result);
-- }
-- spin_unlock(&dcache_lock);
-- result = d_alloc_root(inode);
-- if (result == NULL) {
-- iput(inode);
-- RETURN(ERR_PTR(-ENOMEM));
-- }
-- result->d_flags |= DCACHE_DISCONNECTED;
--
--#endif
- ll_set_dd(result);
-
- lock_dentry(result);
-@@ -192,57 +150,98 @@
- RETURN(result);
- }
-
--struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
-- int fhtype, int parent)
-+#define LUSTRE_NFS_FID 0x94
-+
-+struct lustre_nfs_fid {
-+ struct ll_fid child;
-+ struct ll_fid parent;
-+ umode_t mode;
-+};
-+
-+/* The return value is file handle type:
-+ * 1 -- contains child file handle;
-+ * 2 -- contains child file handle and parent file handle;
-+ * 255 -- error.
-+ */
-+static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen,
-+ int connectable)
- {
-- switch (fhtype) {
-- case 2:
-- if (len < 5)
-- break;
-- if (parent)
-- return ll_iget_for_nfs(sb, data[3], 0, data[4]);
-- case 1:
-- if (len < 3)
-- break;
-- if (parent)
-- break;
-- return ll_iget_for_nfs(sb, data[0], data[1], data[2]);
-- default: break;
-- }
-- return ERR_PTR(-EINVAL);
-+ struct inode *inode = de->d_inode;
-+ struct inode *parent = de->d_parent->d_inode;
-+ struct lustre_nfs_fid *nfs_fid = (void *)fh;
-+ ENTRY;
-+
-+ CDEBUG(D_INFO, "encoding for (%lu) maxlen=%d minlen=%lu\n",
-+ inode->i_ino, *plen,
-+ sizeof(struct lustre_nfs_fid));
-+
-+ if (*plen < sizeof(struct lustre_nfs_fid))
-+ RETURN(255);
-+
-+ ll_inode2fid(&nfs_fid->child, inode);
-+ ll_inode2fid(&nfs_fid->parent, parent);
-+
-+ nfs_fid->mode = (S_IFMT & inode->i_mode);
-+ *plen = sizeof(struct lustre_nfs_fid);
-+
-+ RETURN(LUSTRE_NFS_FID);
- }
-
--int ll_dentry_to_fh(struct dentry *dentry, __u32 *datap, int *lenp,
-- int need_parent)
-+#ifdef HAVE_FH_TO_DENTRY
-+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
-+ int fh_len, int fh_type)
- {
-- if (*lenp < 3)
-- return 255;
-- *datap++ = dentry->d_inode->i_ino;
-- *datap++ = dentry->d_inode->i_generation;
-- *datap++ = (__u32)(S_IFMT & dentry->d_inode->i_mode);
-+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
-
-- if (*lenp == 3 || S_ISDIR(dentry->d_inode->i_mode)) {
-- *lenp = 3;
-- return 1;
-- }
-- if (dentry->d_parent) {
-- *datap++ = dentry->d_parent->d_inode->i_ino;
-- *datap++ = (__u32)(S_IFMT & dentry->d_parent->d_inode->i_mode);
-+ if (fh_type != LUSTRE_NFS_FID)
-+ RETURN(ERR_PTR(-EINVAL));
-
-- *lenp = 5;
-- return 2;
-- }
-- *lenp = 3;
-- return 1;
-+ RETURN(ll_iget_for_nfs(sb, &nfs_fid->child));
- }
-+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
-+ int fh_len, int fh_type)
-+{
-+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
-
--#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
--struct dentry *ll_get_dentry(struct super_block *sb, void *data)
-+ if (fh_type != LUSTRE_NFS_FID)
-+ RETURN(ERR_PTR(-EINVAL));
-+ RETURN(ll_iget_for_nfs(sb, &nfs_fid->parent));
-+}
-+
-+#else
-+/*
-+ * This length is counted as amount of __u32,
-+ * It is composed of a fid and a mode
-+ */
-+static struct dentry *ll_decode_fh(struct super_block *sb, __u32 *fh, int fh_len,
-+ int fh_type,
-+ int (*acceptable)(void *, struct dentry *),
-+ void *context)
- {
-- __u32 *inump = (__u32*)data;
-- return ll_iget_for_nfs(sb, inump[0], inump[1], S_IFREG);
-+ struct lustre_nfs_fid *nfs_fid = (void *)fh;
-+ struct dentry *entry;
-+ ENTRY;
-+
-+ CDEBUG(D_INFO, "decoding for "LPU64" fh_len=%d fh_type=%x\n",
-+ nfs_fid->child.id, fh_len, fh_type);
-+
-+ if (fh_type != LUSTRE_NFS_FID)
-+ RETURN(ERR_PTR(-ESTALE));
-+
-+ entry = sb->s_export_op->find_exported_dentry(sb, &nfs_fid->child,
-+ &nfs_fid->parent,
-+ acceptable, context);
-+ RETURN(entry);
- }
-
-+struct dentry *ll_get_dentry(struct super_block *sb, void *data)
-+{
-+ struct lustre_nfs_fid *fid = data;
-+ ENTRY;
-+
-+ RETURN(ll_iget_for_nfs(sb, &fid->child));
-+}
-+#endif
- struct dentry *ll_get_parent(struct dentry *dchild)
- {
- struct ptlrpc_request *req = NULL;
-@@ -254,11 +253,11 @@
- char dotdot[] = "..";
- int rc = 0;
- ENTRY;
--
-+
- LASSERT(dir && S_ISDIR(dir->i_mode));
--
-- sbi = ll_s2sbi(dir->i_sb);
--
-+
-+ sbi = ll_s2sbi(dir->i_sb);
-+
- fid.id = (__u64)dir->i_ino;
- fid.generation = dir->i_generation;
- fid.f_type = S_IFDIR;
-@@ -269,11 +268,12 @@
- CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
- return ERR_PTR(rc);
- }
-- body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof (*body));
--
-+ body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof (*body));
-+
- LASSERT((body->valid & OBD_MD_FLGENER) && (body->valid & OBD_MD_FLID));
--
-- result = ll_iget_for_nfs(dir->i_sb, body->ino, body->generation, S_IFDIR);
-+ fid.id = body->ino;
-+ fid.generation = body->generation;
-+ result = ll_iget_for_nfs(dir->i_sb, &fid);
-
- if (IS_ERR(result))
- rc = PTR_ERR(result);
-@@ -282,10 +282,18 @@
- if (rc)
- return ERR_PTR(rc);
- RETURN(result);
--}
-+}
-
-+
-+#if THREAD_SIZE >= 8192
- struct export_operations lustre_export_operations = {
-- .get_parent = ll_get_parent,
-- .get_dentry = ll_get_dentry,
-+ .encode_fh = ll_encode_fh,
-+#ifdef HAVE_FH_TO_DENTRY
-+ .fh_to_dentry = ll_fh_to_dentry,
-+ .fh_to_parent = ll_fh_to_parent,
-+#else
-+ .get_dentry = ll_get_dentry,
-+ .decode_fh = ll_decode_fh,
-+#endif
- };
- #endif
diff --git a/debian/patches/patchless_support/fix_path_API_changes.dpatch b/debian/patches/patchless_support/fix_path_API_changes.dpatch
deleted file mode 100755
index 291bf83..0000000
--- a/debian/patches/patchless_support/fix_path_API_changes.dpatch
+++ /dev/null
@@ -1,83 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: b1_8_gate/lustre/llite/symlink.c
-===================================================================
---- b1_8_gate.orig/lustre/llite/symlink.c 2008-11-27 07:36:47.000000000 +0300
-+++ b1_8_gate/lustre/llite/symlink.c 2008-11-27 07:37:23.000000000 +0300
-@@ -177,8 +177,12 @@
- up(&lli->lli_size_sem);
- }
- if (rc) {
-+#ifdef HAVE_PATH_RELEASE
- path_release(nd); /* Kernel assumes that ->follow_link()
- releases nameidata on error */
-+#else
-+ path_put(&nd->path);
-+#endif
- GOTO(out, rc);
- }
-
-Index: b1_8_gate/lustre/mgc/mgc_request.c
-===================================================================
---- b1_8_gate.orig/lustre/mgc/mgc_request.c 2008-11-27 07:36:47.000000000 +0300
-+++ b1_8_gate/lustre/mgc/mgc_request.c 2008-11-27 07:37:23.000000000 +0300
-@@ -415,7 +415,7 @@
- obd->obd_lvfs_ctxt.fs = get_ds();
-
- push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-- dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd,
-+ dentry = lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
- strlen(MOUNT_CONFIGS_DIR));
- pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
- if (IS_ERR(dentry)) {
-Index: b1_8_gate/lustre/ptlrpc/service.c
-===================================================================
---- b1_8_gate.orig/lustre/ptlrpc/service.c 2008-11-27 07:36:47.000000000 +0300
-+++ b1_8_gate/lustre/ptlrpc/service.c 2008-11-27 07:37:23.000000000 +0300
-@@ -1501,7 +1501,7 @@
- cfs_daemonize(name);
- exit_fs(cfs_current());
- current->fs = fs;
-- ll_set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
-+ ll_set_fs_pwd(current->fs, cfs_fs_mnt(init_task.fs), cfs_fs_pwd(init_task.fs));
- }
-
- static void
-Index: b1_8_gate/lustre/lvfs/lvfs_linux.c
-===================================================================
---- b1_8_gate.orig/lustre/lvfs/lvfs_linux.c 2008-11-27 07:36:47.000000000 +0300
-+++ b1_8_gate/lustre/lvfs/lvfs_linux.c 2008-11-27 07:37:23.000000000 +0300
-@@ -148,10 +148,10 @@
- */
-
- save->fs = get_fs();
-- LASSERT(atomic_read(¤t->fs->pwd->d_count));
-+ LASSERT(atomic_read(&cfs_fs_pwd(current->fs)->d_count));
- LASSERT(atomic_read(&new_ctx->pwd->d_count));
-- save->pwd = dget(current->fs->pwd);
-- save->pwdmnt = mntget(current->fs->pwdmnt);
-+ save->pwd = dget(cfs_fs_pwd(current->fs));
-+ save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
- save->luc.luc_umask = current->fs->umask;
-
- LASSERT(save->pwd);
-@@ -205,10 +205,10 @@
- atomic_read(¤t->fs->pwdmnt->mnt_count));
- */
-
-- LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
-- current->fs->pwd, new_ctx->pwd);
-- LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
-- current->fs->pwdmnt, new_ctx->pwdmnt);
-+ LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
-+ cfs_fs_pwd(current->fs), new_ctx->pwd);
-+ LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
-+ cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
-
- set_fs(saved->fs);
- ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
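
Every hunk in this file chases the same 2.6.26 change: fs_struct no longer exposes pwd/pwdmnt as separate dentry and vfsmount fields but a single struct path, and set_fs_pwd is no longer exported. The cfs_fs_pwd()/cfs_fs_mnt() accessors (added in configure_for_HEAD.dpatch above) hide the difference. A sketch of the pattern, with a hypothetical helper name:

    #include <linux/sched.h>                /* current */
    #include <linux/fs_struct.h>
    #include <linux/lustre_compat25.h>      /* cfs_fs_pwd()/cfs_fs_mnt() */

    static struct dentry *my_current_pwd(void)
    {
            /* expands to current->fs->pwd.dentry on 2.6.26+ and to
             * current->fs->pwd on older kernels */
            return cfs_fs_pwd(current->fs);
    }
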
diff --git a/debian/patches/patchless_support/fix_request_module_calls.dpatch b/debian/patches/patchless_support/fix_request_module_calls.dpatch
deleted file mode 100755
index e01c88f..0000000
--- a/debian/patches/patchless_support/fix_request_module_calls.dpatch
+++ /dev/null
@@ -1,20 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: b1_8_gate/lnet/lnet/api-ni.c
-===================================================================
---- b1_8_gate.orig/lnet/lnet/api-ni.c 2008-10-21 19:12:50.000000000 +0400
-+++ b1_8_gate/lnet/lnet/api-ni.c 2008-11-27 16:06:07.000000000 +0300
-@@ -1032,7 +1032,7 @@
- #ifdef __KERNEL__
- if (lnd == NULL) {
- LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
-- rc = request_module(libcfs_lnd2modname(lnd_type));
-+ rc = request_module("%s", libcfs_lnd2modname(lnd_type));
- LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
-
- lnd = lnet_find_lnd_by_type(lnd_type);
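
The one-line api-ni.c change above is a format-string hardening fix, not a behavior change: request_module() takes a printf-style format, so the module name must be passed as an argument rather than as the format itself. The safe idiom, as a sketch (the wrapper name is hypothetical):

    #include <linux/kmod.h>

    static int my_load_lnd(const char *modname)
    {
            /* never pass externally derived text as the format string */
            return request_module("%s", modname);
    }
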
diff --git a/debian/patches/patchless_support/lprocfs_changes.dpatch b/debian/patches/patchless_support/lprocfs_changes.dpatch
deleted file mode 100755
index 91819bc..0000000
--- a/debian/patches/patchless_support/lprocfs_changes.dpatch
+++ /dev/null
@@ -1,78 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/include/lprocfs_status.h
-===================================================================
---- HEAD.orig/lustre/include/lprocfs_status.h 2008-12-08 05:48:08.000000000 +0200
-+++ HEAD/lustre/include/lprocfs_status.h 2008-12-08 13:42:28.000000000 +0200
-@@ -563,6 +563,8 @@
- #define LPROCFS_EXIT() do { \
- up_read(&_lprocfs_lock); \
- } while(0)
-+
-+#ifdef HAVE_PROCFS_DELETED
- #define LPROCFS_ENTRY_AND_CHECK(dp) do { \
- typecheck(struct proc_dir_entry *, dp); \
- LPROCFS_ENTRY(); \
-@@ -571,6 +573,14 @@
- return -ENODEV; \
- } \
- } while(0)
-+#define LPROCFS_CHECK_DELETED(dp) ((dp)->deleted)
-+#else
-+
-+#define LPROCFS_ENTRY_AND_CHECK(dp) \
-+ LPROCFS_ENTRY();
-+#define LPROCFS_CHECK_DELETED(dp) (0)
-+#endif
-+
- #define LPROCFS_WRITE_ENTRY() do { \
- down_write(&_lprocfs_lock); \
- } while(0)
-@@ -578,6 +588,7 @@
- up_write(&_lprocfs_lock); \
- } while(0)
-
-+
- /* You must use these macros when you want to refer to
- * the import in a client obd_device for a lprocfs entry */
- #define LPROCFS_CLIMP_CHECK(obd) do { \
-Index: HEAD/lustre/obdclass/linux/linux-module.c
-===================================================================
---- HEAD.orig/lustre/obdclass/linux/linux-module.c 2008-12-08 05:48:20.000000000 +0200
-+++ HEAD/lustre/obdclass/linux/linux-module.c 2008-12-08 13:50:36.000000000 +0200
-@@ -418,7 +418,7 @@
- ENTRY;
-
- obd_sysctl_init();
-- proc_lustre_root = lprocfs_register("lustre", proc_root_fs,
-+ proc_lustre_root = lprocfs_register("fs/lustre", NULL,
- lprocfs_base, NULL);
- rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
- &obd_device_list_fops, NULL);
-Index: HEAD/lustre/obdclass/lprocfs_status.c
-===================================================================
---- HEAD.orig/lustre/obdclass/lprocfs_status.c 2008-12-08 05:48:20.000000000 +0200
-+++ HEAD/lustre/obdclass/lprocfs_status.c 2008-12-08 13:42:28.000000000 +0200
-@@ -173,7 +173,7 @@
-
- LPROCFS_ENTRY();
- OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
-- if (!dp->deleted && dp->read_proc)
-+ if (!LPROCFS_CHECK_DELETED(dp) && dp->read_proc)
- rc = dp->read_proc(page, &start, *ppos, CFS_PAGE_SIZE,
- &eof, dp->data);
- LPROCFS_EXIT();
-@@ -213,7 +213,7 @@
- int rc = -EIO;
-
- LPROCFS_ENTRY();
-- if (!dp->deleted && dp->write_proc)
-+ if (!LPROCFS_CHECK_DELETED(dp) && dp->write_proc)
- rc = dp->write_proc(f, buf, size, dp->data);
- LPROCFS_EXIT();
- return rc;
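
2.6.24 removed the deleted member from struct proc_dir_entry, so the lprocfs read/write wrappers can no longer poke it unconditionally; the patch above routes the test through LPROCFS_CHECK_DELETED(), which degrades to a constant 0 on new kernels and leaves correctness to the _lprocfs_lock rwsem. The shape of that compat, restated as a standalone sketch:

    #include <linux/proc_fs.h>

    #ifdef HAVE_PROCFS_DELETED
    # define LPROCFS_CHECK_DELETED(dp) ((dp)->deleted)  /* pre-2.6.24 */
    #else
    # define LPROCFS_CHECK_DELETED(dp) (0)  /* member gone; rely on locking */
    #endif
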
diff --git a/debian/patches/patchless_support/lustre_loop_devices_adaption.dpatch b/debian/patches/patchless_support/lustre_loop_devices_adaption.dpatch
deleted file mode 100755
index bc3b884..0000000
--- a/debian/patches/patchless_support/lustre_loop_devices_adaption.dpatch
+++ /dev/null
@@ -1,84 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/llite/lloop.c
-===================================================================
---- HEAD.orig/lustre/llite/lloop.c 2008-11-17 11:36:35.000000000 +0200
-+++ HEAD/lustre/llite/lloop.c 2008-12-17 23:29:17.000000000 +0200
-@@ -152,7 +152,7 @@
- struct semaphore lo_bh_mutex;
- atomic_t lo_pending;
-
-- request_queue_t *lo_queue;
-+ struct request_queue *lo_queue;
-
- /* data to handle bio for lustre. */
- struct lo_request_data {
-@@ -288,7 +288,7 @@
- return bio;
- }
-
--static int loop_make_request(request_queue_t *q, struct bio *old_bio)
-+static int loop_make_request(struct request_queue *q, struct bio *old_bio)
- {
- struct lloop_device *lo = q->queuedata;
- int rw = bio_rw(old_bio);
-@@ -317,7 +317,7 @@
- if (atomic_dec_and_test(&lo->lo_pending))
- up(&lo->lo_bh_mutex);
- out:
-- bio_io_error(old_bio, old_bio->bi_size);
-+ cfs_bio_io_error(old_bio, old_bio->bi_size);
- return 0;
- inactive:
- spin_unlock_irq(&lo->lo_lock);
-@@ -327,7 +327,7 @@
- /*
- * kick off io on the underlying address space
- */
--static void loop_unplug(request_queue_t *q)
-+static void loop_unplug(struct request_queue *q)
- {
- struct lloop_device *lo = q->queuedata;
-
-@@ -339,7 +339,7 @@
- {
- int ret;
- ret = do_bio_filebacked(lo, bio);
-- bio_endio(bio, bio->bi_size, ret);
-+ cfs_bio_endio(bio, bio->bi_size, ret);
- }
-
- /*
-@@ -366,7 +366,8 @@
- up(&lo->lo_sem);
-
- for (;;) {
-- down_interruptible(&lo->lo_bh_mutex);
-+ if(!down_interruptible(&lo->lo_bh_mutex))
-+ continue;
- /*
- * could be upped because of tear-down, not because of
- * pending work
-@@ -743,7 +744,7 @@
-
- out_mem4:
- while (i--)
-- blk_put_queue(loop_dev[i].lo_queue);
-+ blk_cleanup_queue(loop_dev[i].lo_queue);
- i = max_loop;
- out_mem3:
- while (i--)
-@@ -765,7 +766,7 @@
- ll_iocontrol_unregister(ll_iocontrol_magic);
- for (i = 0; i < max_loop; i++) {
- del_gendisk(disks[i]);
-- blk_put_queue(loop_dev[i].lo_queue);
-+ blk_cleanup_queue(loop_dev[i].lo_queue);
- put_disk(disks[i]);
- }
- if (ll_unregister_blkdev(lloop_major, "lloop"))
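Three kernel API shifts drive the deleted hunks above, and they recur in the
2.6.26 patch added below: request_queue_t gave way to the spelled-out struct
request_queue, bio_endio() and bio_io_error() lost their bytes_done argument
in 2.6.24, and blk_cleanup_queue() replaced blk_put_queue() as the proper
teardown for a queue obtained from blk_alloc_queue(). The cfs_bio_* wrappers
the patch calls live in libcfs; a plausible reconstruction of them (assumed,
not quoted from the tree), keyed on the HAVE_BIO_ENDIO_2ARG probe defined
later in this commit:

    #include <linux/bio.h>

    /* 2.6.24+ dropped the bytes_done argument, so the wrappers swallow
     * it there and pass it through unchanged on older kernels. */
    #ifdef HAVE_BIO_ENDIO_2ARG
    #define cfs_bio_endio(bio, nob, error)  bio_endio(bio, error)
    #define cfs_bio_io_error(bio, nob)      bio_io_error(bio)
    #else
    #define cfs_bio_endio(bio, nob, error)  bio_endio(bio, nob, error)
    #define cfs_bio_io_error(bio, nob)      bio_io_error(bio, nob)
    #endif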
diff --git a/debian/patches/patchless_support/new_page_fault_method.dpatch b/debian/patches/patchless_support/new_page_fault_method.dpatch
deleted file mode 100755
index 26f84d6..0000000
--- a/debian/patches/patchless_support/new_page_fault_method.dpatch
+++ /dev/null
@@ -1,442 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/llite/vvp_io.c
-===================================================================
---- HEAD.orig/lustre/llite/vvp_io.c 2008-12-17 17:15:38.000000000 +0200
-+++ HEAD/lustre/llite/vvp_io.c 2008-12-17 17:15:38.000000000 +0200
-@@ -559,6 +559,61 @@
- RETURN(result);
- }
-
-+#ifndef HAVE_VM_OP_FAULT
-+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
-+{
-+ cfs_page_t *result;
-+
-+ result = filemap_nopage(cfio->ft_vma, cfio->ft_address, cfio->ft_type);
-+ if (result != NOPAGE_SIGBUS && result != NOPAGE_OOM)
-+ LL_CDEBUG_PAGE(D_PAGE, result,
-+ "got addr %lu type %lx\n",
-+ cfio->ft_address, (long)cfio->ft_type);
-+ else
-+ CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n",
-+ cfio->ft_address, (long)cfio->ft_type);
-+
-+ if (result == NOPAGE_SIGBUS)
-+ return -EFAULT;
-+ else if (result == NOPAGE_OOM)
-+ return -ENOMEM;
-+
-+ /* new fault API can return page locked already */
-+ lock_page(result);
-+ cfio->ft_page = result;
-+
-+ return 0;
-+}
-+#else
-+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
-+{
-+ cfio->ft_flags = filemap_fault(cfio->ft_vma, cfio->ft_vmf);
-+ if (cfio->ft_vmf->page)
-+ LL_CDEBUG_PAGE(D_PAGE, cfio->ft_vmf->page,
-+ "got addr %p type NOPAGE\n",
-+ cfio->ft_vmf->virtual_address);
-+ else
-+ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n",
-+ cfio->ft_vmf->virtual_address);
-+
-+ if (unlikely (cfio->ft_flags & VM_FAULT_ERROR))
-+ return -EFAULT;
-+
-+ if (unlikely (cfio->ft_flags & VM_FAULT_NOPAGE))
-+ return -ENOMEM;
-+
-+ if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
-+ lock_page(cfio->ft_vmf->page);
-+ cfio->ft_flags |= VM_FAULT_LOCKED;
-+ }
-+
-+ cfio->ft_page = cfio->ft_vmf->page;
-+
-+ return 0;
-+}
-+
-+#endif
-+
- static int vvp_io_fault_start(const struct lu_env *env,
- const struct cl_io_slice *ios)
- {
-@@ -568,9 +623,12 @@
- struct inode *inode = ccc_object_inode(obj);
- struct cl_fault_io *fio = &io->u.ci_fault;
- struct vvp_fault_io *cfio = &vio->u.fault;
-- cfs_page_t *vmpage;
- loff_t offset;
-+ int kernel_result = 0;
- int result = 0;
-+ struct cl_page *page;
-+ loff_t size;
-+ pgoff_t last; /* last page in a file data region */
-
- LASSERT(vio->cui_oneshot == 0);
-
-@@ -587,55 +645,43 @@
- if (result != 0)
- return result;
-
-- vmpage = filemap_nopage(cfio->ft_vma, cfio->ft_address, cfio->ft_type);
-- if (vmpage != NOPAGE_SIGBUS && vmpage != NOPAGE_OOM)
-- LL_CDEBUG_PAGE(D_PAGE, vmpage,
-- "got addr %lu type %lx\n",
-- cfio->ft_address, (long)cfio->ft_type);
-- else
-- CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n",
-- cfio->ft_address, (long)cfio->ft_type);
--
-- if (vmpage == NOPAGE_SIGBUS)
-- result = -EFAULT;
-- else if (vmpage == NOPAGE_OOM)
-- result = -ENOMEM;
-- else {
-- struct cl_page *page;
-- loff_t size;
-- pgoff_t last; /* last page in a file data region */
--
-- /* Temporarily lock vmpage to keep cl_page_find() happy. */
-- lock_page(vmpage);
-- page = cl_page_find(env, obj, fio->ft_index, vmpage,
-- CPT_CACHEABLE);
-- unlock_page(vmpage);
-- if (!IS_ERR(page)) {
-- size = i_size_read(inode);
-- last = cl_index(obj, size - 1);
-- if (fio->ft_index == last)
-- /*
-- * Last page is mapped partially.
-- */
-- fio->ft_nob = size - cl_offset(obj,
-- fio->ft_index);
-- else
-- fio->ft_nob = cl_page_size(obj);
-- lu_ref_add(&page->cp_reference, "fault", io);
-- fio->ft_page = page;
-- /*
-- * Certain 2.6 kernels return not-NULL from
-- * filemap_nopage() when page is beyond the file size,
-- * on the grounds that "An external ptracer can access
-- * pages that normally aren't accessible.." Don't
-- * propagate such page fault to the lower layers to
-- * avoid side-effects like KMS updates.
-- */
-- if (fio->ft_index > last)
-- result = +1;
-- } else
-- result = PTR_ERR(page);
-+ /* must return locked page */
-+ kernel_result = vvp_io_kernel_fault(cfio);
-+ if (kernel_result != 0)
-+ return kernel_result;
-+
-+ page = cl_page_find(env, obj, fio->ft_index, cfio->ft_page,
-+ CPT_CACHEABLE);
-+ if (IS_ERR(page)) {
-+ unlock_page(cfio->ft_page);
-+ page_cache_release(cfio->ft_page);
-+ cfio->ft_page = NULL;
-+ return PTR_ERR(page);
- }
-+
-+ size = i_size_read(inode);
-+ last = cl_index(obj, size - 1);
-+ if (fio->ft_index == last)
-+ /*
-+ * Last page is mapped partially.
-+ */
-+ fio->ft_nob = size - cl_offset(obj, fio->ft_index);
-+ else
-+ fio->ft_nob = cl_page_size(obj);
-+
-+ lu_ref_add(&page->cp_reference, "fault", io);
-+ fio->ft_page = page;
-+ /*
-+ * Certain 2.6 kernels return not-NULL from
-+ * filemap_nopage() when page is beyond the file size,
-+ * on the grounds that "An external ptracer can access
-+ * pages that normally aren't accessible.." Don't
-+ * propagate such page fault to the lower layers to
-+ * avoid side-effects like KMS updates.
-+ */
-+ if (fio->ft_index > last)
-+ result = +1;
-+
- return result;
- }
-
-Index: HEAD/lustre/llite/llite_mmap.c
-===================================================================
---- HEAD.orig/lustre/llite/llite_mmap.c 2008-11-17 11:36:34.000000000 +0200
-+++ HEAD/lustre/llite/llite_mmap.c 2008-12-17 17:15:38.000000000 +0200
-@@ -72,6 +72,8 @@
- struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
- int *type);
-
-+static struct vm_operations_struct ll_file_vm_ops;
-+
- void policy_from_vma(ldlm_policy_data_t *policy,
- struct vm_area_struct *vma, unsigned long addr,
- size_t count)
-@@ -95,7 +97,7 @@
- spin_lock(&mm->page_table_lock);
- for(vma = find_vma(mm, addr);
- vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
-- if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage &&
-+ if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
- vma->vm_flags & VM_SHARED) {
- ret = vma;
- break;
-@@ -105,6 +107,7 @@
- RETURN(ret);
- }
-
-+#ifndef HAVE_VM_OP_FAULT
- /**
- * Lustre implementation of a vm_operations_struct::nopage() method, called by
- * VM to server page fault (both in kernel and user space).
-@@ -125,11 +128,12 @@
- struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
- int *type)
- {
-+
- struct file *file = vma->vm_file;
- struct inode *inode = file->f_dentry->d_inode;
- struct lu_env *env;
- struct cl_io *io;
-- struct page *page = NULL;
-+ struct page *page = NOPAGE_SIGBUS;
- struct cl_env_nest nest;
- int result;
-
-@@ -143,7 +147,7 @@
- * one.
- */
- env = cl_env_nested_get(&nest);
-- if (!IS_ERR(env)) {
-+ if (IS_ERR(env)) {
- pgoff_t pg_offset;
- const unsigned long writable = VM_SHARED|VM_WRITE;
- unsigned long ra_flags;
-@@ -183,16 +187,25 @@
- cio->cui_fd = LUSTRE_FPRIVATE(file);
-
- result = cl_io_loop(env, io);
-- if (result == 0) {
-- LASSERT(fio->ft_page != NULL);
-- page = cl_page_vmpage(env, fio->ft_page);
-- } else if (result == -EFAULT) {
-- page = NOPAGE_SIGBUS;
-- } else if (result == -ENOMEM) {
-- page = NOPAGE_OOM;
-- }
-- } else
-+ } else {
- result = io->ci_result;
-+ }
-+
-+ switch (result) {
-+ case 0:
-+ LASSERT(fio->ft_page != NULL);
-+ page = vio->u.fault.ft_page;
-+ page_unlock(page);
-+ break;
-+ case -EFAULT:
-+ page = NOPAGE_SIGBUS;
-+ break;
-+ case -ENOMEM:
-+ page = NOPAGE_OOM;
-+ break;
-+ default:
-+ LBUG();
-+ }
-
- vma->vm_flags &= ~VM_RAND_READ;
- vma->vm_flags |= ra_flags;
-@@ -200,8 +213,99 @@
- cl_io_fini(env, io);
- cl_env_nested_put(&nest, env);
- }
-+
- RETURN(page);
- }
-+#else
-+/* New fault() API*/
-+/**
-+ * Page fault handler.
-+ *
-+ * \param vma - is virtiual area struct related to page fault
-+ * \param address - address when hit fault
-+ * \param type - of fault
-+ *
-+ * \return allocated and filled page for address
-+ * \retval NOPAGE_SIGBUS if page not exist on this address
-+ * \retval NOPAGE_OOM not have memory for allocate new page
-+ */
-+int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-+{
-+ struct file *file = vma->vm_file;
-+ struct inode *inode = file->f_dentry->d_inode;
-+ struct lu_env *env;
-+ struct cl_io *io;
-+ const unsigned long writable = VM_SHARED|VM_WRITE;
-+ unsigned long ra_flags;
-+ struct cl_fault_io *fio;
-+ struct cl_env_nest nest;
-+ int result;
-+ int fault_ret = 0;
-+
-+ ENTRY;
-+
-+ /*
-+ * vm_operations_struct::nopage() can be called when lustre IO is
-+ * already active for the current thread, e.g., when doing read/write
-+ * against user level buffer mapped from Lustre buffer. To avoid
-+ * stomping on existing context, optionally force an allocation of a new
-+ * one.
-+ */
-+ env = cl_env_nested_get(&nest);
-+ if (IS_ERR(env))
-+ RETURN(VM_FAULT_ERROR);
-+
-+ io = &ccc_env_info(env)->cti_io;
-+ io->ci_obj = ll_i2info(inode)->lli_clob;
-+ LASSERT(io->ci_obj != NULL);
-+
-+ fio = &io->u.ci_fault;
-+ fio->ft_index = vmf->pgoff + vma->vm_pgoff;
-+ fio->ft_writable = (vma->vm_flags&writable) == writable;
-+ fio->ft_executable = vma->vm_flags&VM_EXEC;
-+
-+ /*
-+ * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
-+ * the kernel will not read other pages not covered by ldlm in
-+ * filemap_nopage. we do our readahead in ll_readpage.
-+ */
-+ ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
-+ vma->vm_flags &= ~VM_SEQ_READ;
-+ vma->vm_flags |= VM_RAND_READ;
-+
-+ CDEBUG(D_INFO, "vm_flags: %lx (%lu %i %i)\n", vma->vm_flags,
-+ fio->ft_index, fio->ft_writable, fio->ft_executable);
-+
-+ if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
-+ struct vvp_io *vio = vvp_env_io(env);
-+ struct ccc_io *cio = ccc_env_io(env);
-+
-+ LASSERT(cio->cui_cl.cis_io == io);
-+
-+ vio->u.fault.ft_vma = vma;
-+ vio->u.fault.ft_vmf = vmf;
-+ cio->cui_fd = LUSTRE_FPRIVATE(file);
-+
-+ result = cl_io_loop(env, io);
-+ fault_ret = vio->u.fault.ft_flags;
-+ if (result != 0)
-+ fault_ret |= VM_FAULT_ERROR;
-+ } else {
-+ if(io->ci_result)
-+ fault_ret = VM_FAULT_ERROR;
-+ }
-+
-+ vma->vm_flags |= ra_flags;
-+
-+ cl_io_fini(env, io);
-+ cl_env_nested_put(&nest, env);
-+
-+ RETURN(fault_ret);
-+}
-+
-+
-+
-+#endif
-
- /**
- * To avoid cancel the locks covering mmapped region for lock cache pressure,
-@@ -234,6 +338,7 @@
- EXIT;
- }
-
-+#ifndef HAVE_VM_OP_FAULT
- #ifndef HAVE_FILEMAP_POPULATE
- static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
- #endif
-@@ -248,6 +353,7 @@
- rc = filemap_populate(area, address, len, prot, pgoff, 1);
- RETURN(rc);
- }
-+#endif
-
- /* return the user space pointer that maps to a file offset via a vma */
- static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
-@@ -274,10 +380,15 @@
- }
-
- static struct vm_operations_struct ll_file_vm_ops = {
-+#ifndef HAVE_VM_OP_FAULT
- .nopage = ll_nopage,
-+ .populate = ll_populate,
-+
-+#else
-+ .fault = ll_fault,
-+#endif
- .open = ll_vm_open,
- .close = ll_vm_close,
-- .populate = ll_populate,
- };
-
- int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
-@@ -288,7 +399,7 @@
- ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
- rc = generic_file_mmap(file, vma);
- if (rc == 0) {
--#if !defined(HAVE_FILEMAP_POPULATE)
-+#if !defined(HAVE_FILEMAP_POPULATE) && !defined(HAVE_VM_OP_FAULT)
- if (!filemap_populate)
- filemap_populate = vma->vm_ops->populate;
- #endif
-Index: HEAD/lustre/llite/llite_internal.h
-===================================================================
---- HEAD.orig/lustre/llite/llite_internal.h 2008-12-17 17:15:38.000000000 +0200
-+++ HEAD/lustre/llite/llite_internal.h 2008-12-17 17:35:56.000000000 +0200
-@@ -819,6 +819,11 @@
- time_t ft_mtime;
- struct vm_area_struct *ft_vma;
- /**
-+ * locked page returned from vvp_io
-+ */
-+ cfs_page_t *ft_page;
-+#ifndef HAVE_VM_OP_FAULT
-+ /**
- * Virtual address at which fault occurred.
- */
- unsigned long ft_address;
-@@ -826,6 +831,16 @@
- * Fault type, as to be supplied to filemap_nopage().
- */
- int *ft_type;
-+#else
-+ /**
-+ * kernel fault info
-+ */
-+ struct vm_fault *ft_vmf;
-+ /**
-+ * fault API used bitflags for return code save it.
-+ */
-+ unsigned int ft_flags;
-+#endif
- } fault;
- } u;
- /**
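The heart of this deleted patch (carried forward into the 2.6.26 patch below)
is the migration from vm_operations_struct::nopage(), which returns a struct
page or the NOPAGE_SIGBUS/NOPAGE_OOM sentinels, to the ->fault() callback
introduced in 2.6.23, which reports through VM_FAULT_* flags in a struct
vm_fault. Reduced to its skeleton, with the Lustre handlers stubbed out purely
for illustration:

    #include <linux/mm.h>

    #ifndef HAVE_VM_OP_FAULT
    /* pre-2.6.23: hand back the page, or a NOPAGE_* sentinel */
    static struct page *ll_nopage(struct vm_area_struct *vma,
                                  unsigned long address, int *type)
    {
            return NOPAGE_SIGBUS;            /* stub */
    }
    #else
    /* 2.6.23+: fill vmf->page and answer with VM_FAULT_* bits */
    static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            return VM_FAULT_SIGBUS;          /* stub */
    }
    #endif

    static struct vm_operations_struct ll_file_vm_ops = {
    #ifndef HAVE_VM_OP_FAULT
            .nopage = ll_nopage,   /* the real patch also wires .populate */
    #else
            .fault  = ll_fault,
    #endif
            /* .open and .close as in the patch */
    };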
diff --git a/debian/patches/patchless_support/nfs_changes_new_API.dpatch b/debian/patches/patchless_support/nfs_changes_new_API.dpatch
deleted file mode 100755
index 15b2602..0000000
--- a/debian/patches/patchless_support/nfs_changes_new_API.dpatch
+++ /dev/null
@@ -1,277 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/llite/llite_internal.h
-===================================================================
---- HEAD.orig/lustre/llite/llite_internal.h 2008-11-20 14:34:31.000000000 +0200
-+++ HEAD/lustre/llite/llite_internal.h 2008-12-17 17:15:38.000000000 +0200
-@@ -744,9 +744,6 @@
- /* llite/llite_nfs.c */
- extern struct export_operations lustre_export_operations;
- __u32 get_uuid2int(const char *name, int len);
--struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
-- int fhtype, int parent);
--int ll_dentry_to_fh(struct dentry *, __u32 *datap, int *lenp, int need_parent);
-
- /* llite/special.c */
- extern struct inode_operations ll_special_inode_operations;
-Index: HEAD/lustre/llite/llite_nfs.c
-===================================================================
---- HEAD.orig/lustre/llite/llite_nfs.c 2008-11-17 11:36:34.000000000 +0200
-+++ HEAD/lustre/llite/llite_nfs.c 2008-12-17 17:40:22.000000000 +0200
-@@ -67,14 +67,13 @@
- }
-
- static struct inode *search_inode_for_lustre(struct super_block *sb,
-- struct lu_fid *fid,
-- int mode)
-+ struct lu_fid *fid)
- {
- struct ll_sb_info *sbi = ll_s2sbi(sb);
- struct ptlrpc_request *req = NULL;
- struct inode *inode = NULL;
- unsigned long valid = 0;
-- int eadatalen = 0;
-+ int eadatalen;
- ino_t ino = ll_fid_build_ino(sbi, fid);
- int rc;
- ENTRY;
-@@ -85,13 +84,13 @@
- if (inode)
- RETURN(inode);
-
-- if (S_ISREG(mode)) {
-- rc = ll_get_max_mdsize(sbi, &eadatalen);
-- if (rc)
-- RETURN(ERR_PTR(rc));
-- valid |= OBD_MD_FLEASIZE;
-- }
-+ rc = ll_get_max_mdsize(sbi, &eadatalen);
-+ if (rc)
-+ RETURN(ERR_PTR(rc));
-+
-+ valid |= OBD_MD_FLEASIZE;
-
-+ /* mds_fid2dentry ignore f_type */
- rc = md_getattr(sbi->ll_md_exp, fid, NULL, valid, eadatalen, &req);
- if (rc) {
- CERROR("can't get object attrs, fid "DFID", rc %d\n",
-@@ -107,9 +106,7 @@
- RETURN(inode);
- }
-
--static struct dentry *ll_iget_for_nfs(struct super_block *sb,
-- struct lu_fid *fid,
-- umode_t mode)
-+static struct dentry *ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, __u32 mode)
- {
- struct inode *inode;
- struct dentry *result;
-@@ -119,7 +116,7 @@
- if (!fid_is_sane(fid))
- RETURN(ERR_PTR(-ESTALE));
-
-- inode = search_inode_for_lustre(sb, fid, mode);
-+ inode = search_inode_for_lustre(sb, fid);
- if (IS_ERR(inode))
- RETURN(ERR_PTR(PTR_ERR(inode)));
-
-@@ -142,85 +139,103 @@
- RETURN(result);
- }
-
-+#define LUSTRE_NFS_FID 0x97
-+
-+struct lustre_nfs_fid {
-+ struct lu_fid child;
-+ struct lu_fid parent;
-+ umode_t mode;
-+};
-+
-+/* The return value is file handle type:
-+ * 1 -- contains child file handle;
-+ * 2 -- contains child file handle and parent file handle;
-+ * 255 -- error.
-+ */
-+static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen,
-+ int connectable)
-+{
-+ struct inode *inode = de->d_inode;
-+ struct inode *parent = de->d_parent->d_inode;
-+ struct lustre_nfs_fid *nfs_fid = (void *)fh;
-+ ENTRY;
-+
-+ CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%u\n",
-+ inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
-+ sizeof(struct lustre_nfs_fid));
-+
-+ if (*plen < sizeof(struct lustre_nfs_fid))
-+ RETURN(255);
-+
-+ nfs_fid->child = *ll_inode2fid(inode);
-+ nfs_fid->parent = *ll_inode2fid(parent);
-+ nfs_fid->mode = (S_IFMT & inode->i_mode);
-+ *plen = sizeof(struct lustre_nfs_fid);
-+
-+ RETURN(LUSTRE_NFS_FID);
-+}
-+
-+#ifdef HAVE_FH_TO_DENTRY
-+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
-+ int fh_len, int fh_type)
-+{
-+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
-+
-+ if (fh_type != LUSTRE_NFS_FID)
-+ RETURN(ERR_PTR(-EINVAL));
-+
-+ RETURN(ll_iget_for_nfs(sb, &nfs_fid->child, nfs_fid->mode));
-+}
-+
-+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
-+ int fh_len, int fh_type)
-+{
-+ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
-+
-+ if (fh_type != LUSTRE_NFS_FID)
-+ RETURN(ERR_PTR(-EINVAL));
-+
-+
-+ RETURN(ll_iget_for_nfs(sb, &nfs_fid->parent, S_IFDIR));
-+}
-+
-+#else
-+
- /*
- * This length is counted as amount of __u32,
- * It is composed of a fid and a mode
- */
--#define ONE_FH_LEN (sizeof(struct lu_fid)/4 + 1)
--
- static struct dentry *ll_decode_fh(struct super_block *sb, __u32 *fh, int fh_len,
- int fh_type,
- int (*acceptable)(void *, struct dentry *),
- void *context)
- {
-- struct lu_fid *parent = NULL;
-- struct lu_fid *child;
-+ struct lustre_nfs_fid *nfs_fid = (void *)fh;
- struct dentry *entry;
- ENTRY;
-
-- CDEBUG(D_INFO, "decoding for "DFID" fh_len=%d fh_type=%d\n",
-- PFID((struct lu_fid*)fh), fh_len, fh_type);
-+ CDEBUG(D_INFO, "decoding for "DFID" fh_len=%d fh_type=%x\n",
-+ PFID(&nfs_fid->child), fh_len, fh_type);
-
-- if (fh_type != 1 && fh_type != 2)
-- RETURN(ERR_PTR(-ESTALE));
-- if (fh_len < ONE_FH_LEN * fh_type)
-+ if (fh_type != LUSTRE_NFS_FID)
- RETURN(ERR_PTR(-ESTALE));
-
-- child = (struct lu_fid*)fh;
-- if (fh_type == 2)
-- parent = (struct lu_fid*)(fh + ONE_FH_LEN);
--
-- entry = sb->s_export_op->find_exported_dentry(sb, child, parent,
-+ entry = sb->s_export_op->find_exported_dentry(sb, &nfs_fid->child,
-+ &nfs_fid->parent,
- acceptable, context);
- RETURN(entry);
- }
-
--/* The return value is file handle type:
-- * 1 -- contains child file handle;
-- * 2 -- contains child file handle and parent file handle;
-- * 255 -- error.
-- */
--static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen, int connectable)
--{
-- struct inode *inode = de->d_inode;
-- struct lu_fid *fid = ll_inode2fid(inode);
-- ENTRY;
--
-- CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
-- inode->i_ino, PFID(fid), *plen, (int)ONE_FH_LEN);
--
-- if (*plen < ONE_FH_LEN)
-- RETURN(255);
--
-- memcpy((char*)fh, fid, sizeof(*fid));
-- *(fh + ONE_FH_LEN - 1) = (__u32)(S_IFMT & inode->i_mode);
--
-- if (de->d_parent && *plen >= ONE_FH_LEN * 2) {
-- struct inode *parent = de->d_parent->d_inode;
-- fh += ONE_FH_LEN;
-- memcpy((char*)fh, &ll_i2info(parent)->lli_fid, sizeof(*fid));
-- *(fh + ONE_FH_LEN - 1) = (__u32)(S_IFMT & parent->i_mode);
-- *plen = ONE_FH_LEN * 2;
-- RETURN(2);
-- } else {
-- *plen = ONE_FH_LEN;
-- RETURN(1);
-- }
--}
--
- static struct dentry *ll_get_dentry(struct super_block *sb, void *data)
- {
-- struct lu_fid *fid;
-+ struct lustre_nfs_fid *fid = data;
- struct dentry *entry;
-- __u32 mode;
- ENTRY;
-
-- fid = (struct lu_fid *)data;
-- mode = *((__u32*)data + ONE_FH_LEN - 1);
--
-- entry = ll_iget_for_nfs(sb, fid, mode);
-+ entry = ll_iget_for_nfs(sb, &fid->child, fid->mode);
- RETURN(entry);
- }
-+#endif
-
- static struct dentry *ll_get_parent(struct dentry *dchild)
- {
-@@ -232,11 +247,11 @@
- static char dotdot[] = "..";
- int rc;
- ENTRY;
--
-+
- LASSERT(dir && S_ISDIR(dir->i_mode));
--
-+
- sbi = ll_s2sbi(dir->i_sb);
--
-+
- CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
- dir->i_ino, PFID(ll_inode2fid(dir)));
-
-@@ -249,7 +264,7 @@
- }
- body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
- LASSERT(body->valid & OBD_MD_FLID);
--
-+
- CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
- PFID(ll_inode2fid(dir)), PFID(&body->fid1));
-
-@@ -261,7 +276,12 @@
-
- struct export_operations lustre_export_operations = {
- .get_parent = ll_get_parent,
-- .get_dentry = ll_get_dentry,
- .encode_fh = ll_encode_fh,
-+#ifdef HAVE_FH_TO_DENTRY
-+ .fh_to_dentry = ll_fh_to_dentry,
-+ .fh_to_parent = ll_fh_to_parent,
-+#else
-+ .get_dentry = ll_get_dentry,
- .decode_fh = ll_decode_fh,
-+#endif
- };
diff --git a/debian/patches/patchless_support/splice_read_support.dpatch b/debian/patches/patchless_support/splice_read_support.dpatch
deleted file mode 100755
index 9384817..0000000
--- a/debian/patches/patchless_support/splice_read_support.dpatch
+++ /dev/null
@@ -1,423 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/autoconf/lustre-core.m4
-===================================================================
---- HEAD.orig/lustre/autoconf/lustre-core.m4 2008-12-17 16:21:23.000000000 +0200
-+++ HEAD/lustre/autoconf/lustre-core.m4 2008-12-17 16:21:40.000000000 +0200
-@@ -1388,6 +1388,25 @@
- ])
-
- # 2.6.23 change .sendfile to .splice_read
-+# RHEL4 (-92 kernel) have both sendfile and .splice_read API
-+AC_DEFUN([LC_KERNEL_SENDFILE],
-+[AC_MSG_CHECKING([if kernel has .sendfile])
-+LB_LINUX_TRY_COMPILE([
-+ #include <linux/fs.h>
-+],[
-+ struct file_operations file;
-+
-+ file.sendfile = NULL;
-+], [
-+ AC_MSG_RESULT([yes])
-+ AC_DEFINE(HAVE_KERNEL_SENDFILE, 1,
-+ [kernel has .sendfile])
-+],[
-+ AC_MSG_RESULT([no])
-+])
-+])
-+
-+# 2.6.23 change .sendfile to .splice_read
- AC_DEFUN([LC_KERNEL_SPLICE_READ],
- [AC_MSG_CHECKING([if kernel has .splice_read])
- LB_LINUX_TRY_COMPILE([
-@@ -1846,6 +1865,7 @@
- # 2.6.23
- LC_UNREGISTER_BLKDEV_RETURN_INT
- LC_KERNEL_SPLICE_READ
-+ LC_KERNEL_SENDFILE
- LC_HAVE_EXPORTFS_H
- LC_VM_OP_FAULT
- LC_REGISTER_SHRINKER
-Index: HEAD/lustre/obdclass/cl_io.c
-===================================================================
---- HEAD.orig/lustre/obdclass/cl_io.c 2008-11-12 22:58:06.000000000 +0200
-+++ HEAD/lustre/obdclass/cl_io.c 2008-12-17 16:21:40.000000000 +0200
-@@ -75,15 +75,6 @@
- }
-
- /**
-- * True, iff \a io is a sendfile().
-- */
--int cl_io_is_sendfile(const struct cl_io *io)
--{
-- return io->ci_type == CIT_READ && io->u.ci_rd.rd_is_sendfile;
--}
--EXPORT_SYMBOL(cl_io_is_sendfile);
--
--/**
- * Returns true iff there is an IO ongoing in the given environment.
- */
- int cl_io_is_going(const struct lu_env *env)
-Index: HEAD/lustre/include/cl_object.h
-===================================================================
---- HEAD.orig/lustre/include/cl_object.h 2008-11-08 01:52:38.000000000 +0200
-+++ HEAD/lustre/include/cl_object.h 2008-12-17 16:21:40.000000000 +0200
-@@ -2177,6 +2177,16 @@
- int crw_nonblock;
- };
-
-+/* IO subtypes */
-+enum cl_io_subtype {
-+ /** normal IO */
-+ IO_NORMAL,
-+ /** io called from .sendfile */
-+ IO_SENDFILE,
-+ /** io started from splice_{read|write} */
-+ IO_SPLICE
-+};
-+
- /**
- * State for io.
- *
-@@ -2207,7 +2217,6 @@
- union {
- struct cl_rd_io {
- struct cl_io_rw_common rd;
-- int rd_is_sendfile;
- } ci_rd;
- struct cl_wr_io {
- struct cl_io_rw_common wr;
-@@ -2860,8 +2869,6 @@
- return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
- }
-
--int cl_io_is_sendfile(const struct cl_io *io);
--
- struct cl_io *cl_io_top(struct cl_io *io);
-
- void cl_io_print(const struct lu_env *env, void *cookie,
-Index: HEAD/lustre/include/lclient.h
-===================================================================
---- HEAD.orig/lustre/include/lclient.h 2008-12-11 06:02:39.000000000 +0200
-+++ HEAD/lustre/include/lclient.h 2008-12-17 16:21:40.000000000 +0200
-@@ -51,14 +51,19 @@
- * Common IO arguments for various VFS I/O interfaces.
- */
- struct ccc_io_args {
-- int cia_is_sendfile;
-+ /** normal/sendfile/splice */
-+ enum cl_io_subtype cia_io_subtype;
- #ifndef HAVE_FILE_WRITEV
-- struct kiocb *cia_iocb;
-+ struct kiocb *cia_iocb;
- #endif
-- struct iovec *cia_iov;
-- unsigned long cia_nrsegs;
-- read_actor_t cia_actor;
-- void *cia_target;
-+ struct iovec *cia_iov;
-+ unsigned long cia_nrsegs;
-+ /* sendfile */
-+ read_actor_t cia_actor;
-+ void *cia_target;
-+ /* splice */
-+ struct pipe_inode_info *cia_pipe;
-+ unsigned int cia_flags;
- };
-
- /**
-Index: HEAD/lustre/llite/vvp_io.c
-===================================================================
---- HEAD.orig/lustre/llite/vvp_io.c 2008-11-12 23:00:37.000000000 +0200
-+++ HEAD/lustre/llite/vvp_io.c 2008-12-17 16:21:40.000000000 +0200
-@@ -52,6 +52,16 @@
- static struct vvp_io *cl2vvp_io(const struct lu_env *env,
- const struct cl_io_slice *slice);
-
-+/**
-+ * True, if \a io is a normal io, False for sendfile() / splice_{read|write}
-+ */
-+static int vvp_io_is_normalio(const struct lu_env *env, const struct cl_io *io)
-+{
-+ struct vvp_io *vio = vvp_env_io(env);
-+
-+ return vio->ci_io_subtype == IO_NORMAL;
-+}
-+
- /*****************************************************************************
- *
- * io operations.
-@@ -132,7 +142,7 @@
-
- LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
-
-- if (cl_io_is_sendfile(io))
-+ if (!vvp_io_is_normalio(env, io))
- RETURN(0);
-
- for (seg = 0; seg < vio->cui_nrsegs; seg++) {
-@@ -180,7 +190,7 @@
- size_t size = io->u.ci_rw.crw_count;
-
- vio->cui_iov_olen = 0;
-- if (cl_io_is_sendfile(io) || size == vio->cui_tot_count)
-+ if (!vvp_io_is_normalio(env, io) || size == vio->cui_tot_count)
- return;
-
- if (vio->cui_tot_nrsegs == 0)
-@@ -476,11 +486,27 @@
-
- /* BUG: 5972 */
- file_accessed(file);
-- if (cl_io_is_sendfile(io)) {
-+ switch(vio->ci_io_subtype) {
-+ case IO_NORMAL:
-+ result = lustre_generic_file_read(file, cio, &pos);
-+ break;
-+#ifdef HAVE_KERNEL_SENDFILE
-+ case IO_SENDFILE:
- result = generic_file_sendfile(file, &pos, cnt,
-- vio->u.read.cui_actor, vio->u.read.cui_target);
-- } else {
-- result = lustre_generic_file_read(file, cio, &pos);
-+ vio->u.sendfile.cui_actor,
-+ vio->u.sendfile.cui_target);
-+ break;
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ case IO_SPLICE:
-+ result = generic_file_splice_read(file, &pos,
-+ vio->u.splice.pipe, cnt,
-+ vio->u.splice.flags);
-+ break;
-+#endif
-+ default:
-+ CERROR("Wrong IO type %u\n", vio->ci_io_subtype);
-+ LBUG();
- }
-
- if (result >= 0) {
-@@ -622,7 +648,7 @@
-
- CLOBINVRNT(env, obj, ccc_object_invariant(obj));
-
-- if (!cl_io_is_sendfile(io) && io->ci_continue) {
-+ if (vvp_io_is_normalio(env, io) && io->ci_continue) {
- /* update the iov */
- LASSERT(vio->cui_tot_nrsegs >= vio->cui_nrsegs);
- LASSERT(vio->cui_tot_count >= nob);
-Index: HEAD/lustre/llite/llite_internal.h
-===================================================================
---- HEAD.orig/lustre/llite/llite_internal.h 2008-12-17 16:21:24.000000000 +0200
-+++ HEAD/lustre/llite/llite_internal.h 2008-12-17 16:21:40.000000000 +0200
-@@ -791,11 +791,22 @@
- void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
-
- struct vvp_io {
-+ /** io subtype */
-+ enum cl_io_subtype ci_io_subtype;
-+
- union {
-+#ifdef HAVE_KERNEL_SENDFILE
- struct {
- read_actor_t cui_actor;
- void *cui_target;
-- } read;
-+ } sendfile;
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ struct {
-+ struct pipe_inode_info *pipe;
-+ unsigned int flags;
-+ } splice;
-+#endif
- struct vvp_fault_io {
- /**
- * Inode modification time that is checked across DLM
-Index: HEAD/lustre/llite/file.c
-===================================================================
---- HEAD.orig/lustre/llite/file.c 2008-11-20 14:34:31.000000000 +0200
-+++ HEAD/lustre/llite/file.c 2008-12-17 16:26:49.000000000 +0200
-@@ -807,27 +807,43 @@
- io = &ccc_env_info(env)->cti_io;
- ll_io_init(io, file, iot == CIT_WRITE);
-
-- if (iot == CIT_READ)
-- io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
--
- if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
- struct vvp_io *vio = vvp_env_io(env);
- struct ccc_io *cio = ccc_env_io(env);
-- if (cl_io_is_sendfile(io)) {
-- vio->u.read.cui_actor = args->cia_actor;
-- vio->u.read.cui_target = args->cia_target;
-- } else {
-+
-+ vio->ci_io_subtype = args->cia_io_subtype;
-+
-+ switch(vio->ci_io_subtype) {
-+ case IO_NORMAL:
- cio->cui_iov = args->cia_iov;
- cio->cui_nrsegs = args->cia_nrsegs;
- #ifndef HAVE_FILE_WRITEV
- cio->cui_iocb = args->cia_iocb;
- #endif
-+ break;
-+#ifdef HAVE_KERNEL_SENDFILE
-+ case IO_SENDFILE:
-+ vio->u.sendfile.cui_actor = args->cia_actor;
-+ vio->u.sendfile.cui_target = args->cia_target;
-+ break;
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ case IO_SPLICE:
-+ vio->u.splice.pipe = args->cia_pipe;
-+ vio->u.splice.flags = args->cia_flags;
-+ break;
-+#endif
-+ default:
-+ CERROR("Unknow IO type - %u\n", vio->ci_io_subtype);
-+ LBUG();
- }
- cio->cui_fd = LUSTRE_FPRIVATE(file);
- result = cl_io_loop(env, io);
-- } else
-+ } else {
- /* cl_io_rw_init() handled IO */
- result = io->ci_result;
-+ }
-+
- if (io->ci_nob > 0) {
- result = io->ci_nob;
- *ppos = io->u.ci_wr.wr.crw_pos;
-@@ -888,7 +904,7 @@
- RETURN(PTR_ERR(env));
-
- args = &vvp_env_info(env)->vti_args;
-- args->cia_is_sendfile = 0;
-+ args->cia_io_subtype = IO_NORMAL;
- args->cia_iov = (struct iovec *)iov;
- args->cia_nrsegs = nr_segs;
- result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
-@@ -937,7 +953,7 @@
- RETURN(PTR_ERR(env));
-
- args = &vvp_env_info(env)->vti_args;
-- args->cia_is_sendfile = 0;
-+ args->cia_io_subtype = IO_NORMAL;
- args->cia_iov = (struct iovec *)iov;
- args->cia_nrsegs = nr_segs;
- args->cia_iocb = iocb;
-@@ -1002,6 +1018,7 @@
- args = &vvp_env_info(env)->vti_args;
- args->cia_iov = (struct iovec *)iov;
- args->cia_nrsegs = nr_segs;
-+ args->cia_io_subtype = IO_NORMAL;
- result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
- cl_env_put(env, &refcheck);
- RETURN(result);
-@@ -1052,6 +1069,7 @@
- args->cia_iov = (struct iovec *)iov;
- args->cia_nrsegs = nr_segs;
- args->cia_iocb = iocb;
-+ args->cia_io_subtype = IO_NORMAL;
- result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
- &iocb->ki_pos, count);
- cl_env_put(env, &refcheck);
-@@ -1089,6 +1107,7 @@
- #endif
-
-
-+#ifdef HAVE_KERNEL_SENDFILE
- /*
- * Send file content (through pagecache) somewhere with helper
- */
-@@ -1106,13 +1125,43 @@
- RETURN(PTR_ERR(env));
-
- args = &vvp_env_info(env)->vti_args;
-- args->cia_is_sendfile = 1;
-+ args->cia_io_subtype = IO_SENDFILE;
- args->cia_target = target;
- args->cia_actor = actor;
- result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
- cl_env_put(env, &refcheck);
- RETURN(result);
- }
-+#endif
-+
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+/*
-+ * Send file content (through pagecache) somewhere with helper
-+ */
-+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
-+ struct pipe_inode_info *pipe, size_t count,
-+ unsigned int flags)
-+{
-+ struct lu_env *env;
-+ struct ccc_io_args *args;
-+ ssize_t result;
-+ int refcheck;
-+ ENTRY;
-+
-+ env = cl_env_get(&refcheck);
-+ if (IS_ERR(env))
-+ RETURN(PTR_ERR(env));
-+
-+ args = &vvp_env_info(env)->vti_args;
-+ args->cia_io_subtype = IO_SPLICE;
-+ args->cia_pipe = pipe;
-+ args->cia_flags = flags;
-+
-+ result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
-+ cl_env_put(env, &refcheck);
-+ RETURN(result);
-+}
-+#endif
-
- static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
- unsigned long arg)
-@@ -2390,7 +2439,12 @@
- .release = ll_file_release,
- .mmap = ll_file_mmap,
- .llseek = ll_file_seek,
-+#ifdef HAVE_KERNEL_SENDFILE
- .sendfile = ll_file_sendfile,
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ .splice_read = ll_file_splice_read,
-+#endif
- .fsync = ll_fsync,
- };
-
-@@ -2404,7 +2458,12 @@
- .release = ll_file_release,
- .mmap = ll_file_mmap,
- .llseek = ll_file_seek,
-+#ifdef HAVE_KERNEL_SENDFILE
- .sendfile = ll_file_sendfile,
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ .splice_read = ll_file_splice_read,
-+#endif
- .fsync = ll_fsync,
- #ifdef HAVE_F_OP_FLOCK
- .flock = ll_file_flock,
-@@ -2423,7 +2482,12 @@
- .release = ll_file_release,
- .mmap = ll_file_mmap,
- .llseek = ll_file_seek,
-+#ifdef HAVE_KERNEL_SENDFILE
- .sendfile = ll_file_sendfile,
-+#endif
-+#ifdef HAVE_KERNEL_SPLICE_READ
-+ .splice_read = ll_file_splice_read,
-+#endif
- .fsync = ll_fsync,
- #ifdef HAVE_F_OP_FLOCK
- .flock = ll_file_noflock,
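Two points are worth pulling out of the splice patch above. First, 2.6.23
replaced file_operations::sendfile with ->splice_read(), but RHEL4's late
2.6.9 kernels ship both entry points, which is why HAVE_KERNEL_SENDFILE and
HAVE_KERNEL_SPLICE_READ are probed independently instead of being keyed off a
single version. Second, the old rd_is_sendfile boolean no longer scales to
three I/O flavours, hence the cl_io_subtype enum. The resulting ops table, cut
down to the conditional members (handler prototypes as in the patch):

    #include <linux/fs.h>

    static ssize_t ll_file_sendfile(struct file *in, loff_t *ppos,
                                    size_t count, read_actor_t actor,
                                    void *target);
    static ssize_t ll_file_splice_read(struct file *in, loff_t *ppos,
                                       struct pipe_inode_info *pipe,
                                       size_t count, unsigned int flags);

    static struct file_operations ll_file_ops = {
    #ifdef HAVE_KERNEL_SENDFILE
            .sendfile    = ll_file_sendfile,     /* kept for RHEL4 */
    #endif
    #ifdef HAVE_KERNEL_SPLICE_READ
            .splice_read = ll_file_splice_read,  /* 2.6.23+ */
    #endif
            /* remaining members as in the patch */
    };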
diff --git a/debian/patches/patchless_support/sysctl_update.dpatch b/debian/patches/patchless_support/sysctl_update.dpatch
deleted file mode 100755
index 23078a7..0000000
--- a/debian/patches/patchless_support/sysctl_update.dpatch
+++ /dev/null
@@ -1,278 +0,0 @@
-#! /bin/sh /usr/share/dpatch/dpatch-run
-## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
-##
-## All lines beginning with `## DP:' are a description of the patch.
-## DP: Patch which will enable 2.6.24 patchless support for lustre, taken from #14250
-
-@DPATCH@
-Index: HEAD/lustre/obdclass/linux/linux-sysctl.c
-===================================================================
---- HEAD.orig/lustre/obdclass/linux/linux-sysctl.c 2008-08-12 11:40:23.000000000 +0400
-+++ HEAD/lustre/obdclass/linux/linux-sysctl.c 2008-12-05 21:13:18.000000000 +0300
-@@ -56,7 +56,9 @@
-
- cfs_sysctl_table_header_t *obd_table_header = NULL;
-
--#define OBD_SYSCTL 300
-+#ifndef HAVE_SYSCTL_UNNUMBERED
-+
-+#define CTL_LUSTRE 300
-
- enum {
- OBD_FAIL_LOC = 1, /* control test failures instrumentation */
-@@ -74,6 +76,23 @@
- OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
- OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */
- };
-+#else
-+#define CTL_LUSTRE CTL_UNNUMBERED
-+#define OBD_FAIL_LOC CTL_UNNUMBERED
-+#define OBD_FAIL_VAL CTL_UNNUMBERED
-+#define OBD_TIMEOUT CTL_UNNUMBERED
-+#define OBD_DUMP_ON_TIMEOUT CTL_UNNUMBERED
-+#define OBD_MEMUSED CTL_UNNUMBERED
-+#define OBD_PAGESUSED CTL_UNNUMBERED
-+#define OBD_MAXMEMUSED CTL_UNNUMBERED
-+#define OBD_MAXPAGESUSED CTL_UNNUMBERED
-+#define OBD_SYNCFILTER CTL_UNNUMBERED
-+#define OBD_LDLM_TIMEOUT CTL_UNNUMBERED
-+#define OBD_DUMP_ON_EVICTION CTL_UNNUMBERED
-+#define OBD_DEBUG_PEER_ON_TIMEOUT CTL_UNNUMBERED
-+#define OBD_ALLOC_FAIL_RATE CTL_UNNUMBERED
-+#define OBD_MAX_DIRTY_PAGES CTL_UNNUMBERED
-+#endif
-
- int LL_PROC_PROTO(proc_fail_loc)
- {
-@@ -100,6 +119,7 @@
- {
- char buf[22];
- int len;
-+ struct ctl_table dummy;
- DECLARE_LL_PROC_PPOS_DECL;
-
- if (!*lenp || (*ppos && !write)) {
-@@ -113,17 +133,19 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-- return -EFAULT;
-- *lenp = len;
-- *ppos += *lenp;
-- return 0;
-+
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ return ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
- }
-
- int LL_PROC_PROTO(proc_pages_alloc)
- {
- char buf[22];
- int len;
-+ struct ctl_table dummy;
- DECLARE_LL_PROC_PPOS_DECL;
-
- if (!*lenp || (*ppos && !write)) {
-@@ -137,17 +159,19 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-- return -EFAULT;
-- *lenp = len;
-- *ppos += *lenp;
-- return 0;
-+
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ return ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
- }
-
- int LL_PROC_PROTO(proc_mem_max)
- {
- char buf[22];
- int len;
-+ struct ctl_table dummy;
- DECLARE_LL_PROC_PPOS_DECL;
-
- if (!*lenp || (*ppos && !write)) {
-@@ -161,17 +185,19 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-- return -EFAULT;
-- *lenp = len;
-- *ppos += *lenp;
-- return 0;
-+
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ return ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
- }
-
- int LL_PROC_PROTO(proc_pages_max)
- {
- char buf[22];
- int len;
-+ struct ctl_table dummy;
- DECLARE_LL_PROC_PPOS_DECL;
-
- if (!*lenp || (*ppos && !write)) {
-@@ -185,11 +211,12 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-- return -EFAULT;
-- *lenp = len;
-- *ppos += *lenp;
-- return 0;
-+
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ return ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
- }
-
- int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
-@@ -216,7 +243,8 @@
- obd_max_dirty_pages = 4 << (20 - CFS_PAGE_SHIFT);
- }
- } else {
-- char buf[21];
-+ char buf[22];
-+ struct ctl_table dummy;
- int len;
-
- len = lprocfs_read_frac_helper(buf, sizeof(buf),
-@@ -225,7 +253,13 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-+
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ rc = ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
-+ if (rc)
- return -EFAULT;
- *lenp = len;
- }
-@@ -248,7 +282,8 @@
- (unsigned int*)table->data,
- OBD_ALLOC_FAIL_MULT);
- } else {
-- char buf[21];
-+ char buf[22];
-+ struct ctl_table dummy;
- int len;
-
- len = lprocfs_read_frac_helper(buf, 21,
-@@ -257,7 +292,12 @@
- if (len > *lenp)
- len = *lenp;
- buf[len] = '\0';
-- if (copy_to_user(buffer, buf, len))
-+ dummy = *table;
-+ dummy.data = buf;
-+ dummy.maxlen = sizeof(buf);
-+
-+ rc = ll_proc_dostring(&dummy,write,filp,buffer,lenp, ppos);
-+ if(rc)
- return -EFAULT;
- *lenp = len;
- }
-@@ -281,7 +321,8 @@
- .data = &obd_fail_val,
- .maxlen = sizeof(int),
- .mode = 0644,
-- .proc_handler = &proc_dointvec
-+ .proc_handler = &proc_dointvec,
-+ .strategy = &sysctl_intvec,
- },
- {
- .ctl_name = OBD_TIMEOUT,
-@@ -297,7 +338,7 @@
- .data = &obd_debug_peer_on_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
-- .proc_handler = &proc_dointvec
-+ .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = OBD_DUMP_ON_TIMEOUT,
-@@ -305,7 +346,7 @@
- .data = &obd_dump_on_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
-- .proc_handler = &proc_dointvec
-+ .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = OBD_DUMP_ON_EVICTION,
-@@ -313,7 +354,7 @@
- .data = &obd_dump_on_eviction,
- .maxlen = sizeof(int),
- .mode = 0644,
-- .proc_handler = &proc_dointvec
-+ .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = OBD_MEMUSED,
-@@ -321,7 +362,7 @@
- .data = NULL,
- .maxlen = 0,
- .mode = 0444,
-- .proc_handler = &proc_memory_alloc
-+ .proc_handler = &proc_memory_alloc,
- },
- {
- .ctl_name = OBD_PAGESUSED,
-@@ -329,7 +370,7 @@
- .data = NULL,
- .maxlen = 0,
- .mode = 0444,
-- .proc_handler = &proc_pages_alloc
-+ .proc_handler = &proc_pages_alloc,
- },
- {
- .ctl_name = OBD_MAXMEMUSED,
-@@ -337,7 +378,7 @@
- .data = NULL,
- .maxlen = 0,
- .mode = 0444,
-- .proc_handler = &proc_mem_max
-+ .proc_handler = &proc_mem_max,
- },
- {
- .ctl_name = OBD_MAXPAGESUSED,
-@@ -345,7 +386,7 @@
- .data = NULL,
- .maxlen = 0,
- .mode = 0444,
-- .proc_handler = &proc_pages_max
-+ .proc_handler = &proc_pages_max,
- },
- {
- .ctl_name = OBD_LDLM_TIMEOUT,
-@@ -378,7 +419,7 @@
-
- static cfs_sysctl_table_t parent_table[] = {
- {
-- .ctl_name = OBD_SYSCTL,
-+ .ctl_name = CTL_LUSTRE,
- .procname = "lustre",
- .data = NULL,
- .maxlen = 0,
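The driver for the sysctl churn above: 2.6.24 deprecated static binary sysctl
IDs, so constants like OBD_SYSCTL 300 collapse to CTL_UNNUMBERED wherever the
kernel provides it, leaving the entries reachable by procname only. The other
recurring change, copying into a stack ctl_table and calling
ll_proc_dostring(), funnels the old hand-rolled copy_to_user() sequences
through one handler. A condensed illustration of the numbering side (table
trimmed to one entry, default value illustrative):

    #include <linux/sysctl.h>

    #ifdef HAVE_SYSCTL_UNNUMBERED
    #define CTL_LUSTRE   CTL_UNNUMBERED   /* name-only registration */
    #define OBD_TIMEOUT  CTL_UNNUMBERED
    #else
    #define CTL_LUSTRE   300              /* historical binary IDs */
    #define OBD_TIMEOUT  3
    #endif

    static int obd_timeout = 100;

    static struct ctl_table obd_table[] = {
            {
                    .ctl_name     = OBD_TIMEOUT,
                    .procname     = "timeout",
                    .data         = &obd_timeout,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = &proc_dointvec,
            },
            { .ctl_name = 0 }
    };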
diff --git a/debian/patches/patchless_support_2.6.26.dpatch b/debian/patches/patchless_support_2.6.26.dpatch
new file mode 100755
index 0000000..2173bde
--- /dev/null
+++ b/debian/patches/patchless_support_2.6.26.dpatch
@@ -0,0 +1,16673 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## posix_acl.patch by Patrick Winnertz <winnie@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Patch which will enable 2.6.26 patchless support for lustre, taken from #14250
+
+@DPATCH@
+diff -urNad lustre~/lnet/autoconf/lustre-lnet.m4 lustre/lnet/autoconf/lustre-lnet.m4
+--- lustre~/lnet/autoconf/lustre-lnet.m4 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lnet/autoconf/lustre-lnet.m4 2009-03-12 11:02:51.000000000 +0100
+@@ -1362,6 +1362,22 @@
+ ])
+ ])
+
++# 2.6.27 has a second argument to sock_map_fd
++AC_DEFUN([LN_SOCK_MAP_FD_2ARG],
++[AC_MSG_CHECKING([sock_map_fd have second argument])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/net.h>
++],[
++ sock_map_fd(NULL, 0);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_SOCK_MAP_FD_2ARG, 1,
++ [sock_map_fd have second argument])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
+ #
+ # LN_PROG_LINUX
+ #
+@@ -1410,6 +1426,8 @@
+ LN_SCATTERLIST_SETPAGE
+ # 2.6.26
+ LN_SEM_COUNT
++# 2.6.27
++LN_SOCK_MAP_FD_2ARG
+ ])
+
+ #
+diff -urNad lustre~/lnet/libcfs/linux/linux-prim.c lustre/lnet/libcfs/linux/linux-prim.c
+--- lustre~/lnet/libcfs/linux/linux-prim.c 2008-08-07 11:51:06.000000000 +0200
++++ lustre/lnet/libcfs/linux/linux-prim.c 2009-03-12 11:02:51.000000000 +0100
+@@ -49,7 +49,7 @@
+ void cfs_enter_debugger(void)
+ {
+ #if defined(CONFIG_KGDB)
+- BREAKPOINT();
++// BREAKPOINT();
+ #elif defined(__arch_um__)
+ asm("int $3");
+ #else
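The KGDB hunk above simply comments the trap out, since newer kernels dropped
the BREAKPOINT() macro and the build breaks with it. Because BREAKPOINT is a
preprocessor macro, a guard that keeps the trap where it still exists would
also have worked; a sketch of that alternative (not what the patch does):

    void cfs_enter_debugger(void)
    {
    #if defined(CONFIG_KGDB) && defined(BREAKPOINT)
            BREAKPOINT();           /* only where the old macro survives */
    #elif defined(__arch_um__)
            asm("int $3");
    #else
            /* no debugger trap available in this configuration */
    #endif
    }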
+diff -urNad lustre~/lnet/libcfs/linux/linux-tcpip.c lustre/lnet/libcfs/linux/linux-tcpip.c
+--- lustre~/lnet/libcfs/linux/linux-tcpip.c 2008-08-07 11:51:07.000000000 +0200
++++ lustre/lnet/libcfs/linux/linux-tcpip.c 2009-03-12 11:02:51.000000000 +0100
+@@ -63,7 +63,11 @@
+ return rc;
+ }
+
++#ifdef HAVE_SOCK_MAP_FD_2ARG
++ fd = sock_map_fd(sock,0);
++#else
+ fd = sock_map_fd(sock);
++#endif
+ if (fd < 0) {
+ rc = fd;
+ sock_release(sock);
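Rather than repeating this #ifdef at every call site, the two-argument
sock_map_fd() could be hidden behind a single inline built on the
HAVE_SOCK_MAP_FD_2ARG symbol the patch defines; a sketch (the wrapper name is
made up):

    #include <linux/net.h>

    static inline int cfs_sock_map_fd(struct socket *sock)
    {
    #ifdef HAVE_SOCK_MAP_FD_2ARG
            return sock_map_fd(sock, 0);  /* 2.6.27+ takes O_* flags */
    #else
            return sock_map_fd(sock);
    #endif
    }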
+diff -urNad lustre~/lnet/lnet/api-ni.c lustre/lnet/lnet/api-ni.c
+--- lustre~/lnet/lnet/api-ni.c 2009-03-12 10:21:27.000000000 +0100
++++ lustre/lnet/lnet/api-ni.c 2009-03-12 11:02:51.000000000 +0100
+@@ -1032,7 +1032,7 @@
+ #ifdef __KERNEL__
+ if (lnd == NULL) {
+ LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+- rc = request_module(libcfs_lnd2modname(lnd_type));
++ rc = request_module("%s", libcfs_lnd2modname(lnd_type));
+ LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
+
+ lnd = lnet_find_lnd_by_type(lnd_type);
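The api-ni.c change fixes a classic format-string bug: request_module() takes
a printf-style format, so passing the computed module name directly lets any
'%' in it be interpreted, and newer kernels' __printf annotations turn the
pattern into a compile-time warning or error. In miniature:

    #include <linux/kmod.h>

    static int load_lnd_module(const char *modname)
    {
            /* WRONG: modname itself becomes the format string */
            /* return request_module(modname); */

            /* RIGHT: constant format, name passed as an argument */
            return request_module("%s", modname);
    }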
+diff -urNad lustre~/lustre/autoconf/lustre-core.m4 lustre/lustre/autoconf/lustre-core.m4
+--- lustre~/lustre/autoconf/lustre-core.m4 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/autoconf/lustre-core.m4 2009-03-12 11:07:59.000000000 +0100
+@@ -1106,15 +1106,20 @@
+ AC_DEFUN([LC_PAGE_CHECKED],
+ [AC_MSG_CHECKING([kernel has PageChecked and SetPageChecked])
+ LB_LINUX_TRY_COMPILE([
+- #include <linux/mm.h>
+- #include <linux/page-flags.h>
++ #include <linux/autoconf.h>
++#ifdef HAVE_LINUX_MMTYPES_H
++ #include <linux/mm_types.h>
++#endif
++ #include <linux/page-flags.h>
+ ],[
+- #ifndef PageChecked
+- #error PageChecked not defined in kernel
+- #endif
+- #ifndef SetPageChecked
+- #error SetPageChecked not defined in kernel
+- #endif
++ struct page *p;
++
++ /* before 2.6.26 PageChecked was a define */
++ #ifndef PageChecked
++ /* 2.6.26 provides a function instead of the define */
++ SetPageChecked(p);
++ PageChecked(p);
++ #endif
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PAGE_CHECKED, 1,
+@@ -1232,6 +1237,9 @@
+ ])
+ ])
+
++# 2.6.18
++
++
+ # 2.6.23 have return type 'void' for unregister_blkdev
+ AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT],
+ [AC_MSG_CHECKING([if unregister_blkdev return int])
+@@ -1249,6 +1257,25 @@
+ ])
+
+ # 2.6.23 change .sendfile to .splice_read
++# RHEL4 (-92 kernel) have both sendfile and .splice_read API
++AC_DEFUN([LC_KERNEL_SENDFILE],
++[AC_MSG_CHECKING([if kernel has .sendfile])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations file;
++
++ file.sendfile = NULL;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_KERNEL_SENDFILE, 1,
++ [kernel has .sendfile])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.23 change .sendfile to .splice_read
+ AC_DEFUN([LC_KERNEL_SPLICE_READ],
+ [AC_MSG_CHECKING([if kernel has .splice_read])
+ LB_LINUX_TRY_COMPILE([
+@@ -1268,11 +1295,219 @@
+
+ # 2.6.23 extract nfs export related data into exportfs.h
+ AC_DEFUN([LC_HAVE_EXPORTFS_H],
+-[
+-tmpfl="$CFLAGS"
+-CFLAGS="$CFLAGS -I$LINUX_OBJ/include"
+-AC_CHECK_HEADERS([linux/exportfs.h])
+-CFLAGS="$tmpfl"
++[LB_CHECK_FILE([$LINUX/include/linux/exportfs.h], [
++ AC_DEFINE(HAVE_LINUX_EXPORTFS_H, 1,
++ [kernel has include/exportfs.h])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.23 have new page fault handling API
++AC_DEFUN([LC_VM_OP_FAULT],
++[AC_MSG_CHECKING([if kernel has .fault in vm_operation_struct])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++],[
++ struct vm_operations_struct op;
++
++ op.fault = NULL;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_VM_OP_FAULT, 1,
++ [if kernel has .fault in vm_operation_struct])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#2.6.23 has new shrinker API
++AC_DEFUN([LC_REGISTER_SHRINKER],
++[AC_MSG_CHECKING([if kernel has register_shrinker])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++],[
++ register_shrinker(NULL);
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_REGISTER_SHRINKER, 1,
++ [if kernel has register_shrinker])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.24 has bio_endio with 2 args
++AC_DEFUN([LC_BIO_ENDIO_2ARG],
++[AC_MSG_CHECKING([if kernel has bio_endio with 2 args])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/bio.h>
++],[
++ bio_endio(NULL, 0);
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_BIO_ENDIO_2ARG, 1,
++ [if kernel has bio_endio with 2 args])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.24 has new members in exports struct.
++AC_DEFUN([LC_FH_TO_DENTRY],
++[AC_MSG_CHECKING([if kernel has .fh_to_dentry member in export_operations struct])
++LB_LINUX_TRY_COMPILE([
++#ifdef HAVE_LINUX_EXPORTFS_H
++ #include <linux/exportfs.h>
++#else
++ #include <linux/fs.h>
++#endif
++],[
++ struct export_operations exp;
++
++ exp.fh_to_dentry = NULL;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FH_TO_DENTRY, 1,
++ [kernel has .fh_to_dentry member in export_operations struct])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.24 need linux/mm_types.h included
++AC_DEFUN([LC_HAVE_MMTYPES_H],
++[LB_CHECK_FILE([$LINUX/include/linux/mm_types.h], [
++ AC_DEFINE(HAVE_LINUX_MMTYPES_H, 1,
++ [kernel has include/mm_types.h])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.24 remove long aged procfs entry -> deleted member
++AC_DEFUN([LC_PROCFS_DELETED],
++[AC_MSG_CHECKING([if kernel has deleted member in procfs entry struct])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/proc_fs.h>
++],[
++ struct proc_dir_entry pde;
++
++ pde.deleted = NULL;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_PROCFS_DELETED, 1,
++ [kernel has deleted member in procfs entry struct])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.25 change define to inline
++AC_DEFUN([LC_MAPPING_CAP_WRITEBACK_DIRTY],
++[AC_MSG_CHECKING([if kernel have mapping_cap_writeback_dirty])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/backing-dev.h>
++],[
++ #ifndef mapping_cap_writeback_dirty
++ mapping_cap_writeback_dirty(NULL);
++ #endif
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_MAPPING_CAP_WRITEBACK_DIRTY, 1,
++ [kernel have mapping_cap_writeback_dirty])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++
++
++# 2.6.26 does not export set_fs_pwd and changes fs_struct to use struct path
++AC_DEFUN([LC_FS_STRUCT_USE_PATH],
++[AC_MSG_CHECKING([fs_struct use path structure])
++LB_LINUX_TRY_COMPILE([
++ #include <asm/atomic.h>
++ #include <linux/spinlock.h>
++ #include <linux/fs_struct.h>
++],[
++ struct path path;
++ struct fs_struct fs;
++
++ fs.pwd = path;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FS_STRUCT_USE_PATH, 1,
++ [fs_struct use path structure])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.26 remove path_release and use path_put instead
++AC_DEFUN([LC_PATH_RELEASE],
++[AC_MSG_CHECKING([if path_release exist])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/dcache.h>
++ #include <linux/namei.h>
++],[
++ path_release(NULL);
++],[
++ AC_DEFINE(HAVE_PATH_RELEASE, 1, [path_release exist])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#2.6.27
++AC_DEFUN([LC_INODE_PERMISION_2ARGS],
++[AC_MSG_CHECKING([inode_operations->permission have two args])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct inode *inode;
++
++ inode->i_op->permission(NULL,0);
++],[
++ AC_DEFINE(HAVE_INODE_PERMISION_2ARGS, 1,
++ [inode_operations->permission have two args])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.27 have file_remove_suid instead of remove_suid
++AC_DEFUN([LC_FILE_REMOVE_SUID],
++[AC_MSG_CHECKING([kernel have file_remove_suid])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ file_remove_suid(NULL);
++],[
++ AC_DEFINE(HAVE_FILE_REMOVE_SUID, 1,
++ [kernel have file_remove_suid])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.27 have new page locking API
++AC_DEFUN([LC_TRYLOCKPAGE],
++[AC_MSG_CHECKING([kernel use trylock_page for page lock])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/pagemap.h>
++],[
++ trylock_page(NULL);
++],[
++ AC_DEFINE(HAVE_TRYLOCK_PAGE, 1,
++ [kernel use trylock_page for page lock])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
+ ])
+
+ #
+@@ -1372,8 +1607,30 @@
+ LC_FS_RENAME_DOES_D_MOVE
+ # 2.6.23
+ LC_UNREGISTER_BLKDEV_RETURN_INT
++ LC_KERNEL_SENDFILE
+ LC_KERNEL_SPLICE_READ
+ LC_HAVE_EXPORTFS_H
++ LC_VM_OP_FAULT
++ LC_REGISTER_SHRINKER
++
++ #2.6.25
++ LC_MAPPING_CAP_WRITEBACK_DIRTY
++
++ # 2.6.24
++ LC_HAVE_MMTYPES_H
++ LC_BIO_ENDIO_2ARG
++ LC_FH_TO_DENTRY
++ LC_PROCFS_DELETED
++
++ # 2.6.26
++ LC_FS_STRUCT_USE_PATH
++ LC_RCU_LIST_SAFE
++ LC_PATH_RELEASE
++
++ # 2.6.27
++ LC_INODE_PERMISION_2ARGS
++ LC_FILE_REMOVE_SUID
++ LC_TRYLOCKPAGE
+ ])
+
+ #
+@@ -1606,6 +1863,7 @@
+ ],[
+ AC_MSG_RESULT([no])
+ ])
++
+ ],[
+ AC_MSG_RESULT([no])
+ ])
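The HAVE_* symbols probed above are consumed by small shims elsewhere in the
patch. HAVE_TRYLOCK_PAGE, for instance, pairs naturally with the usual
compatibility define (a sketch of the common idiom: 2.6.27 renamed the
non-blocking page-lock primitive, and the older TestSetPageLocked() returns
the previous bit value, so its negation matches trylock_page()'s
acquired/not-acquired sense):

    #include <linux/pagemap.h>

    /* provide the 2.6.27 spelling on older kernels */
    #ifndef HAVE_TRYLOCK_PAGE
    #define trylock_page(page)  (!TestSetPageLocked(page))
    #endif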
+diff -urNad lustre~/lustre/autoconf/lustre-core.m4.orig lustre/lustre/autoconf/lustre-core.m4.orig
+--- lustre~/lustre/autoconf/lustre-core.m4.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/autoconf/lustre-core.m4.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,1817 @@
++#* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++#* vim:expandtab:shiftwidth=8:tabstop=8:
++#
++# LC_CONFIG_SRCDIR
++#
++# Wrapper for AC_CONFIG_SUBDIR
++#
++AC_DEFUN([LC_CONFIG_SRCDIR],
++[AC_CONFIG_SRCDIR([lustre/obdclass/obdo.c])
++])
++
++#
++# LC_PATH_DEFAULTS
++#
++# lustre specific paths
++#
++AC_DEFUN([LC_PATH_DEFAULTS],
++[# ptlrpc kernel build requires this
++LUSTRE="$PWD/lustre"
++AC_SUBST(LUSTRE)
++
++# mount.lustre
++rootsbindir='/sbin'
++AC_SUBST(rootsbindir)
++
++demodir='$(docdir)/demo'
++AC_SUBST(demodir)
++
++pkgexampledir='${pkgdatadir}/examples'
++AC_SUBST(pkgexampledir)
++])
++
++#
++# LC_TARGET_SUPPORTED
++#
++# is the target os supported?
++#
++AC_DEFUN([LC_TARGET_SUPPORTED],
++[case $target_os in
++ linux* | darwin*)
++$1
++ ;;
++ *)
++$2
++ ;;
++esac
++])
++
++#
++# LC_CONFIG_EXT3
++#
++# that ext3 is enabled in the kernel
++#
++AC_DEFUN([LC_CONFIG_EXT3],
++[LB_LINUX_CONFIG([EXT3_FS],[],[
++ LB_LINUX_CONFIG([EXT3_FS_MODULE],[],[$2])
++])
++LB_LINUX_CONFIG([EXT3_FS_XATTR],[$1],[$3])
++])
++
++#
++# LC_FSHOOKS
++#
++# If we have (and can build) fshooks.h
++#
++AC_DEFUN([LC_FSHOOKS],
++[LB_CHECK_FILE([$LINUX/include/linux/fshooks.h],[
++ AC_MSG_CHECKING([if fshooks.h can be compiled])
++ LB_LINUX_TRY_COMPILE([
++ #include <linux/fshooks.h>
++ ],[],[
++ AC_MSG_RESULT([yes])
++ ],[
++ AC_MSG_RESULT([no])
++ AC_MSG_WARN([You might have better luck with gcc 3.3.x.])
++ AC_MSG_WARN([You can set CC=gcc33 before running configure.])
++ AC_MSG_ERROR([Your compiler cannot build fshooks.h.])
++ ])
++$1
++],[
++$2
++])
++])
++
++#
++# LC_STRUCT_KIOBUF
++#
++# rh 2.4.18 has iobuf->dovary, but other kernels do not
++#
++AC_DEFUN([LC_STRUCT_KIOBUF],
++[AC_MSG_CHECKING([if struct kiobuf has a dovary field])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/iobuf.h>
++],[
++ struct kiobuf iobuf;
++ iobuf.dovary = 1;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_KIOBUF_DOVARY, 1, [struct kiobuf has a dovary field])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_COND_RESCHED
++#
++# cond_resched() was introduced in 2.4.20
++#
++AC_DEFUN([LC_FUNC_COND_RESCHED],
++[AC_MSG_CHECKING([if kernel offers cond_resched])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/sched.h>
++],[
++ cond_resched();
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_COND_RESCHED, 1, [cond_resched found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_ZAP_PAGE_RANGE
++#
++# if zap_page_range() takes a vma arg
++#
++AC_DEFUN([LC_FUNC_ZAP_PAGE_RANGE],
++[AC_MSG_CHECKING([if zap_page_range with vma parameter])
++ZAP_PAGE_RANGE_VMA="`grep -c 'zap_page_range.*struct vm_area_struct' $LINUX/include/linux/mm.h`"
++if test "$ZAP_PAGE_RANGE_VMA" != 0 ; then
++ AC_DEFINE(ZAP_PAGE_RANGE_VMA, 1, [zap_page_range with vma parameter])
++ AC_MSG_RESULT([yes])
++else
++ AC_MSG_RESULT([no])
++fi
++])
++
++#
++# LC_FUNC_PDE
++#
++# if proc_fs.h defines PDE()
++#
++AC_DEFUN([LC_FUNC_PDE],
++[AC_MSG_CHECKING([if kernel defines PDE])
++HAVE_PDE="`grep -c 'proc_dir_entry..PDE' $LINUX/include/linux/proc_fs.h`"
++if test "$HAVE_PDE" != 0 ; then
++ AC_DEFINE(HAVE_PDE, 1, [the kernel defines PDE])
++ AC_MSG_RESULT([yes])
++else
++ AC_MSG_RESULT([no])
++fi
++])
++
++#
++# LC_FUNC_FILEMAP_FDATASYNC
++#
++# if filemap_fdatasync() exists
++#
++AC_DEFUN([LC_FUNC_FILEMAP_FDATAWRITE],
++[AC_MSG_CHECKING([whether filemap_fdatawrite() is defined])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int (*foo)(struct address_space *)= filemap_fdatawrite;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FILEMAP_FDATAWRITE, 1, [filemap_fdatawrite() found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_DIRECT_IO
++#
++# if direct_IO takes a struct file argument
++#
++AC_DEFUN([LC_FUNC_DIRECT_IO],
++[AC_MSG_CHECKING([if kernel passes struct file to direct_IO])
++HAVE_DIO_FILE="`grep -c 'direct_IO.*struct file' $LINUX/include/linux/fs.h`"
++if test "$HAVE_DIO_FILE" != 0 ; then
++ AC_DEFINE(HAVE_DIO_FILE, 1, [the kernel passes struct file to direct_IO])
++ AC_MSG_RESULT(yes)
++else
++ AC_MSG_RESULT(no)
++fi
++])
++
++#
++# LC_HEADER_MM_INLINE
++#
++# RHEL kernels define page_count in mm_inline.h
++#
++AC_DEFUN([LC_HEADER_MM_INLINE],
++[AC_MSG_CHECKING([if kernel has mm_inline.h header])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm_inline.h>
++],[
++ #ifndef page_count
++ #error mm_inline.h does not define page_count
++ #endif
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_MM_INLINE, 1, [mm_inline found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_STRUCT_INODE
++#
++# if inode->i_alloc_sem exists
++#
++AC_DEFUN([LC_STRUCT_INODE],
++[AC_MSG_CHECKING([if struct inode has i_alloc_sem])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++ #include <linux/version.h>
++],[
++ struct inode i;
++ return (char *)&i.i_alloc_sem - (char *)&i;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_I_ALLOC_SEM, 1, [struct inode has i_alloc_sem])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_REGISTER_CACHE
++#
++# if register_cache() is defined by kernel
++#
++AC_DEFUN([LC_FUNC_REGISTER_CACHE],
++[AC_MSG_CHECKING([if kernel defines register_cache()])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/list.h>
++ #include <linux/cache_def.h>
++],[
++ struct cache_definition cache;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_REGISTER_CACHE, 1, [register_cache found])
++ AC_MSG_CHECKING([if kernel expects return from cache shrink function])
++ HAVE_CACHE_RETURN_INT="`grep -c 'int.*shrink' $LINUX/include/linux/cache_def.h`"
++ if test "$HAVE_CACHE_RETURN_INT" != 0 ; then
++ AC_DEFINE(HAVE_CACHE_RETURN_INT, 1, [kernel expects return from shrink_cache])
++ AC_MSG_RESULT(yes)
++ else
++ AC_MSG_RESULT(no)
++ fi
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP
++#
++# check for our patched grab_cache_page_nowait_gfp() function
++#
++AC_DEFUN([LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP],
++[AC_MSG_CHECKING([if kernel defines grab_cache_page_nowait_gfp()])
++HAVE_GCPN_GFP="`grep -c 'grab_cache_page_nowait_gfp' $LINUX/include/linux/pagemap.h`"
++if test "$HAVE_GCPN_GFP" != 0 ; then
++ AC_DEFINE(HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP, 1,
++ [kernel has grab_cache_page_nowait_gfp()])
++ AC_MSG_RESULT(yes)
++else
++ AC_MSG_RESULT(no)
++fi
++])
++
++#
++# LC_FUNC_DEV_SET_RDONLY
++#
++# check for the new-style dev_set_rdonly (the old interface took an extra
++# "devno" param and could only set one device to discard writes at a time)
++#
++AC_DEFUN([LC_FUNC_DEV_SET_RDONLY],
++[AC_MSG_CHECKING([if kernel has new dev_set_rdonly])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ #ifndef HAVE_CLEAR_RDONLY_ON_PUT
++ #error needs to be patched by lustre kernel patches from Lustre version 1.4.3 or above.
++ #endif
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_DEV_SET_RDONLY, 1, [kernel has new dev_set_rdonly])
++],[
++ AC_MSG_RESULT([no, Linux kernel source needs to be patched by lustre
++kernel patches from Lustre version 1.4.3 or above.])
++])
++])
++
++#
++# LC_CONFIG_BACKINGFS
++#
++# setup, check the backing filesystem
++#
++AC_DEFUN([LC_CONFIG_BACKINGFS],
++[
++BACKINGFS="ldiskfs"
++
++if test x$with_ldiskfs = xno ; then
++ BACKINGFS="ext3"
++
++ if test x$linux25$enable_server = xyesyes ; then
++ AC_MSG_ERROR([ldiskfs is required for 2.6-based servers.])
++ fi
++
++ # --- Check that ext3 and ext3 xattr are enabled in the kernel
++ LC_CONFIG_EXT3([],[
++ AC_MSG_ERROR([Lustre requires that ext3 is enabled in the kernel])
++ ],[
++ AC_MSG_WARN([Lustre requires that extended attributes for ext3 are enabled in the kernel])
++ AC_MSG_WARN([This build may fail.])
++ ])
++else
++ # ldiskfs is enabled
++ LB_DEFINE_LDISKFS_OPTIONS
++fi #ldiskfs
++
++AC_MSG_CHECKING([which backing filesystem to use])
++AC_MSG_RESULT([$BACKINGFS])
++AC_SUBST(BACKINGFS)
++])
++
++#
++# LC_CONFIG_PINGER
++#
++# the pinger is temporary, until we have the recovery node in place
++#
++AC_DEFUN([LC_CONFIG_PINGER],
++[AC_MSG_CHECKING([whether to enable pinger support])
++AC_ARG_ENABLE([pinger],
++ AC_HELP_STRING([--disable-pinger],
++ [disable recovery pinger support]),
++ [],[enable_pinger='yes'])
++AC_MSG_RESULT([$enable_pinger])
++if test x$enable_pinger != xno ; then
++ AC_DEFINE(ENABLE_PINGER, 1, Use the Pinger)
++fi
++])
++
++#
++# LC_CONFIG_CHECKSUM
++#
++# do checksum of bulk data between client and OST
++#
++AC_DEFUN([LC_CONFIG_CHECKSUM],
++[AC_MSG_CHECKING([whether to enable data checksum support])
++AC_ARG_ENABLE([checksum],
++ AC_HELP_STRING([--disable-checksum],
++ [disable data checksum support]),
++ [],[enable_checksum='yes'])
++AC_MSG_RESULT([$enable_checksum])
++if test x$enable_checksum != xno ; then
++ AC_DEFINE(ENABLE_CHECKSUM, 1, do data checksums)
++fi
++])
++
++#
++# LC_CONFIG_HEALTH_CHECK_WRITE
++#
++# Turn on the actual write to the disk
++#
++AC_DEFUN([LC_CONFIG_HEALTH_CHECK_WRITE],
++[AC_MSG_CHECKING([whether to enable a write with the health check])
++AC_ARG_ENABLE([health-write],
++ AC_HELP_STRING([--enable-health-write],
++ [enable disk writes when doing health check]),
++ [],[enable_health_write='no'])
++AC_MSG_RESULT([$enable_health_write])
++if test x$enable_health_write = xyes ; then
++ AC_DEFINE(USE_HEALTH_CHECK_WRITE, 1, Write when Checking Health)
++fi
++])
++
++#
++# LC_CONFIG_LIBLUSTRE_RECOVERY
++#
++AC_DEFUN([LC_CONFIG_LIBLUSTRE_RECOVERY],
++[AC_MSG_CHECKING([whether to enable liblustre recovery support])
++AC_ARG_ENABLE([liblustre-recovery],
++ AC_HELP_STRING([--disable-liblustre-recovery],
++ [disable liblustre recovery support]),
++ [],[enable_liblustre_recovery='yes'])
++AC_MSG_RESULT([$enable_liblustre_recovery])
++if test x$enable_liblustre_recovery != xno ; then
++ AC_DEFINE(ENABLE_LIBLUSTRE_RECOVERY, 1, Liblustre Can Recover)
++fi
++])
++
++#
++# LC_CONFIG_OBD_BUFFER_SIZE
++#
++# the maximum buffer size of lctl ioctls
++#
++AC_DEFUN([LC_CONFIG_OBD_BUFFER_SIZE],
++[AC_MSG_CHECKING([maximum OBD ioctl size])
++AC_ARG_WITH([obd-buffer-size],
++ AC_HELP_STRING([--with-obd-buffer-size=[size]],
++ [set lctl ioctl maximum bytes (default=8192)]),
++ [
++ OBD_BUFFER_SIZE=$with_obd_buffer_size
++ ],[
++ OBD_BUFFER_SIZE=8192
++ ])
++AC_MSG_RESULT([$OBD_BUFFER_SIZE bytes])
++AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size])
++])
++
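The option macros above all follow the same AC_ARG_ENABLE/AC_ARG_WITH pattern;
an illustrative invocation (output abbreviated):

    $ ./configure --disable-pinger --enable-health-write --with-obd-buffer-size=16384
    checking whether to enable pinger support... no
    checking whether to enable a write with the health check... yes
    checking maximum OBD ioctl size... 16384 bytes
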
++#
++# LC_STRUCT_STATFS
++#
++# AIX does not have statfs.f_namelen
++#
++AC_DEFUN([LC_STRUCT_STATFS],
++[AC_MSG_CHECKING([if struct statfs has an f_namelen field])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/vfs.h>
++],[
++ struct statfs sfs;
++ sfs.f_namelen = 1;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_STATFS_NAMELEN, 1, [struct statfs has a namelen field])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_READLINK_SSIZE_T
++#
++AC_DEFUN([LC_READLINK_SSIZE_T],
++[AC_MSG_CHECKING([if readlink returns ssize_t])
++AC_TRY_COMPILE([
++ #include <unistd.h>
++],[
++ ssize_t readlink(const char *, char *, size_t);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_POSIX_1003_READLINK, 1, [readlink returns ssize_t])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++AC_DEFUN([LC_FUNC_PAGE_MAPPED],
++[AC_MSG_CHECKING([if kernel offers page_mapped])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++],[
++ page_mapped(NULL);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_PAGE_MAPPED, 1, [page_mapped found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL],
++[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations fops;
++ &fops.unlocked_ioctl;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlocked_ioctl field])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++AC_DEFUN([LC_FILEMAP_POPULATE],
++[AC_MSG_CHECKING([for exported filemap_populate])
++LB_LINUX_TRY_COMPILE([
++ #include <asm/page.h>
++ #include <linux/mm.h>
++],[
++ filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++AC_DEFUN([LC_D_ADD_UNIQUE],
++[AC_MSG_CHECKING([for d_add_unique])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/dcache.h>
++],[
++ d_add_unique(NULL, NULL);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++AC_DEFUN([LC_BIT_SPINLOCK_H],
++[LB_CHECK_FILE([$LINUX/include/linux/bit_spinlock.h],[
++ AC_MSG_CHECKING([if bit_spinlock.h can be compiled])
++ LB_LINUX_TRY_COMPILE([
++ #include <asm/processor.h>
++ #include <linux/spinlock.h>
++ #include <linux/bit_spinlock.h>
++ ],[],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_BIT_SPINLOCK_H, 1, [Kernel has bit_spinlock.h])
++ ],[
++ AC_MSG_RESULT([no])
++ ])
++],
++[])
++])
++
++#
++# LC_XATTR_ACL
++#
++# If we have xattr_acl.h
++#
++AC_DEFUN([LC_XATTR_ACL],
++[LB_CHECK_FILE([$LINUX/include/linux/xattr_acl.h],[
++ AC_MSG_CHECKING([if xattr_acl.h can be compiled])
++ LB_LINUX_TRY_COMPILE([
++ #include <linux/xattr_acl.h>
++ ],[],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_XATTR_ACL, 1, [Kernel has xattr_acl])
++ ],[
++ AC_MSG_RESULT([no])
++ ])
++],
++[])
++])
++
++#
++# LC_LINUX_FIEMAP_H
++#
++# If we have fiemap.h
++# after 2.6.27 use fiemap.h in include/linux
++#
++AC_DEFUN([LC_LINUX_FIEMAP_H],
++[LB_CHECK_FILE([$LINUX/include/linux/fiemap.h],[
++ AC_MSG_CHECKING([if fiemap.h can be compiled])
++ LB_LINUX_TRY_COMPILE([
++ #include <linux/fiemap.h>
++ ],[],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_LINUX_FIEMAP_H, 1, [Kernel has fiemap.h])
++ ],[
++ AC_MSG_RESULT([no])
++ ])
++],
++[])
++])
++
++
++AC_DEFUN([LC_STRUCT_INTENT_FILE],
++[AC_MSG_CHECKING([if struct open_intent has a file field])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++ #include <linux/namei.h>
++],[
++ struct open_intent intent;
++ &intent.file;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FILE_IN_STRUCT_INTENT, 1, [struct open_intent has a file field])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++
++AC_DEFUN([LC_POSIX_ACL_XATTR_H],
++[LB_CHECK_FILE([$LINUX/include/linux/posix_acl_xattr.h],[
++ AC_MSG_CHECKING([if linux/posix_acl_xattr.h can be compiled])
++ LB_LINUX_TRY_COMPILE([
++ #include <linux/posix_acl_xattr.h>
++ ],[],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_LINUX_POSIX_ACL_XATTR_H, 1, [linux/posix_acl_xattr.h found])
++
++ ],[
++ AC_MSG_RESULT([no])
++ ])
++$1
++],[
++AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_EXPORT___IGET
++# starting from 2.6.19 the Linux kernel exports __iget()
++#
++AC_DEFUN([LC_EXPORT___IGET],
++[LB_CHECK_SYMBOL_EXPORT([__iget],
++[fs/inode.c],[
++ AC_DEFINE(HAVE_EXPORT___IGET, 1, [kernel exports __iget])
++],[
++])
++])
++
++
++AC_DEFUN([LC_LUSTRE_VERSION_H],
++[LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
++ rm -f "$LUSTRE/include/linux/lustre_version.h"
++],[
++ touch "$LUSTRE/include/linux/lustre_version.h"
++ if test x$enable_server = xyes ; then
++ AC_MSG_WARN([Unpatched kernel detected.])
++ AC_MSG_WARN([Lustre servers cannot be built with an unpatched kernel;])
++ AC_MSG_WARN([disabling server build])
++ enable_server='no'
++ fi
++])
++])
++
++AC_DEFUN([LC_FUNC_SET_FS_PWD],
++[LB_CHECK_SYMBOL_EXPORT([set_fs_pwd],
++[fs/namespace.c],[
++ AC_DEFINE(HAVE_SET_FS_PWD, 1, [set_fs_pwd is exported])
++],[
++])
++])
++
++#
++# check for FS_RENAME_DOES_D_MOVE flag
++#
++AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE],
++[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int v = FS_RENAME_DOES_D_MOVE;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_MS_FLOCK_LOCK
++#
++# SLES9 kernel has MS_FLOCK_LOCK sb flag
++#
++AC_DEFUN([LC_FUNC_MS_FLOCK_LOCK],
++[AC_MSG_CHECKING([if kernel has MS_FLOCK_LOCK sb flag])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int flags = MS_FLOCK_LOCK;
++],[
++ AC_DEFINE(HAVE_MS_FLOCK_LOCK, 1,
++ [kernel has MS_FLOCK_LOCK flag])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_HAVE_CAN_SLEEP_ARG
++#
++# SLES9 kernel has third arg can_sleep
++# in fs/locks.c: flock_lock_file_wait()
++#
++AC_DEFUN([LC_FUNC_HAVE_CAN_SLEEP_ARG],
++[AC_MSG_CHECKING([if kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int cansleep;
++ struct file *file;
++ struct file_lock *file_lock;
++ flock_lock_file_wait(file, file_lock, cansleep);
++],[
++ AC_DEFINE(HAVE_CAN_SLEEP_ARG, 1,
++ [kernel has third arg can_sleep in fs/locks.c: flock_lock_file_wait()])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_F_OP_FLOCK
++#
++# rhel4.2 kernel has f_op->flock field
++#
++AC_DEFUN([LC_FUNC_F_OP_FLOCK],
++[AC_MSG_CHECKING([if struct file_operations has flock field])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations ll_file_operations_flock;
++ ll_file_operations_flock.flock = NULL;
++],[
++ AC_DEFINE(HAVE_F_OP_FLOCK, 1,
++ [struct file_operations has flock field])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_TASK_PPTR
++#
++# task struct has p_pptr instead of parent
++#
++AC_DEFUN([LC_TASK_PPTR],
++[AC_MSG_CHECKING([if task_struct has p_pptr])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/sched.h>
++],[
++ struct task_struct *p;
++
++ p = p->p_pptr;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_TASK_PPTR, 1, [task p_pptr found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# LC_INODE_I_MUTEX
++# after 2.6.15 the inode has i_mutex instead of i_sem
++AC_DEFUN([LC_INODE_I_MUTEX],
++[AC_MSG_CHECKING([if inode has i_mutex])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mutex.h>
++ #include <linux/fs.h>
++ #undef i_mutex
++],[
++ struct inode i;
++
++ mutex_unlock(&i.i_mutex);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_INODE_I_MUTEX, 1,
++ [after 2.6.15 the inode has i_mutex instead of i_sem])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# LC_DQUOTOFF_MUTEX
++# after 2.6.17 quota uses a mutex instead of a semaphore
++AC_DEFUN([LC_DQUOTOFF_MUTEX],
++[AC_MSG_CHECKING([if quota uses dqonoff_mutex])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mutex.h>
++ #include <linux/fs.h>
++ #include <linux/quota.h>
++],[
++ struct quota_info dq;
++
++ mutex_unlock(&dq.dqonoff_mutex);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_DQUOTOFF_MUTEX, 1,
++ [after 2.6.17 quota uses a mutex instead of a semaphore])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++#
++# LC_STATFS_DENTRY_PARAM
++# starting from 2.6.18 the Linux kernel uses a dentry instead of
++# a super_block as the first vfs_statfs argument
++#
++AC_DEFUN([LC_STATFS_DENTRY_PARAM],
++[AC_MSG_CHECKING([if first vfs_statfs parameter is dentry])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int vfs_statfs(struct dentry *, struct kstatfs *);
++],[
++ AC_DEFINE(HAVE_STATFS_DENTRY_PARAM, 1,
++ [first parameter of vfs_statfs is dentry])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_VFS_KERN_MOUNT
++# starting from 2.6.18 the kernel doesn't export do_kern_mount
++# and vfs_kern_mount must be used instead.
++#
++AC_DEFUN([LC_VFS_KERN_MOUNT],
++[AC_MSG_CHECKING([if vfs_kern_mount exists in kernel])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mount.h>
++],[
++ vfs_kern_mount(NULL, 0, NULL, NULL);
++],[
++ AC_DEFINE(HAVE_VFS_KERN_MOUNT, 1,
++ [vfs_kern_mount exists in kernel])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_INVALIDATEPAGE_RETURN_INT
++# more 2.6 API changes. the return type of the invalidatepage
++# address_space_operation is 'void' in new kernels but 'int' in old
++#
++AC_DEFUN([LC_INVALIDATEPAGE_RETURN_INT],
++[AC_MSG_CHECKING([if invalidatepage returns int])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/buffer_head.h>
++],[
++ int rc = block_invalidatepage(NULL, 0);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_INVALIDATEPAGE_RETURN_INT, 1,
++ [Define if return type of invalidatepage should be int])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
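Call sites can then hide the return-type difference behind a trivial wrapper;
one possible shape (the ll_invalidatepage name is ours, for illustration only):

    #ifdef HAVE_INVALIDATEPAGE_RETURN_INT
    /* old kernels: discard the int result so both variants look alike */
    # define ll_invalidatepage(page, off) ((void)block_invalidatepage((page), (off)))
    #else
    # define ll_invalidatepage(page, off) block_invalidatepage((page), (off))
    #endif
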
++# LC_UMOUNTBEGIN_HAS_VFSMOUNT
++# more 2.6 API changes. 2.6.18 umount_begin has different parameters
++AC_DEFUN([LC_UMOUNTBEGIN_HAS_VFSMOUNT],
++[AC_MSG_CHECKING([if umount_begin needs vfsmount parameter instead of super_block])
++tmp_flags="$EXTRA_KCFLAGS"
++EXTRA_KCFLAGS="-Werror"
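++# -Werror promotes the pointer-type mismatch below from a warning to an
++# error, so the probe fails cleanly on kernels with the old prototype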
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++
++ struct vfsmount;
++ static void cfg_umount_begin (struct vfsmount *v, int flags)
++ {
++ ;
++ }
++
++ static struct super_operations cfg_super_operations = {
++ .umount_begin = cfg_umount_begin,
++ };
++],[
++ cfg_super_operations.umount_begin(NULL,0);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_UMOUNTBEGIN_VFSMOUNT, 1,
++ [Define umount_begin need second argument])
++],[
++ AC_MSG_RESULT(no)
++])
++EXTRA_KCFLAGS="$tmp_flags"
++])
++
++# 2.6.19 API changes
++# the inode no longer has an i_blksize field
++AC_DEFUN([LC_INODE_BLKSIZE],
++[AC_MSG_CHECKING([if inode has i_blksize field])
++LB_LINUX_TRY_COMPILE([
++#include <linux/fs.h>
++],[
++ struct inode i;
++ i.i_blksize = 0;
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_INODE_BLKSIZE, 1,
++ [struct inode has i_blksize field])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# LC_VFS_READDIR_U64_INO
++# 2.6.19 uses u64 for the inode number instead of ino_t
++AC_DEFUN([LC_VFS_READDIR_U64_INO],
++[AC_MSG_CHECKING([if vfs_readdir needs a 64-bit inode number])
++tmp_flags="$EXTRA_KCFLAGS"
++EXTRA_KCFLAGS="-Werror"
++LB_LINUX_TRY_COMPILE([
++#include <linux/fs.h>
++ int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
++ u64 ino, unsigned int d_type)
++ {
++ return 0;
++ }
++],[
++ filldir_t filter;
++
++ filter = fillonedir;
++ return 1;
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_VFS_READDIR_U64_INO, 1,
++ [vfs_readdir needs a 64-bit inode number])
++],[
++ AC_MSG_RESULT(no)
++])
++EXTRA_KCFLAGS="$tmp_flags"
++])
++
++# LC_FILE_WRITEV
++# 2.6.19 replaced writev with aio_write
++AC_DEFUN([LC_FILE_WRITEV],
++[AC_MSG_CHECKING([writev in fops])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations *fops = NULL;
++ fops->writev = NULL;
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_FILE_WRITEV, 1,
++ [use fops->writev])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# LC_FILE_READV
++# 2.6.19 replaced readv with aio_read
++AC_DEFUN([LC_FILE_READV],
++[AC_MSG_CHECKING([readv in fops])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations *fops = NULL;
++ fops->readv = NULL;
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_FILE_READV, 1,
++ [use fops->readv])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# LC_NR_PAGECACHE
++# 2.6.18 doesn't export nr_pagecache
++AC_DEFUN([LC_NR_PAGECACHE],
++[AC_MSG_CHECKING([if kernel exports nr_pagecache])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/pagemap.h>
++],[
++ return atomic_read(&nr_pagecache);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_NR_PAGECACHE, 1,
++ [kernel exports nr_pagecache])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# LC_CANCEL_DIRTY_PAGE
++# 2.6.20 introduces cancel_dirty_page instead of
++# clear_page_dirty.
++AC_DEFUN([LC_CANCEL_DIRTY_PAGE],
++[AC_MSG_CHECKING([if kernel has cancel_dirty_page])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++ #include <linux/page-flags.h>
++],[
++ cancel_dirty_page(NULL, 0);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_CANCEL_DIRTY_PAGE, 1,
++ [kernel has cancel_dirty_page instead of clear_page_dirty])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++#
++# LC_PAGE_CONSTANT
++#
++# In order to support the raid5 zerocopy patch, we have to patch the kernel
++# to support constant pages, meaning the page won't be modified during the
++# IO.
++#
++AC_DEFUN([LC_PAGE_CONSTANT],
++[AC_MSG_CHECKING([if kernel has PageConstant defined])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++ #include <linux/page-flags.h>
++],[
++ #ifndef PG_constant
++ #error "Have no raid5 zcopy patch"
++ #endif
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_PAGE_CONSTANT, 1, [kernel has PageConstant support])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# the RHEL5 FS-cache patch renames the PG_checked flag
++# to PG_fs_misc
++AC_DEFUN([LC_PG_FS_MISC],
++[AC_MSG_CHECKING([if kernel has PG_fs_misc])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++ #include <linux/page-flags.h>
++],[
++ #ifndef PG_fs_misc
++ #error PG_fs_misc not defined in kernel
++ #endif
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_PG_FS_MISC, 1,
++ [kernel has PG_fs_misc])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++# RHEL5 has PageChecked and SetPageChecked defined
++AC_DEFUN([LC_PAGE_CHECKED],
++[AC_MSG_CHECKING([if kernel has PageChecked and SetPageChecked])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/mm.h>
++ #include <linux/page-flags.h>
++],[
++ #ifndef PageChecked
++ #error PageChecked not defined in kernel
++ #endif
++ #ifndef SetPageChecked
++ #error SetPageChecked not defined in kernel
++ #endif
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_PAGE_CHECKED, 1,
++ [kernel has PageChecked and SetPageChecked])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++AC_DEFUN([LC_EXPORT_TRUNCATE_COMPLETE],
++[LB_CHECK_SYMBOL_EXPORT([truncate_complete_page],
++[mm/truncate.c],[
++AC_DEFINE(HAVE_TRUNCATE_COMPLETE_PAGE, 1,
++ [kernel export truncate_complete_page])
++],[
++])
++])
++
++AC_DEFUN([LC_EXPORT_D_REHASH_COND],
++[LB_CHECK_SYMBOL_EXPORT([d_rehash_cond],
++[fs/dcache.c],[
++AC_DEFINE(HAVE_D_REHASH_COND, 1,
++ [d_rehash_cond is exported by the kernel])
++],[
++])
++])
++
++AC_DEFUN([LC_EXPORT___D_REHASH],
++[LB_CHECK_SYMBOL_EXPORT([__d_rehash],
++[fs/dcache.c],[
++AC_DEFINE(HAVE___D_REHASH, 1,
++ [__d_rehash is exported by the kernel])
++],[
++])
++])
++
++AC_DEFUN([LC_EXPORT_D_MOVE_LOCKED],
++[LB_CHECK_SYMBOL_EXPORT([d_move_locked],
++[fs/dcache.c],[
++AC_DEFINE(HAVE_D_MOVE_LOCKED, 1,
++ [d_move_locked is exported by the kernel])
++],[
++])
++])
++
++AC_DEFUN([LC_EXPORT___D_MOVE],
++[LB_CHECK_SYMBOL_EXPORT([__d_move],
++[fs/dcache.c],[
++AC_DEFINE(HAVE___D_MOVE, 1,
++ [__d_move is exported by the kernel])
++],[
++])
++])
++
++# The actual symbol exported varies among architectures, so we need
++# to check many symbols (but only in the current architecture.) No
++# matter what symbol is exported, the kernel #defines node_to_cpumask
++# to the appropriate function and that's what we use.
++AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK],
++ [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask],
++ [arch/$LINUX_ARCH/mm/numa.c],
++ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
++ [node_to_cpumask is exported by
++ the kernel])]) # x86_64
++ LB_CHECK_SYMBOL_EXPORT([node_to_cpu_mask],
++ [arch/$LINUX_ARCH/kernel/smpboot.c],
++ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
++ [node_to_cpumask is exported by
++ the kernel])]) # ia64
++ LB_CHECK_SYMBOL_EXPORT([node_2_cpu_mask],
++ [arch/$LINUX_ARCH/kernel/smpboot.c],
++ [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1,
++ [node_to_cpumask is exported by
++ the kernel])]) # i386
++ ])
++
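For reference, LB_CHECK_SYMBOL_EXPORT answers these questions without
compiling anything; roughly (a sketch of the mechanism, not the verbatim
macro):

    # exported from the listed source file, or listed in Module.symvers?
    grep -q "EXPORT_SYMBOL.*(node_to_cpumask)" \
        $LINUX/arch/$LINUX_ARCH/mm/numa.c ||
    grep -qw "node_to_cpumask" $LINUX/Module.symvers
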
++#
++# LC_VFS_INTENT_PATCHES
++#
++# check if the kernel has the VFS intent patches
++AC_DEFUN([LC_VFS_INTENT_PATCHES],
++[AC_MSG_CHECKING([if the kernel has the VFS intent patches])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++ #include <linux/namei.h>
++],[
++ struct nameidata nd;
++ struct lookup_intent *it;
++
++ it = &nd.intent;
++ intent_init(it, IT_OPEN);
++ it->d.lustre.it_disposition = 0;
++ it->d.lustre.it_data = NULL;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_VFS_INTENT_PATCHES, 1, [VFS intent patches are applied])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.22 dropped the second parameter of invalidate_bdev
++AC_DEFUN([LC_INVALIDATE_BDEV_2ARG],
++[AC_MSG_CHECKING([if invalidate_bdev has second argument])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/buffer_head.h>
++],[
++ invalidate_bdev(NULL,0);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_INVALIDATE_BDEV_2ARG, 1,
++ [invalidate_bdev has second argument])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.23 changed the return type of unregister_blkdev to 'void'
++AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT],
++[AC_MSG_CHECKING([if unregister_blkdev returns int])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ int i = unregister_blkdev(0,NULL);
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_UNREGISTER_BLKDEV_RETURN_INT, 1,
++ [unregister_blkdev returns int])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.23 changed .sendfile to .splice_read
++AC_DEFUN([LC_KERNEL_SPLICE_READ],
++[AC_MSG_CHECKING([if kernel has .splice_read])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct file_operations file;
++
++ file.splice_read = NULL;
++], [
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(HAVE_KERNEL_SPLICE_READ, 1,
++ [kernel has .splice_read])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++# 2.6.23 extracted NFS export-related data into exportfs.h
++AC_DEFUN([LC_HAVE_EXPORTFS_H],
++[
++tmpfl="$CFLAGS"
++CFLAGS="$CFLAGS -I$LINUX_OBJ/include"
++AC_CHECK_HEADERS([linux/exportfs.h])
++CFLAGS="$tmpfl"
++])
++
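AC_CHECK_HEADERS defines HAVE_LINUX_EXPORTFS_H on success, so C consumers can
pick the right header; a minimal sketch:

    /* export_operations moved to its own header in 2.6.23 */
    #ifdef HAVE_LINUX_EXPORTFS_H
    # include <linux/exportfs.h>
    #else
    # include <linux/fs.h>   /* older kernels keep export_operations here */
    #endif
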
++#
++# LC_PROG_LINUX
++#
++# Lustre linux kernel checks
++#
++AC_DEFUN([LC_PROG_LINUX],
++ [LC_LUSTRE_VERSION_H
++ if test x$enable_server = xyes ; then
++ LC_CONFIG_BACKINGFS
++ fi
++ LC_CONFIG_PINGER
++ LC_CONFIG_CHECKSUM
++ LC_CONFIG_LIBLUSTRE_RECOVERY
++ LC_CONFIG_HEALTH_CHECK_WRITE
++ LC_CONFIG_LRU_RESIZE
++ LC_CONFIG_ADAPTIVE_TIMEOUTS
++ LC_QUOTA_MODULE
++
++ LC_TASK_PPTR
++ # RHEL4 patches
++ LC_EXPORT_TRUNCATE_COMPLETE
++ LC_EXPORT_D_REHASH_COND
++ LC_EXPORT___D_REHASH
++ LC_EXPORT_D_MOVE_LOCKED
++ LC_EXPORT___D_MOVE
++ LC_EXPORT_NODE_TO_CPUMASK
++
++ LC_STRUCT_KIOBUF
++ LC_FUNC_COND_RESCHED
++ LC_FUNC_ZAP_PAGE_RANGE
++ LC_FUNC_PDE
++ LC_FUNC_DIRECT_IO
++ LC_HEADER_MM_INLINE
++ LC_STRUCT_INODE
++ LC_FUNC_REGISTER_CACHE
++ LC_FUNC_GRAB_CACHE_PAGE_NOWAIT_GFP
++ LC_FUNC_DEV_SET_RDONLY
++ LC_FUNC_FILEMAP_FDATAWRITE
++ LC_STRUCT_STATFS
++ LC_FUNC_PAGE_MAPPED
++ LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL
++ LC_FILEMAP_POPULATE
++ LC_D_ADD_UNIQUE
++ LC_BIT_SPINLOCK_H
++ LC_XATTR_ACL
++ LC_STRUCT_INTENT_FILE
++ LC_POSIX_ACL_XATTR_H
++ LC_EXPORT___IGET
++ LC_FUNC_SET_FS_PWD
++ LC_FUNC_MS_FLOCK_LOCK
++ LC_FUNC_HAVE_CAN_SLEEP_ARG
++ LC_FUNC_F_OP_FLOCK
++ LC_QUOTA_READ
++ LC_COOKIE_FOLLOW_LINK
++ LC_FUNC_RCU
++ LC_QUOTA64
++
++ # does the kernel have VFS intent patches?
++ LC_VFS_INTENT_PATCHES
++
++ # 2.6.15
++ LC_INODE_I_MUTEX
++
++ # 2.6.16
++ LC_SECURITY_PLUG # for SLES10 SP2
++
++ # 2.6.17
++ LC_DQUOTOFF_MUTEX
++
++ # 2.6.18
++ LC_NR_PAGECACHE
++ LC_STATFS_DENTRY_PARAM
++ LC_VFS_KERN_MOUNT
++ LC_INVALIDATEPAGE_RETURN_INT
++ LC_UMOUNTBEGIN_HAS_VFSMOUNT
++
++ #2.6.18 + RHEL5 (fc6)
++ LC_PG_FS_MISC
++ LC_PAGE_CHECKED
++
++ # 2.6.19
++ LC_INODE_BLKSIZE
++ LC_VFS_READDIR_U64_INO
++ LC_FILE_WRITEV
++ LC_FILE_READV
++
++ # 2.6.20
++ LC_CANCEL_DIRTY_PAGE
++
++ # raid5-zerocopy patch
++ LC_PAGE_CONSTANT
++
++ # 2.6.22
++ LC_INVALIDATE_BDEV_2ARG
++ LC_FS_RENAME_DOES_D_MOVE
++ # 2.6.23
++ LC_UNREGISTER_BLKDEV_RETURN_INT
++ LC_KERNEL_SPLICE_READ
++ LC_HAVE_EXPORTFS_H
++])
++
++#
++# LC_CONFIG_CLIENT_SERVER
++#
++# Build client/server sides of Lustre
++#
++AC_DEFUN([LC_CONFIG_CLIENT_SERVER],
++[AC_MSG_CHECKING([whether to build Lustre server support])
++AC_ARG_ENABLE([server],
++ AC_HELP_STRING([--disable-server],
++ [disable Lustre server support]),
++ [],[enable_server='yes'])
++AC_MSG_RESULT([$enable_server])
++
++AC_MSG_CHECKING([whether to build Lustre client support])
++AC_ARG_ENABLE([client],
++ AC_HELP_STRING([--disable-client],
++ [disable Lustre client support]),
++ [],[enable_client='yes'])
++AC_MSG_RESULT([$enable_client])])
++
++#
++# LC_CONFIG_LIBLUSTRE
++#
++# whether to build liblustre
++#
++AC_DEFUN([LC_CONFIG_LIBLUSTRE],
++[AC_MSG_CHECKING([whether to build Lustre library])
++AC_ARG_ENABLE([liblustre],
++ AC_HELP_STRING([--disable-liblustre],
++ [disable building of Lustre library]),
++ [],[enable_liblustre=$with_sysio])
++AC_MSG_RESULT([$enable_liblustre])
++# only build sysio if liblustre is built
++with_sysio="$enable_liblustre"
++
++AC_MSG_CHECKING([whether to build liblustre tests])
++AC_ARG_ENABLE([liblustre-tests],
++ AC_HELP_STRING([--enable-liblustre-tests],
++ [enable liblustre tests, if --disable-tests is used]),
++ [],[enable_liblustre_tests=$enable_tests])
++if test x$enable_liblustre != xyes ; then
++ enable_liblustre_tests='no'
++fi
++AC_MSG_RESULT([$enable_liblustre_tests])
++
++AC_MSG_CHECKING([whether to enable liblustre acl])
++AC_ARG_ENABLE([liblustre-acl],
++ AC_HELP_STRING([--disable-liblustre-acl],
++ [disable ACL support for liblustre]),
++ [],[enable_liblustre_acl=yes])
++AC_MSG_RESULT([$enable_liblustre_acl])
++if test x$enable_liblustre_acl = xyes ; then
++ AC_DEFINE(LIBLUSTRE_POSIX_ACL, 1, Liblustre Support ACL-enabled MDS)
++fi
++
++#
++# --enable-mpitests
++#
++AC_ARG_ENABLE(mpitests,
++ AC_HELP_STRING([--enable-mpitests=yes|no|mpich directory],
++ [include mpi tests]),
++ [
++ enable_mpitests=yes
++ case $enableval in
++ yes)
++ MPI_ROOT=/opt/mpich
++ LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
++ CFLAGS="$CFLAGS -I$MPI_ROOT/include"
++ ;;
++ no)
++ enable_mpitests=no
++ ;;
++ [[\\/$]]* | ?:[[\\/]]* )
++ MPI_ROOT=$enableval
++ LDFLAGS="$LDFLAGS -L$with_mpi/lib"
++ CFLAGS="$CFLAGS -I$MPI_ROOT/include"
++ ;;
++ *)
++ AC_MSG_ERROR([expected absolute directory name for --enable-mpitests or yes or no])
++ ;;
++ esac
++ ],
++ [
++ MPI_ROOT=/opt/mpich
++ LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
++ CFLAGS="$CFLAGS -I$MPI_ROOT/include"
++ enable_mpitests=yes
++ ]
++)
++AC_SUBST(MPI_ROOT)
++
++if test x$enable_mpitests != xno; then
++ AC_MSG_CHECKING([whether mpitests can be built])
++ AC_CHECK_FILE([$MPI_ROOT/include/mpi.h],
++ [AC_CHECK_LIB([mpich],[MPI_Start],[enable_mpitests=yes],[enable_mpitests=no])],
++ [enable_mpitests=no])
++fi
++AC_MSG_RESULT([$enable_mpitests])
++
++
++AC_MSG_NOTICE([Enabling Lustre configure options for libsysio])
++ac_configure_args="$ac_configure_args --with-lustre-hack --with-sockets"
++
++LC_CONFIG_PINGER
++LC_CONFIG_LIBLUSTRE_RECOVERY
++])
++
++AC_DEFUN([LC_CONFIG_LRU_RESIZE],
++[AC_MSG_CHECKING([whether to enable LRU self-adjustment])
++AC_ARG_ENABLE([lru_resize],
++ AC_HELP_STRING([--enable-lru-resize],
++ [enable lru resize support]),
++ [],[enable_lru_resize='yes'])
++AC_MSG_RESULT([$enable_lru_resize])
++if test x$enable_lru_resize != xno; then
++ AC_DEFINE(HAVE_LRU_RESIZE_SUPPORT, 1, [Enable lru resize support])
++fi
++])
++
++AC_DEFUN([LC_CONFIG_ADAPTIVE_TIMEOUTS],
++[AC_MSG_CHECKING([whether to enable ptlrpc adaptive timeouts support])
++AC_ARG_ENABLE([adaptive_timeouts],
++ AC_HELP_STRING([--enable-adaptive-timeouts],
++ [enable ptlrpc adaptive timeouts support]),
++ [],[enable_adaptive_timeouts='no'])
++AC_MSG_RESULT([$enable_adaptive_timeouts])
++if test x$enable_adaptive_timeouts = xyes; then
++ AC_DEFINE(HAVE_AT_SUPPORT, 1, [Enable adaptive timeouts support])
++fi
++])
++
++#
++# LC_CONFIG_QUOTA
++#
++# whether to enable quota support global control
++#
++AC_DEFUN([LC_CONFIG_QUOTA],
++[AC_ARG_ENABLE([quota],
++ AC_HELP_STRING([--enable-quota],
++ [enable quota support]),
++ [],[enable_quota='yes'])
++])
++
++# whether to enable quota support(kernel modules)
++AC_DEFUN([LC_QUOTA_MODULE],
++[if test x$enable_quota != xno; then
++ LB_LINUX_CONFIG([QUOTA],[
++ enable_quota_module='yes'
++ AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
++ ],[
++ enable_quota_module='no'
++ AC_MSG_WARN([quota is not enabled because the kernel lacks quota support])
++ ])
++fi
++])
++
++AC_DEFUN([LC_QUOTA],
++[#check global
++LC_CONFIG_QUOTA
++#check for utils
++AC_CHECK_HEADER(sys/quota.h,
++ [AC_DEFINE(HAVE_SYS_QUOTA_H, 1, [Define to 1 if you have <sys/quota.h>.])],
++ [AC_MSG_ERROR([<sys/quota.h> was not found on your system])])
++])
++
++AC_DEFUN([LC_QUOTA_READ],
++[AC_MSG_CHECKING([if kernel supports quota_read])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct super_operations sp;
++ void *i = (void *)sp.quota_read;
++],[
++ AC_MSG_RESULT([yes])
++ AC_DEFINE(KERNEL_SUPPORTS_QUOTA_READ, 1, [quota_read found])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_COOKIE_FOLLOW_LINK
++#
++# kernel 2.6.13+ ->follow_link returns a cookie
++#
++
++AC_DEFUN([LC_COOKIE_FOLLOW_LINK],
++[AC_MSG_CHECKING([if inode_operations->follow_link returns a cookie])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++ #include <linux/namei.h>
++],[
++ struct dentry dentry;
++ struct nameidata nd;
++
++ dentry.d_inode->i_op->put_link(&dentry, &nd, NULL);
++],[
++ AC_DEFINE(HAVE_COOKIE_FOLLOW_LINK, 1, [inode_operations->follow_link returns a cookie])
++ AC_MSG_RESULT([yes])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
++#
++# LC_FUNC_RCU
++#
++# kernels prior to 2.6.0(?) have no RCU support; in kernel 2.6.5 (SUSE),
++# call_rcu takes three parameters.
++#
++AC_DEFUN([LC_FUNC_RCU],
++[AC_MSG_CHECKING([if kernel has RCU support])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/rcupdate.h>
++],[],[
++ AC_DEFINE(HAVE_RCU, 1, [have RCU defined])
++ AC_MSG_RESULT([yes])
++
++ AC_MSG_CHECKING([if call_rcu takes three parameters])
++ LB_LINUX_TRY_COMPILE([
++ #include <linux/rcupdate.h>
++ ],[
++ struct rcu_head rh;
++ call_rcu(&rh, (void (*)(struct rcu_head *))1, NULL);
++ ],[
++ AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters])
++ AC_MSG_RESULT([yes])
++ ],[
++ AC_MSG_RESULT([no])
++ ])
++],[
++ AC_MSG_RESULT([no])
++])
++])
++
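A consumer would paper over the three-parameter SUSE variant with a wrapper;
one possible shape (the cfs_call_rcu name is illustrative):

    #ifdef HAVE_CALL_RCU_PARAM
    /* SUSE 2.6.5: call_rcu(head, func, arg) -- pass a NULL arg */
    # define cfs_call_rcu(head, func) call_rcu((head), (func), NULL)
    #else
    # define cfs_call_rcu(head, func) call_rcu((head), (func))
    #endif
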
++#
++# LC_QUOTA64
++# the Linux kernel may have 64-bit quota limit support
++#
++AC_DEFUN([LC_QUOTA64],
++[AC_MSG_CHECKING([if kernel has 64-bit quota limits support])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/kernel.h>
++ #include <linux/fs.h>
++ #include <linux/quotaio_v2.h>
++ int versions[] = V2_INITQVERSIONS_R1;
++ struct v2_disk_dqblk_r1 dqblk_r1;
++],[],[
++ AC_DEFINE(HAVE_QUOTA64, 1, [have quota64])
++ AC_MSG_RESULT([yes])
++
++],[
++ AC_MSG_WARN([4 TB (or larger) block quota limits can only be used with OSTs not larger than 4 TB.])
++ AC_MSG_WARN([Continuing with limited quota support.])
++ AC_MSG_WARN([quotacheck is needed for filesystems with recent quota versions.])
++ AC_MSG_RESULT([no])
++])
++])
++
++# LC_SECURITY_PLUG # for SLES10 SP2
++# check for security plug support in the SLES10 SP2 kernel
++AC_DEFUN([LC_SECURITY_PLUG],
++[AC_MSG_CHECKING([if kernel has security plug support])
++LB_LINUX_TRY_COMPILE([
++ #include <linux/fs.h>
++],[
++ struct dentry *dentry;
++ struct vfsmount *mnt;
++ struct iattr *iattr;
++
++ notify_change(dentry, mnt, iattr);
++],[
++ AC_MSG_RESULT(yes)
++ AC_DEFINE(HAVE_SECURITY_PLUG, 1,
++ [SLES10 SP2 uses an extra parameter in vfs calls])
++],[
++ AC_MSG_RESULT(no)
++])
++])
++
++#
++# LC_CONFIGURE
++#
++# other configure checks
++#
++AC_DEFUN([LC_CONFIGURE],
++[LC_CONFIG_OBD_BUFFER_SIZE
++
++# include/liblustre.h
++AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h])
++
++# liblustre/llite_lib.h
++AC_CHECK_HEADERS([xtio.h file.h])
++
++# liblustre/dir.c
++AC_CHECK_HEADERS([linux/types.h sys/types.h linux/unistd.h unistd.h])
++
++# liblustre/lutil.c
++AC_CHECK_HEADERS([netinet/in.h arpa/inet.h catamount/data.h])
++AC_CHECK_FUNCS([inet_ntoa])
++
++# libsysio/src/readlink.c
++LC_READLINK_SSIZE_T
++
++# lvfs/prng.c - depends on linux/types.h from liblustre/dir.c
++AC_CHECK_HEADERS([linux/random.h], [], [],
++ [#ifdef HAVE_LINUX_TYPES_H
++ # include <linux/types.h>
++ #endif
++ ])
++
++# utils/llverfs.c
++AC_CHECK_HEADERS([ext2fs/ext2fs.h])
++
++# check for -lz support
++ZLIB=""
++AC_CHECK_LIB([z],
++ [adler32],
++ [AC_CHECK_HEADERS([zlib.h],
++ [ZLIB="-lz"
++ AC_DEFINE([HAVE_ADLER], 1,
++ [support adler32 checksum type])],
++ [AC_MSG_WARN([No zlib-devel package found,
++ unable to use adler32 checksum])])],
++ [AC_MSG_WARN([No zlib package found, unable to use adler32 checksum])]
++)
++AC_SUBST(ZLIB)
++
++# Super safe df
++AC_ARG_ENABLE([mindf],
++ AC_HELP_STRING([--enable-mindf],
++ [Make statfs report the minimum available space on any single OST instead of the sum of free space on all OSTs]),
++ [],[])
++if test "$enable_mindf" = "yes" ; then
++ AC_DEFINE([MIN_DF], 1, [Report minimum OST free space])
++fi
++
++AC_ARG_ENABLE([fail_alloc],
++ AC_HELP_STRING([--disable-fail-alloc],
++ [disable random allocation failure]),
++ [],[enable_fail_alloc=yes])
++AC_MSG_CHECKING([whether to enable random memory allocation failure])
++AC_MSG_RESULT([$enable_fail_alloc])
++if test x$enable_fail_alloc != xno ; then
++ AC_DEFINE([RANDOM_FAIL_ALLOC], 1, [enable random allocation failure])
++fi
++
++])
++
++#
++# LC_CONDITIONALS
++#
++# AM_CONDITIONALS for lustre
++#
++AC_DEFUN([LC_CONDITIONALS],
++[AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes)
++AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
++AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes)
++AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
++AM_CONDITIONAL(CLIENT, test x$enable_client = xyes)
++AM_CONDITIONAL(SERVER, test x$enable_server = xyes)
++AM_CONDITIONAL(QUOTA, test x$enable_quota_module = xyes)
++AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes)
++AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes)
++AM_CONDITIONAL(LIBPTHREAD, test x$enable_libpthread = xyes)
++])
++
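These conditionals then gate whole subtrees in the automake input; an
illustrative Makefile.am fragment (not from the tree):

    if LIBLUSTRE
    SUBDIRS += liblustre
    endif
    if MPITESTS
    SUBDIRS += tests/mpi
    endif
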
++#
++# LC_CONFIG_FILES
++#
++# files that should be generated with AC_OUTPUT
++#
++AC_DEFUN([LC_CONFIG_FILES],
++[AC_CONFIG_FILES([
++lustre/Makefile
++lustre/autoMakefile
++lustre/autoconf/Makefile
++lustre/contrib/Makefile
++lustre/doc/Makefile
++lustre/include/Makefile
++lustre/include/lustre_ver.h
++lustre/include/linux/Makefile
++lustre/include/lustre/Makefile
++lustre/kernel_patches/targets/2.6-suse.target
++lustre/kernel_patches/targets/2.6-vanilla.target
++lustre/kernel_patches/targets/2.6-rhel4.target
++lustre/kernel_patches/targets/2.6-rhel5.target
++lustre/kernel_patches/targets/2.6-fc5.target
++lustre/kernel_patches/targets/2.6-patchless.target
++lustre/kernel_patches/targets/2.6-sles10.target
++lustre/kernel_patches/targets/hp_pnnl-2.4.target
++lustre/kernel_patches/targets/rh-2.4.target
++lustre/kernel_patches/targets/rhel-2.4.target
++lustre/kernel_patches/targets/suse-2.4.21-2.target
++lustre/kernel_patches/targets/sles-2.4.target
++lustre/ldlm/Makefile
++lustre/liblustre/Makefile
++lustre/liblustre/tests/Makefile
++lustre/llite/Makefile
++lustre/llite/autoMakefile
++lustre/lov/Makefile
++lustre/lov/autoMakefile
++lustre/lvfs/Makefile
++lustre/lvfs/autoMakefile
++lustre/mdc/Makefile
++lustre/mdc/autoMakefile
++lustre/mds/Makefile
++lustre/mds/autoMakefile
++lustre/obdclass/Makefile
++lustre/obdclass/autoMakefile
++lustre/obdclass/linux/Makefile
++lustre/obdecho/Makefile
++lustre/obdecho/autoMakefile
++lustre/obdfilter/Makefile
++lustre/obdfilter/autoMakefile
++lustre/osc/Makefile
++lustre/osc/autoMakefile
++lustre/ost/Makefile
++lustre/ost/autoMakefile
++lustre/mgc/Makefile
++lustre/mgc/autoMakefile
++lustre/mgs/Makefile
++lustre/mgs/autoMakefile
++lustre/ptlrpc/Makefile
++lustre/ptlrpc/autoMakefile
++lustre/quota/Makefile
++lustre/quota/autoMakefile
++lustre/scripts/Makefile
++lustre/scripts/version_tag.pl
++lustre/tests/Makefile
++lustre/utils/Makefile
++])
++case $lb_target_os in
++ darwin)
++ AC_CONFIG_FILES([ lustre/obdclass/darwin/Makefile ])
++ ;;
++esac
++
++])
+diff -urNad lustre~/lustre/include/linux/lustre_compat25.h lustre/lustre/include/linux/lustre_compat25.h
+--- lustre~/lustre/include/linux/lustre_compat25.h 2009-03-12 10:33:45.000000000 +0100
++++ lustre/lustre/include/linux/lustre_compat25.h 2009-03-12 11:02:51.000000000 +0100
+@@ -57,6 +57,28 @@
+ #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) */
+
+ #ifndef HAVE_SET_FS_PWD
++
++#ifdef HAVE_FS_STRUCT_USE_PATH
++static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
++ struct dentry *dentry)
++{
++ struct path path;
++ struct path old_pwd;
++
++ path.mnt = mnt;
++ path.dentry = dentry;
++ write_lock(&fs->lock);
++ old_pwd = fs->pwd;
++ path_get(&path);
++ fs->pwd = path;
++ write_unlock(&fs->lock);
++
++ if (old_pwd.dentry)
++ path_put(&old_pwd);
++}
++
++#else
++
+ static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+ struct dentry *dentry)
+ {
+@@ -75,6 +97,7 @@
+ mntput(old_pwdmnt);
+ }
+ }
++#endif
+ #else
+ #define ll_set_fs_pwd set_fs_pwd
+ #endif /* HAVE_SET_FS_PWD */
+@@ -151,7 +174,12 @@
+ #endif
+
+ /* XXX our code should be using the 2.6 calls, not the other way around */
++#ifndef HAVE_TRYLOCK_PAGE
+ #define TryLockPage(page) TestSetPageLocked(page)
++#else
++#define TryLockPage(page) (!trylock_page(page))
++#endif
++
+ #define Page_Uptodate(page) PageUptodate(page)
+ #define ll_redirty_page(page) set_page_dirty(page)
+
+@@ -364,8 +392,17 @@
+ #define LL_RENAME_DOES_D_MOVE FS_ODD_RENAME
+ #endif
+
++#ifdef HAVE_FILE_REMOVE_SUID
++#define ll_remove_suid(file, mnt) file_remove_suid(file)
++#else
++ #ifdef HAVE_SECURITY_PLUG
++ #define ll_remove_suid(file,mnt) remove_suid(file->f_dentry,mnt)
++ #else
++ #define ll_remove_suid(file,mnt) remove_suid(file->f_dentry)
++ #endif
++#endif
++
+ #ifdef HAVE_SECURITY_PLUG
+-#define ll_remove_suid(inode,mnt) remove_suid(inode,mnt)
+ #define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry,mnt)
+ #define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mnt,mode)
+ #define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,mnt,dir,new,mnt1)
+@@ -377,7 +414,6 @@
+ #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+ vfs_rename(old,old_dir,mnt,new,new_dir,mnt1)
+ #else
+-#define ll_remove_suid(inode,mnt) remove_suid(inode)
+ #define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry)
+ #define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode)
+ #define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new)
+@@ -388,6 +424,57 @@
+ vfs_rename(old,old_dir,new,new_dir)
+ #endif
+
++#ifdef HAVE_REGISTER_SHRINKER
++typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask);
++
++static inline
++struct shrinker *set_shrinker(int seek, shrinker_t func)
++{
++ struct shrinker *s;
++
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return (NULL);
++
++ s->shrink = func;
++ s->seeks = seek;
++
++ register_shrinker(s);
++
++ return s;
++}
++
++static inline
++void remove_shrinker(struct shrinker *shrinker)
++{
++ if (shrinker == NULL)
++ return;
++
++ unregister_shrinker(shrinker);
++ kfree(shrinker);
++}
++#endif
++
++#ifdef HAVE_BIO_ENDIO_2ARG
++#define cfs_bio_io_error(a,b) bio_io_error((a))
++#define cfs_bio_endio(a,b,c) bio_endio((a),(c))
++#else
++#define cfs_bio_io_error(a,b) bio_io_error((a),(b))
++#define cfs_bio_endio(a,b,c) bio_endio((a),(b),(c))
++#endif
++
++#ifdef HAVE_FS_STRUCT_USE_PATH
++#define cfs_fs_pwd(fs) ((fs)->pwd.dentry)
++#define cfs_fs_mnt(fs) ((fs)->pwd.mnt)
++#else
++#define cfs_fs_pwd(fs) ((fs)->pwd)
++#define cfs_fs_mnt(fs) ((fs)->pwdmnt)
++#endif
++
++#ifndef list_for_each_safe_rcu
++#define list_for_each_safe_rcu(a,b,c) list_for_each_rcu(a, c)
++#endif
++
+ #ifndef abs
+ static inline int abs(int x)
+ {
+diff -urNad lustre~/lustre/include/linux/lustre_compat25.h.orig lustre/lustre/include/linux/lustre_compat25.h.orig
+--- lustre~/lustre/include/linux/lustre_compat25.h.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/include/linux/lustre_compat25.h.orig 2009-03-12 10:33:45.000000000 +0100
+@@ -0,0 +1,411 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ */
++
++#ifndef _LINUX_COMPAT25_H
++#define _LINUX_COMPAT25_H
++
++#ifdef __KERNEL__
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5)
++#error sorry, lustre requires at least 2.6.5
++#endif
++
++#include <libcfs/linux/portals_compat25.h>
++
++#include <linux/lustre_patchless_compat.h>
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
++struct ll_iattr_struct {
++ struct iattr iattr;
++ unsigned int ia_attr_flags;
++};
++#else
++#define ll_iattr_struct iattr
++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14) */
++
++#ifndef HAVE_SET_FS_PWD
++static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
++ struct dentry *dentry)
++{
++ struct dentry *old_pwd;
++ struct vfsmount *old_pwdmnt;
++
++ write_lock(&fs->lock);
++ old_pwd = fs->pwd;
++ old_pwdmnt = fs->pwdmnt;
++ fs->pwdmnt = mntget(mnt);
++ fs->pwd = dget(dentry);
++ write_unlock(&fs->lock);
++
++ if (old_pwd) {
++ dput(old_pwd);
++ mntput(old_pwdmnt);
++ }
++}
++#else
++#define ll_set_fs_pwd set_fs_pwd
++#endif /* HAVE_SET_FS_PWD */
++
++#ifdef HAVE_INODE_I_MUTEX
++#define UNLOCK_INODE_MUTEX(inode) do {mutex_unlock(&(inode)->i_mutex); } while(0)
++#define LOCK_INODE_MUTEX(inode) do {mutex_lock(&(inode)->i_mutex); } while(0)
++#define TRYLOCK_INODE_MUTEX(inode) mutex_trylock(&(inode)->i_mutex)
++#else
++#define UNLOCK_INODE_MUTEX(inode) do {up(&(inode)->i_sem); } while(0)
++#define LOCK_INODE_MUTEX(inode) do {down(&(inode)->i_sem); } while(0)
++#define TRYLOCK_INODE_MUTEX(inode) (!down_trylock(&(inode)->i_sem))
++#endif /* HAVE_INODE_I_MUTEX */
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15)
++#define d_child d_u.d_child
++#define d_rcu d_u.d_rcu
++#endif
++
++#ifdef HAVE_DQUOTOFF_MUTEX
++#define UNLOCK_DQONOFF_MUTEX(dqopt) do {mutex_unlock(&(dqopt)->dqonoff_mutex); } while(0)
++#define LOCK_DQONOFF_MUTEX(dqopt) do {mutex_lock(&(dqopt)->dqonoff_mutex); } while(0)
++#else
++#define UNLOCK_DQONOFF_MUTEX(dqopt) do {up(&(dqopt)->dqonoff_sem); } while(0)
++#define LOCK_DQONOFF_MUTEX(dqopt) do {down(&(dqopt)->dqonoff_sem); } while(0)
++#endif /* HAVE_DQUOTOFF_MUTEX */
++
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
++#define NGROUPS_SMALL NGROUPS
++#define NGROUPS_PER_BLOCK ((int)(EXEC_PAGESIZE / sizeof(gid_t)))
++
++struct group_info {
++ int ngroups;
++ atomic_t usage;
++ gid_t small_block[NGROUPS_SMALL];
++ int nblocks;
++ gid_t *blocks[0];
++};
++#define current_ngroups current->ngroups
++#define current_groups current->groups
++
++struct group_info *groups_alloc(int gidsetsize);
++void groups_free(struct group_info *ginfo);
++#else /* >= 2.6.4 */
++
++#define current_ngroups current->group_info->ngroups
++#define current_groups current->group_info->small_block
++
++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) */
++
++#ifndef page_private
++#define page_private(page) ((page)->private)
++#define set_page_private(page, v) ((page)->private = (v))
++#endif
++
++#ifndef HAVE_GFP_T
++#define gfp_t int
++#endif
++
++#define lock_dentry(___dentry) spin_lock(&(___dentry)->d_lock)
++#define unlock_dentry(___dentry) spin_unlock(&(___dentry)->d_lock)
++
++#define ll_kernel_locked() kernel_locked()
++
++/*
++ * OBD need working random driver, thus all our
++ * initialization routines must be called after device
++ * driver initialization
++ */
++#ifndef MODULE
++#undef module_init
++#define module_init(a) late_initcall(a)
++#endif
++
++/* XXX our code should be using the 2.6 calls, not the other way around */
++#define TryLockPage(page) TestSetPageLocked(page)
++#define Page_Uptodate(page) PageUptodate(page)
++#define ll_redirty_page(page) set_page_dirty(page)
++
++#define KDEVT_INIT(val) (val)
++
++#define LTIME_S(time) (time.tv_sec)
++#define ll_path_lookup path_lookup
++#define ll_permission(inode,mask,nd) permission(inode,mask,nd)
++
++#define ll_pgcache_lock(mapping) spin_lock(&mapping->page_lock)
++#define ll_pgcache_unlock(mapping) spin_unlock(&mapping->page_lock)
++#define ll_call_writepage(inode, page) \
++ (inode)->i_mapping->a_ops->writepage(page, NULL)
++#define ll_invalidate_inode_pages(inode) \
++ invalidate_inode_pages((inode)->i_mapping)
++#define ll_truncate_complete_page(page) \
++ truncate_complete_page(page->mapping, page)
++
++#define ll_vfs_create(a,b,c,d) vfs_create(a,b,c,d)
++#define ll_dev_t dev_t
++#define kdev_t dev_t
++#define to_kdev_t(dev) (dev)
++#define kdev_t_to_nr(dev) (dev)
++#define val_to_kdev(dev) (dev)
++#define ILOOKUP(sb, ino, test, data) ilookup5(sb, ino, test, data);
++
++#include <linux/writeback.h>
++
++static inline int cleanup_group_info(void)
++{
++ struct group_info *ginfo;
++
++ ginfo = groups_alloc(0);
++ if (!ginfo)
++ return -ENOMEM;
++
++ set_current_groups(ginfo);
++ put_group_info(ginfo);
++
++ return 0;
++}
++
++#define __set_page_ll_data(page, llap) \
++ do { \
++ page_cache_get(page); \
++ SetPagePrivate(page); \
++ set_page_private(page, (unsigned long)llap); \
++ } while (0)
++#define __clear_page_ll_data(page) \
++ do { \
++ ClearPagePrivate(page); \
++ set_page_private(page, 0); \
++ page_cache_release(page); \
++ } while(0)
++
++#define kiobuf bio
++
++#include <linux/proc_fs.h>
++
++#if !defined(HAVE_D_REHASH_COND) && defined(HAVE___D_REHASH)
++#define d_rehash_cond(dentry, lock) __d_rehash(dentry, lock)
++extern void __d_rehash(struct dentry *dentry, int lock);
++#endif
++
++#if !defined(HAVE_D_MOVE_LOCKED) && defined(HAVE___D_MOVE)
++#define d_move_locked(dentry, target) __d_move(dentry, target)
++extern void __d_move(struct dentry *dentry, struct dentry *target);
++#endif
++
++#ifdef HAVE_CAN_SLEEP_ARG
++#define ll_flock_lock_file_wait(file, lock, can_sleep) \
++ flock_lock_file_wait(file, lock, can_sleep)
++#else
++#define ll_flock_lock_file_wait(file, lock, can_sleep) \
++ flock_lock_file_wait(file, lock)
++#endif
++
++#define CheckWriteback(page, cmd) \
++ ((!PageWriteback(page) && (cmd & OBD_BRW_READ)) || \
++ (PageWriteback(page) && (cmd & OBD_BRW_WRITE)))
++
++
++#ifdef HAVE_PAGE_LIST
++static inline int mapping_has_pages(struct address_space *mapping)
++{
++ int rc = 1;
++
++ ll_pgcache_lock(mapping);
++ if (list_empty(&mapping->dirty_pages) &&
++ list_empty(&mapping->clean_pages) &&
++ list_empty(&mapping->locked_pages)) {
++ rc = 0;
++ }
++ ll_pgcache_unlock(mapping);
++
++ return rc;
++}
++#else
++static inline int mapping_has_pages(struct address_space *mapping)
++{
++ return mapping->nrpages > 0;
++}
++#endif
++
++#ifdef HAVE_KIOBUF_KIO_BLOCKS
++#define KIOBUF_GET_BLOCKS(k) ((k)->kio_blocks)
++#else
++#define KIOBUF_GET_BLOCKS(k) ((k)->blocks)
++#endif
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7))
++#define ll_set_dflags(dentry, flags) do { dentry->d_vfs_flags |= flags; } while(0)
++#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
++ vfs_symlink(dir, dentry, path)
++#else
++#define ll_set_dflags(dentry, flags) do { \
++ spin_lock(&dentry->d_lock); \
++ dentry->d_flags |= flags; \
++ spin_unlock(&dentry->d_lock); \
++ } while(0)
++#ifdef HAVE_SECURITY_PLUG
++#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
++ vfs_symlink(dir, dentry, mnt, path, mode)
++#else
++#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
++ vfs_symlink(dir, dentry, path, mode)
++#endif
++#endif
++
++#ifndef container_of
++#define container_of(ptr, type, member) ({ \
++ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
++ (type *)( (char *)__mptr - offsetof(type,member) );})
++#endif
++
++#ifdef HAVE_I_ALLOC_SEM
++#define UP_WRITE_I_ALLOC_SEM(i) do { up_write(&(i)->i_alloc_sem); } while (0)
++#define DOWN_WRITE_I_ALLOC_SEM(i) do { down_write(&(i)->i_alloc_sem); } while(0)
++#define LASSERT_I_ALLOC_SEM_WRITE_LOCKED(i) LASSERT(down_read_trylock(&(i)->i_alloc_sem) == 0)
++
++#define UP_READ_I_ALLOC_SEM(i) do { up_read(&(i)->i_alloc_sem); } while (0)
++#define DOWN_READ_I_ALLOC_SEM(i) do { down_read(&(i)->i_alloc_sem); } while (0)
++#define LASSERT_I_ALLOC_SEM_READ_LOCKED(i) LASSERT(down_write_trylock(&(i)->i_alloc_sem) == 0)
++#else
++#define UP_READ_I_ALLOC_SEM(i) do { } while (0)
++#define DOWN_READ_I_ALLOC_SEM(i) do { } while (0)
++#define LASSERT_I_ALLOC_SEM_READ_LOCKED(i) do { } while (0)
++
++#define UP_WRITE_I_ALLOC_SEM(i) do { } while (0)
++#define DOWN_WRITE_I_ALLOC_SEM(i) do { } while (0)
++#define LASSERT_I_ALLOC_SEM_WRITE_LOCKED(i) do { } while (0)
++#endif
++
++#ifndef HAVE_GRAB_CACHE_PAGE_NOWAIT_GFP
++#define grab_cache_page_nowait_gfp(x, y, z) grab_cache_page_nowait((x), (y))
++#endif
++
++#ifndef HAVE_FILEMAP_FDATAWRITE
++#define filemap_fdatawrite(mapping) filemap_fdatasync(mapping)
++#endif
++
++#ifdef HAVE_VFS_KERN_MOUNT
++static inline
++struct vfsmount *
++ll_kern_mount(const char *fstype, int flags, const char *name, void *data)
++{
++ struct file_system_type *type = get_fs_type(fstype);
++ struct vfsmount *mnt;
++ if (!type)
++ return ERR_PTR(-ENODEV);
++ mnt = vfs_kern_mount(type, flags, name, data);
++ module_put(type->owner);
++ return mnt;
++}
++#else
++#define ll_kern_mount(fstype, flags, name, data) do_kern_mount((fstype), (flags), (name), (data))
++#endif
++
++#ifdef HAVE_STATFS_DENTRY_PARAM
++#define ll_do_statfs(sb, sfs) (sb)->s_op->statfs((sb)->s_root, (sfs))
++#else
++#define ll_do_statfs(sb, sfs) (sb)->s_op->statfs((sb), (sfs))
++#endif
++
++/* task_struct */
++#ifndef HAVE_TASK_PPTR
++#define p_pptr parent
++#endif
++
++#ifdef HAVE_UNREGISTER_BLKDEV_RETURN_INT
++#define ll_unregister_blkdev(a,b) unregister_blkdev((a),(b))
++#else
++static inline
++int ll_unregister_blkdev(unsigned int dev, const char *name)
++{
++ unregister_blkdev(dev, name);
++ return 0;
++}
++#endif
++
++#ifdef HAVE_INVALIDATE_BDEV_2ARG
++#define ll_invalidate_bdev(a,b) invalidate_bdev((a),(b))
++#else
++#define ll_invalidate_bdev(a,b) invalidate_bdev((a))
++#endif
++
++#ifdef HAVE_FS_RENAME_DOES_D_MOVE
++#define LL_RENAME_DOES_D_MOVE FS_RENAME_DOES_D_MOVE
++#else
++#define LL_RENAME_DOES_D_MOVE FS_ODD_RENAME
++#endif
++
++#ifdef HAVE_SECURITY_PLUG
++#define ll_remove_suid(inode,mnt) remove_suid(inode,mnt)
++#define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry,mnt)
++#define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mnt,mode)
++#define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,mnt,dir,new,mnt1)
++#define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry,mnt)
++#define ll_vfs_mknod(dir,entry,mnt,mode,dev) \
++ vfs_mknod(dir,entry,mnt,mode,dev)
++#define ll_security_inode_unlink(dir,entry,mnt) \
++ security_inode_unlink(dir,entry,mnt)
++#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
++ vfs_rename(old,old_dir,mnt,new,new_dir,mnt1)
++#else
++#define ll_remove_suid(inode,mnt) remove_suid(inode)
++#define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry)
++#define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode)
++#define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new)
++#define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry)
++#define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev)
++#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
++#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
++ vfs_rename(old,old_dir,new,new_dir)
++#endif
++
++#ifndef abs
++static inline int abs(int x)
++{
++ return (x < 0) ? -x : x;
++}
++#endif
++
++#ifndef labs
++static inline long labs(long x)
++{
++ return (x < 0) ? -x : x;
++}
++#endif
++
++/* Using kernel fls(). Userspace will use one defined in user-bitops.h. */
++#ifndef __fls
++#define __fls fls
++#endif
++
++#endif /* __KERNEL__ */
++#endif /* _COMPAT25_H */
+diff -urNad lustre~/lustre/include/linux/lustre_lib.h lustre/lustre/include/linux/lustre_lib.h
+--- lustre~/lustre/include/linux/lustre_lib.h 2008-08-07 11:52:06.000000000 +0200
++++ lustre/lustre/include/linux/lustre_lib.h 2009-03-12 11:02:51.000000000 +0100
+@@ -49,7 +49,6 @@
+ # include <string.h>
+ # include <sys/types.h>
+ #else
+-# include <asm/semaphore.h>
+ # include <linux/rwsem.h>
+ # include <linux/sched.h>
+ # include <linux/signal.h>
+diff -urNad lustre~/lustre/include/linux/lustre_patchless_compat.h lustre/lustre/include/linux/lustre_patchless_compat.h
+--- lustre~/lustre/include/linux/lustre_patchless_compat.h 2008-08-07 11:52:10.000000000 +0200
++++ lustre/lustre/include/linux/lustre_patchless_compat.h 2009-03-12 11:02:51.000000000 +0100
+@@ -52,7 +52,7 @@
+
+ BUG_ON(!PageLocked(page));
+
+-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15))
++#ifdef HAVE_RW_TREE_LOCK
+ write_lock_irq(&mapping->tree_lock);
+ #else
+ spin_lock_irq(&mapping->tree_lock);
+@@ -65,7 +65,7 @@
+ #else
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ #endif
+-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15))
++#ifdef HAVE_RW_TREE_LOCK
+ write_unlock_irq(&mapping->tree_lock);
+ #else
+ spin_unlock_irq(&mapping->tree_lock);
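The two hunks above replace LINUX_VERSION_CODE comparisons with a configure-detected HAVE_RW_TREE_LOCK, which also copes with vendor kernels that backport the rwlock_t tree_lock. A hedged sketch of how a call site reads once the symbol is in place; the helper name is invented for illustration:

/* Sketch, assuming HAVE_RW_TREE_LOCK is set by a configure test: the
 * same call site compiles against either tree_lock flavour without
 * consulting the kernel version number. */
static inline void example_lock_tree(struct address_space *mapping)
{
#ifdef HAVE_RW_TREE_LOCK
        write_lock_irq(&mapping->tree_lock);  /* tree_lock is rwlock_t */
#else
        spin_lock_irq(&mapping->tree_lock);   /* tree_lock is spinlock_t */
#endif
}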
+diff -urNad lustre~/lustre/include/lprocfs_status.h lustre/lustre/include/lprocfs_status.h
+--- lustre~/lustre/include/lprocfs_status.h 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/include/lprocfs_status.h 2009-03-12 11:02:51.000000000 +0100
+@@ -521,6 +521,8 @@
+ #define LPROCFS_EXIT() do { \
+ up_read(&_lprocfs_lock); \
+ } while(0)
++
++#ifdef HAVE_PROCFS_DELETED
+ #define LPROCFS_ENTRY_AND_CHECK(dp) do { \
+ typecheck(struct proc_dir_entry *, dp); \
+ LPROCFS_ENTRY(); \
+@@ -529,6 +531,14 @@
+ return -ENODEV; \
+ } \
+ } while(0)
++#define LPROCFS_CHECK_DELETED(dp) ((dp)->deleted)
++#else
++
++#define LPROCFS_ENTRY_AND_CHECK(dp) \
++ LPROCFS_ENTRY();
++#define LPROCFS_CHECK_DELETED(dp) (0)
++#endif
++
+ #define LPROCFS_WRITE_ENTRY() do { \
+ down_write(&_lprocfs_lock); \
+ } while(0)
+@@ -536,6 +546,7 @@
+ up_write(&_lprocfs_lock); \
+ } while(0)
+
++
+ /* You must use these macros when you want to refer to
+ * the import in a client obd_device for a lprocfs entry */
+ #define LPROCFS_CLIMP_CHECK(obd) do { \
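The LPROCFS_ENTRY_AND_CHECK() change above matters to every seq_file open method: on kernels whose proc_dir_entry still carries a ->deleted flag the macro returns -ENODEV for a removed entry, while on newer kernels it degrades to a bare LPROCFS_ENTRY(). A minimal open method using it, mirroring the __LPROC_SEQ_FOPS pattern found later in this header; the names are invented and example_seq_show is assumed to exist:

/* Sketch only, under the assumptions stated above. */
static int example_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        int rc;

        LPROCFS_ENTRY_AND_CHECK(dp);  /* may return -ENODEV from here */
        rc = single_open(file, example_seq_show, dp->data);
        if (rc)
                LPROCFS_EXIT();       /* on success, held until release */
        return rc;
}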
+diff -urNad lustre~/lustre/include/lprocfs_status.h.orig lustre/lustre/include/lprocfs_status.h.orig
+--- lustre~/lustre/include/lprocfs_status.h.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/include/lprocfs_status.h.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,817 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ *
++ * lustre/include/lprocfs_status.h
++ *
++ * Top level header file for LProc SNMP
++ *
++ * Author: Hariharan Thantry thantry at users.sourceforge.net
++ */
++#ifndef _LPROCFS_SNMP_H
++#define _LPROCFS_SNMP_H
++
++#include <lustre/lustre_idl.h>
++#if defined(__linux__)
++#include <linux/lprocfs_status.h>
++#elif defined(__APPLE__)
++#include <darwin/lprocfs_status.h>
++#elif defined(__WINNT__)
++#include <winnt/lprocfs_status.h>
++#else
++#error Unsupported operating system.
++#endif
++
++#undef LPROCFS
++#if (defined(__KERNEL__) && defined(CONFIG_PROC_FS))
++# define LPROCFS
++#endif
++
++struct lprocfs_vars {
++ const char *name;
++ cfs_read_proc_t *read_fptr;
++ cfs_write_proc_t *write_fptr;
++ void *data;
++ struct file_operations *fops;
++ /**
++ * /proc file mode.
++ */
++ mode_t proc_mode;
++};
++
++struct lprocfs_static_vars {
++ struct lprocfs_vars *module_vars;
++ struct lprocfs_vars *obd_vars;
++};
++
++/* if we find more consumers this could be generalized */
++#define OBD_HIST_MAX 32
++struct obd_histogram {
++ spinlock_t oh_lock;
++ unsigned long oh_buckets[OBD_HIST_MAX];
++};
++
++enum {
++ BRW_R_PAGES = 0,
++ BRW_W_PAGES,
++ BRW_R_RPC_HIST,
++ BRW_W_RPC_HIST,
++ BRW_R_IO_TIME,
++ BRW_W_IO_TIME,
++ BRW_R_DISCONT_PAGES,
++ BRW_W_DISCONT_PAGES,
++ BRW_R_DISCONT_BLOCKS,
++ BRW_W_DISCONT_BLOCKS,
++ BRW_R_DISK_IOSIZE,
++ BRW_W_DISK_IOSIZE,
++ BRW_R_DIO_FRAGS,
++ BRW_W_DIO_FRAGS,
++ BRW_LAST,
++};
++
++struct brw_stats {
++ struct obd_histogram hist[BRW_LAST];
++};
++
++
++/* An lprocfs counter can be configured using the enum bit masks below.
++ *
++ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
++ * protects this counter from concurrent updates. If not specified,
++ * lprocfs uses an internal per-counter lock variable. External locks are
++ * not used to protect counter increments, but are used to protect
++ * counter readout and resets.
++ *
++ * LPROCFS_CNTR_AVGMINMAX indicates that the counter takes multi-valued
++ * samples (i.e. the counter can be incremented by more than "1"). When
++ * specified, the counter maintains min, max and sum in addition to a
++ * simple invocation count, which allows averages to be computed.
++ * If not specified, the counter is an increment-by-1 counter;
++ * min, max, sum, etc. are not maintained.
++ *
++ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
++ * squares (for multi-valued counter samples only). This allows
++ * external computation of standard deviation, but involves a 64-bit
++ * multiply per counter increment.
++ */
++
++enum {
++ LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
++ LPROCFS_CNTR_AVGMINMAX = 0x0002,
++ LPROCFS_CNTR_STDDEV = 0x0004,
++
++ /* counter data type */
++ LPROCFS_TYPE_REGS = 0x0100,
++ LPROCFS_TYPE_BYTES = 0x0200,
++ LPROCFS_TYPE_PAGES = 0x0400,
++ LPROCFS_TYPE_CYCLE = 0x0800,
++};
++
++struct lprocfs_atomic {
++ atomic_t la_entry;
++ atomic_t la_exit;
++};
++
++#define LC_MIN_INIT ((~(__u64)0) >> 1)
++
++struct lprocfs_counter {
++ struct lprocfs_atomic lc_cntl; /* may need to move to per set */
++ unsigned int lc_config;
++ __s64 lc_count;
++ __s64 lc_sum;
++ __s64 lc_min;
++ __s64 lc_max;
++ __s64 lc_sumsquare;
++ const char *lc_name; /* must be static */
++ const char *lc_units; /* must be static */
++};
++
++struct lprocfs_percpu {
++ struct lprocfs_counter lp_cntr[0];
++};
++
++#define LPROCFS_GET_NUM_CPU 0x0001
++#define LPROCFS_GET_SMP_ID 0x0002
++
++enum lprocfs_stats_flags {
++ LPROCFS_STATS_FLAG_PERCPU = 0x0000, /* per cpu counter */
++ LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
++ * area and need locking */
++};
++
++enum lprocfs_fields_flags {
++ LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001,
++ LPROCFS_FIELDS_FLAGS_SUM = 0x0002,
++ LPROCFS_FIELDS_FLAGS_MIN = 0x0003,
++ LPROCFS_FIELDS_FLAGS_MAX = 0x0004,
++ LPROCFS_FIELDS_FLAGS_AVG = 0x0005,
++ LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006,
++ LPROCFS_FIELDS_FLAGS_COUNT = 0x0007,
++};
++
++struct lprocfs_stats {
++ unsigned int ls_num; /* # of counters */
++ int ls_flags; /* See LPROCFS_STATS_FLAG_* */
++ spinlock_t ls_lock; /* Lock used only when there are
++ * no percpu stats areas */
++ struct lprocfs_percpu *ls_percpu[0];
++};
++
++static inline int opcode_offset(__u32 opc) {
++ if (opc < OST_LAST_OPC) {
++ /* OST opcode */
++ return (opc - OST_FIRST_OPC);
++ } else if (opc < MDS_LAST_OPC) {
++ /* MDS opcode */
++ return (opc - MDS_FIRST_OPC +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else if (opc < LDLM_LAST_OPC) {
++ /* LDLM Opcode */
++ return (opc - LDLM_FIRST_OPC +
++ (MDS_LAST_OPC - MDS_FIRST_OPC) +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else if (opc < MGS_LAST_OPC) {
++ /* MGS Opcode */
++ return (opc - MGS_FIRST_OPC +
++ (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
++ (MDS_LAST_OPC - MDS_FIRST_OPC) +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else if (opc < OBD_LAST_OPC) {
++ /* OBD Ping */
++ return (opc - OBD_FIRST_OPC +
++ (MGS_LAST_OPC - MGS_FIRST_OPC) +
++ (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
++ (MDS_LAST_OPC - MDS_FIRST_OPC) +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else if (opc < LLOG_LAST_OPC) {
++ /* LLOG Opcode */
++ return (opc - LLOG_FIRST_OPC +
++ (OBD_LAST_OPC - OBD_FIRST_OPC) +
++ (MGS_LAST_OPC - MGS_FIRST_OPC) +
++ (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
++ (MDS_LAST_OPC - MDS_FIRST_OPC) +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else if (opc < QUOTA_LAST_OPC) {
++ /* LQUOTA Opcode */
++ return (opc - QUOTA_FIRST_OPC +
++ (LLOG_LAST_OPC - LLOG_FIRST_OPC) +
++ (OBD_LAST_OPC - OBD_FIRST_OPC) +
++ (MGS_LAST_OPC - MGS_FIRST_OPC) +
++ (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
++ (MDS_LAST_OPC - MDS_FIRST_OPC) +
++ (OST_LAST_OPC - OST_FIRST_OPC));
++ } else {
++ /* Unknown Opcode */
++ return -1;
++ }
++}
++
++#define LUSTRE_MAX_OPCODES ((OST_LAST_OPC - OST_FIRST_OPC) + \
++ (MDS_LAST_OPC - MDS_FIRST_OPC) + \
++ (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \
++ (MGS_LAST_OPC - MGS_FIRST_OPC) + \
++ (OBD_LAST_OPC - OBD_FIRST_OPC) + \
++ (LLOG_LAST_OPC - LLOG_FIRST_OPC) + \
++ (QUOTA_LAST_OPC - QUOTA_FIRST_OPC))
++
++#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \
++ (EXTRA_LAST_OPC - EXTRA_FIRST_OPC))
++
++enum {
++ PTLRPC_REQWAIT_CNTR = 0,
++ PTLRPC_REQQDEPTH_CNTR,
++ PTLRPC_REQACTIVE_CNTR,
++ PTLRPC_TIMEOUT,
++ PTLRPC_REQBUF_AVAIL_CNTR,
++ PTLRPC_LAST_CNTR
++};
++
++#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
++
++enum {
++ LDLM_GLIMPSE_ENQUEUE = 0,
++ LDLM_PLAIN_ENQUEUE,
++ LDLM_EXTENT_ENQUEUE,
++ LDLM_FLOCK_ENQUEUE,
++ LDLM_IBITS_ENQUEUE,
++ MDS_REINT_SETATTR,
++ MDS_REINT_CREATE,
++ MDS_REINT_LINK,
++ MDS_REINT_UNLINK,
++ MDS_REINT_RENAME,
++ MDS_REINT_OPEN,
++ BRW_READ_BYTES,
++ BRW_WRITE_BYTES,
++ EXTRA_LAST_OPC
++};
++
++#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
++/* class_obd.c */
++extern cfs_proc_dir_entry_t *proc_lustre_root;
++
++struct obd_device;
++struct file;
++struct obd_histogram;
++
++/* Days / hours / mins / seconds format */
++struct dhms {
++ int d,h,m,s;
++};
++static inline void s2dhms(struct dhms *ts, time_t secs)
++{
++ ts->d = secs / 86400;
++ secs = secs % 86400;
++ ts->h = secs / 3600;
++ secs = secs % 3600;
++ ts->m = secs / 60;
++ ts->s = secs % 60;
++}
++#define DHMS_FMT "%dd%dh%02dm%02ds"
++#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
++
++
++#ifdef LPROCFS
++
++static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int type)
++{
++ int rc = 0;
++
++ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
++ if (type & LPROCFS_GET_NUM_CPU)
++ rc = 1;
++ if (type & LPROCFS_GET_SMP_ID)
++ rc = 0;
++ spin_lock(&stats->ls_lock);
++ } else {
++ if (type & LPROCFS_GET_NUM_CPU)
++ rc = num_possible_cpus();
++ if (type & LPROCFS_GET_SMP_ID)
++ rc = smp_processor_id();
++ }
++ return rc;
++}
++
++static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats)
++{
++ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
++ spin_unlock(&stats->ls_lock);
++}
++
++/* Two optimized LPROCFS counter increment functions are provided:
++ * lprocfs_counter_incr(stats, idx) - optimized for by-one counters
++ * lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
++ * Counter data layout allows the config flag, counter lock and the
++ * count itself to reside within a single cache line.
++ */
++
++extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
++ long amount);
++extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
++ long amount);
++
++#define lprocfs_counter_incr(stats, idx) \
++ lprocfs_counter_add(stats, idx, 1)
++#define lprocfs_counter_decr(stats, idx) \
++ lprocfs_counter_sub(stats, idx, 1)
++
++extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
++ enum lprocfs_fields_flags field);
++
++static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
++ int idx,
++ enum lprocfs_fields_flags field)
++{
++ __u64 ret = 0;
++ int i;
++
++ LASSERT(stats != NULL);
++ for (i = 0; i < num_possible_cpus(); i++)
++ ret += lprocfs_read_helper(&(stats->ls_percpu[i]->lp_cntr[idx]),
++ field);
++ return ret;
++}
++
++extern struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
++ enum lprocfs_stats_flags flags);
++extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
++extern void lprocfs_free_stats(struct lprocfs_stats **stats);
++extern void lprocfs_init_ops_stats(int num_private_stats,
++ struct lprocfs_stats *stats);
++extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
++extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
++ unsigned int num_private_stats);
++extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
++ unsigned conf, const char *name,
++ const char *units);
++extern void lprocfs_free_obd_stats(struct obd_device *obddev);
++struct obd_export;
++extern int lprocfs_add_clear_entry(struct obd_device * obd,
++ cfs_proc_dir_entry_t *entry);
++extern int lprocfs_exp_setup(struct obd_export *exp,
++ lnet_nid_t *peer_nid, int *newnid);
++extern int lprocfs_exp_cleanup(struct obd_export *exp);
++extern int lprocfs_add_simple(struct proc_dir_entry *root,
++ char *name, read_proc_t *read_proc,
++ write_proc_t *write_proc, void *data);
++extern int lprocfs_register_stats(cfs_proc_dir_entry_t *root, const char *name,
++ struct lprocfs_stats *stats);
++
++/* lprocfs_status.c */
++extern int lprocfs_add_vars(cfs_proc_dir_entry_t *root,
++ struct lprocfs_vars *var,
++ void *data);
++
++extern cfs_proc_dir_entry_t *lprocfs_register(const char *name,
++ cfs_proc_dir_entry_t *parent,
++ struct lprocfs_vars *list,
++ void *data);
++
++extern void lprocfs_remove(cfs_proc_dir_entry_t **root);
++
++extern cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *root,
++ const char *name);
++
++extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
++extern int lprocfs_obd_cleanup(struct obd_device *obd);
++extern int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
++ read_proc_t *read_proc, write_proc_t *write_proc,
++ void *data);
++struct nid_stat;
++extern void lprocfs_free_per_client_stats(struct obd_device *obd);
++extern int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++
++
++extern struct file_operations lprocfs_evict_client_fops;
++
++extern int lprocfs_seq_create(cfs_proc_dir_entry_t *parent, char *name,
++ mode_t mode, struct file_operations *seq_fops,
++ void *data);
++extern int lprocfs_obd_seq_create(struct obd_device *dev, char *name,
++ mode_t mode, struct file_operations *seq_fops,
++ void *data);
++
++/* Generic callbacks */
++
++extern int lprocfs_rd_u64(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_atomic(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_rd_uint(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_wr_uint(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_rd_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_name(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_fstype(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_import(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++struct adaptive_timeout;
++extern int lprocfs_at_hist_helper(char *page, int count, int rc,
++ struct adaptive_timeout *at);
++extern int lprocfs_rd_timeouts(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_wr_ping(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++
++/* Statfs helpers */
++extern int lprocfs_rd_blksize(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_filestotal(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_filesfree(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_rd_filegroups(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++
++extern int lprocfs_write_helper(const char *buffer, unsigned long count,
++ int *val);
++extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
++ int *val, int mult);
++extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
++ long val, int mult);
++extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
++ __u64 *val);
++extern int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
++ __u64 *val, int mult);
++void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
++void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
++void lprocfs_oh_clear(struct obd_histogram *oh);
++unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
++
++/* lprocfs_status.c: counter read/write functions */
++extern int lprocfs_counter_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_counter_write(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++
++/* lprocfs_status.c: recovery status */
++int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++
++/* lprocfs_status.c: hash statistics */
++int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++
++extern int lprocfs_seq_release(struct inode *, struct file *);
++
++/* in lprocfs_status.c, to protect the private data for proc entries */
++extern struct rw_semaphore _lprocfs_lock;
++#define LPROCFS_ENTRY() do { \
++ down_read(&_lprocfs_lock); \
++} while(0)
++#define LPROCFS_EXIT() do { \
++ up_read(&_lprocfs_lock); \
++} while(0)
++#define LPROCFS_ENTRY_AND_CHECK(dp) do { \
++ typecheck(struct proc_dir_entry *, dp); \
++ LPROCFS_ENTRY(); \
++ if ((dp)->deleted) { \
++ LPROCFS_EXIT(); \
++ return -ENODEV; \
++ } \
++} while(0)
++#define LPROCFS_WRITE_ENTRY() do { \
++ down_write(&_lprocfs_lock); \
++} while(0)
++#define LPROCFS_WRITE_EXIT() do { \
++ up_write(&_lprocfs_lock); \
++} while(0)
++
++/* You must use these macros when you want to refer to
++ * the import in a client obd_device for a lprocfs entry */
++#define LPROCFS_CLIMP_CHECK(obd) do { \
++ typecheck(struct obd_device *, obd); \
++ down_read(&(obd)->u.cli.cl_sem); \
++ if ((obd)->u.cli.cl_import == NULL) { \
++ up_read(&(obd)->u.cli.cl_sem); \
++ return -ENODEV; \
++ } \
++} while(0)
++#define LPROCFS_CLIMP_EXIT(obd) \
++ up_read(&(obd)->u.cli.cl_sem);
++
++
++/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for
++ read-only proc entries; otherwise, also define a name##_seq_write
++ function for a read-write proc entry and call LPROC_SEQ_FOPS instead.
++ Finally, call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data); */
++#define __LPROC_SEQ_FOPS(name, custom_seq_write) \
++static int name##_seq_open(struct inode *inode, struct file *file) { \
++ struct proc_dir_entry *dp = PDE(inode); \
++ int rc; \
++ LPROCFS_ENTRY_AND_CHECK(dp); \
++ rc = single_open(file, name##_seq_show, dp->data); \
++ if (rc) { \
++ LPROCFS_EXIT(); \
++ return rc; \
++ } \
++ return 0; \
++} \
++struct file_operations name##_fops = { \
++ .owner = THIS_MODULE, \
++ .open = name##_seq_open, \
++ .read = seq_read, \
++ .write = custom_seq_write, \
++ .llseek = seq_lseek, \
++ .release = lprocfs_seq_release, \
++}
++
++#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL)
++#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write)
++
++/* lproc_ptlrpc.c */
++struct ptlrpc_request;
++extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
++
++#ifdef CRAY_XT3
++/* lprocfs_status.c: read recovery max time bz13079 */
++int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++
++/* lprocfs_status.c: write recovery max time bz13079 */
++int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++#endif
++
++/* all quota proc functions */
++extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count,
++ int *eof, void *data);
++extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_switch_qs(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_boundary_factor(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_least_bunit(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_least_iunit(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off,
++ int count, int *eof, void *data);
++extern int lprocfs_quota_wr_qs_factor(struct file *file, const char *buffer,
++ unsigned long count, void *data);
++
++#else
++/* LPROCFS is not defined */
++static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
++ int index, long amount) { return; }
++static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
++ int index) { return; }
++static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
++ int index, long amount) { return; }
++static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
++ int index, unsigned conf,
++ const char *name, const char *units)
++{ return; }
++
++static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
++ enum lprocfs_fields_flags field)
++{ return 0; }
++
++static inline struct lprocfs_stats* lprocfs_alloc_stats(unsigned int num,
++ enum lprocfs_stats_flags flags)
++{ return NULL; }
++static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
++{ return; }
++static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
++{ return; }
++static inline int lprocfs_register_stats(cfs_proc_dir_entry_t *root,
++ const char *name,
++ struct lprocfs_stats *stats)
++{ return 0; }
++static inline void lprocfs_init_ops_stats(int num_private_stats,
++ struct lprocfs_stats *stats)
++{ return; }
++static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
++{ return; }
++static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
++ unsigned int num_private_stats)
++{ return 0; }
++static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
++{ return; }
++
++struct obd_export;
++static inline int lprocfs_add_clear_entry(struct obd_export *exp)
++{ return 0; }
++static inline int lprocfs_exp_setup(struct obd_export *exp,
++ lnet_nid_t *peer_nid, int *newnid)
++{ return 0; }
++static inline int lprocfs_exp_cleanup(struct obd_export *exp)
++{ return 0; }
++static inline int lprocfs_add_simple(struct proc_dir_entry *root,
++ char *name,
++ read_proc_t *read_proc,
++ write_proc_t *write_proc,
++ void *data)
++{return 0; }
++struct nid_stat;
++static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
++{}
++static inline
++int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{return count;}
++static inline
++int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{return count;}
++
++
++static inline cfs_proc_dir_entry_t *
++lprocfs_register(const char *name, cfs_proc_dir_entry_t *parent,
++ struct lprocfs_vars *list, void *data) { return NULL; }
++static inline int lprocfs_add_vars(cfs_proc_dir_entry_t *root,
++ struct lprocfs_vars *var,
++ void *data) { return 0; }
++static inline void lprocfs_remove(cfs_proc_dir_entry_t **root) {};
++static inline cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *head,
++ const char *name) {return 0;}
++static inline int lprocfs_obd_setup(struct obd_device *dev,
++ struct lprocfs_vars *list) { return 0; }
++static inline int lprocfs_obd_cleanup(struct obd_device *dev) { return 0; }
++static inline int lprocfs_rd_u64(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline int lprocfs_rd_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline int lprocfs_rd_name(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++static inline int lprocfs_rd_import(char *page, char **start, off_t off, int count,
++ int *eof, void *data) { return 0; }
++static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++struct adaptive_timeout;
++static inline int lprocfs_at_hist_helper(char *page, int count, int rc,
++ struct adaptive_timeout *at)
++{ return 0; }
++static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{ return 0; }
++static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{ return 0; }
++static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{ return 0; }
++static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{ return 0; }
++
++
++/* Statfs helpers */
++static inline
++int lprocfs_rd_blksize(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_kbytestotal(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_kbytesfree(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_kbytesavail(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_filestotal(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_filesfree(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_rd_filegroups(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) {}
++static inline
++void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) {}
++static inline
++void lprocfs_oh_clear(struct obd_histogram *oh) {}
++static inline
++unsigned long lprocfs_oh_sum(struct obd_histogram *oh) { return 0; }
++static inline
++int lprocfs_counter_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data) { return 0; }
++static inline
++int lprocfs_counter_write(struct file *file, const char *buffer,
++ unsigned long count, void *data) { return 0; }
++
++static inline
++__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
++ enum lprocfs_fields_flags field)
++{ return (__u64)0; }
++
++#define LPROCFS_ENTRY()
++#define LPROCFS_EXIT()
++#define LPROCFS_ENTRY_AND_CHECK(dp)
++#define LPROC_SEQ_FOPS_RO(name)
++#define LPROC_SEQ_FOPS(name)
++
++/* lproc_ptlrpc.c */
++#define target_print_req NULL
++
++#endif /* LPROCFS */
++
++#endif /* LPROCFS_SNMP_H */
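As a usage note for the counter API declared in this header: a stats block is allocated once, each slot is described with lprocfs_counter_init(), and hot paths then call the incr/add helpers. A hedged sketch, assuming the declarations above are in scope; the EX_* indices, names and units are invented for this example:

/* Sketch only: one by-one counter and one multi-valued
 * (min/max/avg) counter, ticked from a hypothetical hot path. */
enum { EX_OPENS = 0, EX_READ_BYTES, EX_NR_COUNTERS };

static struct lprocfs_stats *ex_stats;

static int ex_stats_setup(void)
{
        /* flags == 0 selects the default per-cpu counter areas */
        ex_stats = lprocfs_alloc_stats(EX_NR_COUNTERS, 0);
        if (ex_stats == NULL)
                return -ENOMEM;
        lprocfs_counter_init(ex_stats, EX_OPENS, 0, "opens", "reqs");
        lprocfs_counter_init(ex_stats, EX_READ_BYTES,
                             LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
                             "read_bytes", "bytes");
        return 0;
}

static void ex_account_read(long nbytes)
{
        lprocfs_counter_incr(ex_stats, EX_OPENS);             /* by-one */
        lprocfs_counter_add(ex_stats, EX_READ_BYTES, nbytes); /* multi-valued */
}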
+diff -urNad lustre~/lustre/llite/file.c lustre/lustre/llite/file.c
+--- lustre~/lustre/llite/file.c 2009-03-12 11:02:39.000000000 +0100
++++ lustre/lustre/llite/file.c 2009-03-12 11:02:51.000000000 +0100
+@@ -1801,11 +1801,12 @@
+ #endif
+ }
+
++#ifdef HAVE_KERNEL_SENDFILE
+ /*
+ * Send file content (through pagecache) somewhere with helper
+ */
+-static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
+- read_actor_t actor, void *target)
++static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
++ size_t count, read_actor_t actor, void *target)
+ {
+ struct inode *inode = in_file->f_dentry->d_inode;
+ struct ll_inode_info *lli = ll_i2info(inode);
+@@ -1814,10 +1815,10 @@
+ struct ll_lock_tree_node *node;
+ struct ost_lvb lvb;
+ struct ll_ra_read bead;
+- int rc;
+- ssize_t retval;
++ ssize_t rc;
+ __u64 kms;
+ ENTRY;
++
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
+ inode->i_ino, inode->i_generation, inode, count, *ppos);
+
+@@ -1831,8 +1832,10 @@
+ in_file->f_ra.ra_pages = 0;
+
+ /* File with no objects, nothing to lock */
+- if (!lsm)
+- RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
++ if (!lsm) {
++ rc = generic_file_sendfile(in_file, ppos, count, actor, target);
++ RETURN(rc);
++ }
+
+ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
+ if (IS_ERR(node))
+@@ -1872,8 +1875,8 @@
+ /* A glimpse is necessary to determine whether we return a
+ * short read (B) or some zeroes at the end of the buffer (C) */
+ ll_inode_size_unlock(inode, 1);
+- retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
+- if (retval)
++ rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
++ if (rc)
+ goto out;
+ } else {
+ /* region is within kms and, hence, within real file size (A) */
+@@ -1889,13 +1892,115 @@
+ ll_ra_read_in(in_file, &bead);
+ /* BUG: 5972 */
+ file_accessed(in_file);
+- retval = generic_file_sendfile(in_file, ppos, count, actor, target);
++ rc = generic_file_sendfile(in_file, ppos, count, actor, target);
+ ll_ra_read_ex(in_file, &bead);
+
+ out:
+ ll_tree_unlock(&tree);
+- RETURN(retval);
++ RETURN(rc);
++}
++#endif
++
++/* change based on
++ * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27
++ */
++#ifdef HAVE_KERNEL_SPLICE_READ
++static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
++ struct pipe_inode_info *pipe, size_t count,
++ unsigned int flags)
++{
++ struct inode *inode = in_file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct ll_lock_tree tree;
++ struct ll_lock_tree_node *node;
++ struct ost_lvb lvb;
++ struct ll_ra_read bead;
++ ssize_t rc;
++ __u64 kms;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
++ inode->i_ino, inode->i_generation, inode, count, *ppos);
++
++ /* "If nbyte is 0, read() will return 0 and have no other results."
++ * -- Single Unix Spec */
++ if (count == 0)
++ RETURN(0);
++
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
++ /* turn off the kernel's read-ahead */
++ in_file->f_ra.ra_pages = 0;
++
++ /* File with no objects, nothing to lock */
++ if (!lsm) {
++ rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
++ RETURN(rc);
++ }
++
++ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
++ if (IS_ERR(node))
++ RETURN(PTR_ERR(node));
++
++ tree.lt_fd = LUSTRE_FPRIVATE(in_file);
++ rc = ll_tree_lock(&tree, node, NULL, count,
++ in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
++ if (rc != 0)
++ RETURN(rc);
++
++ ll_clear_file_contended(inode);
++ ll_inode_size_lock(inode, 1);
++ /*
++ * Consistency guarantees: following possibilities exist for the
++ * relation between region being read and real file size at this
++ * moment:
++ *
++ * (A): the region is completely inside of the file;
++ *
++ * (B-x): x bytes of region are inside of the file, the rest is
++ * outside;
++ *
++ * (C): the region is completely outside of the file.
++ *
++ * This classification is stable under DLM lock acquired by
++ * ll_tree_lock() above, because to change class, other client has to
++ * take DLM lock conflicting with our lock. Also, any updates to
++ * ->i_size by other threads on this client are serialized by
++ * ll_inode_size_lock(). This guarantees that short reads are handled
++ * correctly in the face of concurrent writes and truncates.
++ */
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
++ kms = lvb.lvb_size;
++ if (*ppos + count - 1 > kms) {
++ /* A glimpse is necessary to determine whether we return a
++ * short read (B) or some zeroes at the end of the buffer (C) */
++ ll_inode_size_unlock(inode, 1);
++ rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
++ if (rc)
++ goto out;
++ } else {
++ /* region is within kms and, hence, within real file size (A) */
++ i_size_write(inode, kms);
++ ll_inode_size_unlock(inode, 1);
++ }
++
++ CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
++ inode->i_ino, count, *ppos, i_size_read(inode));
++
++ bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
++ bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
++ ll_ra_read_in(in_file, &bead);
++ /* BUG: 5972 */
++ file_accessed(in_file);
++ rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
++ ll_ra_read_ex(in_file, &bead);
++
++ out:
++ ll_tree_unlock(&tree);
++ RETURN(rc);
+ }
++#endif
+
+ static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
+ unsigned long arg)
+@@ -3084,7 +3189,11 @@
+ }
+
+ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
++#ifndef HAVE_INODE_PERMISION_2ARGS
+ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
++#else
++int ll_inode_permission(struct inode *inode, int mask)
++#endif
+ {
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
+ inode->i_ino, inode->i_generation, inode, mask);
+@@ -3093,7 +3202,7 @@
+ return generic_permission(inode, mask, lustre_check_acl);
+ }
+ #else
+-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++#ifndef HAVE_INODE_PERMISION_2ARGS
+ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
+ #else
+ int ll_inode_permission(struct inode *inode, int mask)
+@@ -3163,7 +3272,12 @@
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
++#ifdef HAVE_KERNEL_SPLICE_READ
++ .splice_read = ll_file_splice_read,
++#endif
++#ifdef HAVE_KERNEL_SENDFILE
+ .sendfile = ll_file_sendfile,
++#endif
+ .fsync = ll_fsync,
+ };
+
+@@ -3185,7 +3299,12 @@
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
++#ifdef HAVE_KERNEL_SPLICE_READ
++ .splice_read = ll_file_splice_read,
++#endif
++#ifdef HAVE_KERNEL_SENDFILE
+ .sendfile = ll_file_sendfile,
++#endif
+ .fsync = ll_fsync,
+ #ifdef HAVE_F_OP_FLOCK
+ .flock = ll_file_flock,
+@@ -3212,7 +3331,12 @@
+ .release = ll_file_release,
+ .mmap = ll_file_mmap,
+ .llseek = ll_file_seek,
++#ifdef HAVE_KERNEL_SPLICE_READ
++ .splice_read = ll_file_splice_read,
++#endif
++#ifdef HAVE_KERNEL_SENDFILE
+ .sendfile = ll_file_sendfile,
++#endif
+ .fsync = ll_fsync,
+ #ifdef HAVE_F_OP_FLOCK
+ .flock = ll_file_noflock,
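All three file_operations tables in this file now register the read helpers conditionally because the struct member itself comes and goes: ->sendfile was removed in 2.6.23 in favour of ->splice_read (present since the splice work around 2.6.17), so naming the absent field would not even compile. A reduced sketch of the pattern; the example_* handlers are placeholders:

/* Sketch only, assuming HAVE_KERNEL_SPLICE_READ/HAVE_KERNEL_SENDFILE
 * are set by configure tests against the target kernel. */
struct file_operations example_fops = {
        .owner       = THIS_MODULE,
        .read        = example_read,
#ifdef HAVE_KERNEL_SPLICE_READ
        .splice_read = example_splice_read, /* since ~2.6.17 */
#endif
#ifdef HAVE_KERNEL_SENDFILE
        .sendfile    = example_sendfile,    /* removed in 2.6.23 */
#endif
};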
+diff -urNad lustre~/lustre/llite/file.c.orig lustre/lustre/llite/file.c.orig
+--- lustre~/lustre/llite/file.c.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/llite/file.c.orig 2009-03-12 11:02:39.000000000 +0100
+@@ -0,0 +1,3335 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ *
++ * lustre/llite/file.c
++ *
++ * Author: Peter Braam <braam at clusterfs.com>
++ * Author: Phil Schwan <phil at clusterfs.com>
++ * Author: Andreas Dilger <adilger at clusterfs.com>
++ */
++
++#define DEBUG_SUBSYSTEM S_LLITE
++#include <lustre_dlm.h>
++#include <lustre_lite.h>
++#include <linux/pagemap.h>
++#include <linux/file.h>
++#include <linux/posix_acl.h>
++#include "llite_internal.h"
++#include <lustre/ll_fiemap.h>
++
++/* also used by llite/special.c:ll_special_open() */
++struct ll_file_data *ll_file_data_get(void)
++{
++ struct ll_file_data *fd;
++
++ OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
++ return fd;
++}
++
++static void ll_file_data_put(struct ll_file_data *fd)
++{
++ if (fd != NULL)
++ OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
++}
++
++static int ll_close_inode_openhandle(struct inode *inode,
++ struct obd_client_handle *och)
++{
++ struct ptlrpc_request *req = NULL;
++ struct obd_device *obd;
++ struct obdo *oa;
++ int rc;
++ ENTRY;
++
++ obd = class_exp2obd(ll_i2mdcexp(inode));
++ if (obd == NULL) {
++ CERROR("Invalid MDC connection handle "LPX64"\n",
++ ll_i2mdcexp(inode)->exp_handle.h_cookie);
++ GOTO(out, rc = 0);
++ }
++
++ /*
++ * Here we check if this is a forced umount. If so, this is called on
++ * cancelling the "open lock" and we do not call mdc_close() in this case,
++ * as it would not succeed: the import is already deactivated.
++ */
++ if (obd->obd_force)
++ GOTO(out, rc = 0);
++
++ OBDO_ALLOC(oa);
++ if (!oa)
++ RETURN(-ENOMEM); // XXX We leak openhandle and request here.
++
++ oa->o_id = inode->i_ino;
++ oa->o_valid = OBD_MD_FLID;
++ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
++ OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
++ OBD_MD_FLATIME | OBD_MD_FLMTIME |
++ OBD_MD_FLCTIME);
++ if (ll_is_inode_dirty(inode)) {
++ oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
++ oa->o_valid |= OBD_MD_FLFLAGS;
++ }
++
++ rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
++ if (rc == EAGAIN) {
++ /* We are the last writer, so the MDS has instructed us to get
++ * the file size and any write cookies, then close again. */
++ ll_queue_done_writing(inode);
++ rc = 0;
++ } else if (rc) {
++ CERROR("inode %lu mdc close failed: rc = %d\n",
++ inode->i_ino, rc);
++ }
++
++ OBDO_FREE(oa);
++
++ if (rc == 0) {
++ rc = ll_objects_destroy(req, inode);
++ if (rc)
++ CERROR("inode %lu ll_objects destroy: rc = %d\n",
++ inode->i_ino, rc);
++ }
++
++ ptlrpc_req_finished(req); /* This is close request */
++ EXIT;
++out:
++ mdc_clear_open_replay_data(och);
++
++ return rc;
++}
++
++int ll_mdc_real_close(struct inode *inode, int flags)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ int rc = 0;
++ struct obd_client_handle **och_p;
++ struct obd_client_handle *och;
++ __u64 *och_usecount;
++
++ ENTRY;
++
++ if (flags & FMODE_WRITE) {
++ och_p = &lli->lli_mds_write_och;
++ och_usecount = &lli->lli_open_fd_write_count;
++ } else if (flags & FMODE_EXEC) {
++ och_p = &lli->lli_mds_exec_och;
++ och_usecount = &lli->lli_open_fd_exec_count;
++ } else {
++ LASSERT(flags & FMODE_READ);
++ och_p = &lli->lli_mds_read_och;
++ och_usecount = &lli->lli_open_fd_read_count;
++ }
++
++ down(&lli->lli_och_sem);
++ if (*och_usecount) { /* There are still users of this handle, so
++ skip freeing it. */
++ up(&lli->lli_och_sem);
++ RETURN(0);
++ }
++ och=*och_p;
++ *och_p = NULL;
++ up(&lli->lli_och_sem);
++
++ if (och) { /* There might be a race and somebody may have freed
++ this och already */
++ rc = ll_close_inode_openhandle(inode, och);
++ och->och_fh.cookie = DEAD_HANDLE_MAGIC;
++ OBD_FREE(och, sizeof *och);
++ }
++
++ RETURN(rc);
++}
++
++int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
++ struct file *file)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ struct ll_inode_info *lli = ll_i2info(inode);
++ int rc = 0;
++ ENTRY;
++
++ /* clear group lock, if present */
++ if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
++ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
++ fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
++ rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
++ &fd->fd_cwlockh);
++ }
++
++ /* Let's see if we have a good enough OPEN lock on the file and
++ whether we can skip talking to the MDS */
++ if (file->f_dentry->d_inode) { /* Can this ever be false? */
++ int lockmode;
++ int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
++ struct lustre_handle lockh;
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ldlm_res_id file_res_id = {.name={inode->i_ino,
++ inode->i_generation}};
++ ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
++
++ down(&lli->lli_och_sem);
++ if (fd->fd_omode & FMODE_WRITE) {
++ lockmode = LCK_CW;
++ LASSERT(lli->lli_open_fd_write_count);
++ lli->lli_open_fd_write_count--;
++ } else if (fd->fd_omode & FMODE_EXEC) {
++ lockmode = LCK_PR;
++ LASSERT(lli->lli_open_fd_exec_count);
++ lli->lli_open_fd_exec_count--;
++ } else {
++ lockmode = LCK_CR;
++ LASSERT(lli->lli_open_fd_read_count);
++ lli->lli_open_fd_read_count--;
++ }
++ up(&lli->lli_och_sem);
++
++ if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
++ &file_res_id, LDLM_IBITS, &policy,lockmode,
++ &lockh)) {
++ rc = ll_mdc_real_close(file->f_dentry->d_inode,
++ fd->fd_omode);
++ }
++ } else {
++ CERROR("Releasing a file %p with negative dentry %p. Name %s",
++ file, file->f_dentry, file->f_dentry->d_name.name);
++ }
++
++ LUSTRE_FPRIVATE(file) = NULL;
++ ll_file_data_put(fd);
++
++ RETURN(rc);
++}
++
++int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
++
++/* While this returns an error code, fput() in the caller does not, so we need
++ * to make every effort to clean up all of our state here. Also, applications
++ * rarely check close errors and even if an error is returned they will not
++ * re-try the close call.
++ */
++int ll_file_release(struct inode *inode, struct file *file)
++{
++ struct ll_file_data *fd;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ int rc;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
++ inode->i_generation, inode);
++
++
++ if (inode->i_sb->s_root != file->f_dentry)
++ ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
++ fd = LUSTRE_FPRIVATE(file);
++ LASSERT(fd != NULL);
++
++ /* The last ref on @file, maybe not the owner pid of statahead.
++ * Different processes can open the same dir, "ll_opendir_key" means:
++ * it is me that should stop the statahead thread. */
++ if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
++ ll_stop_statahead(inode, lli->lli_opendir_key);
++
++ if (inode->i_sb->s_root == file->f_dentry) {
++ LUSTRE_FPRIVATE(file) = NULL;
++ ll_file_data_put(fd);
++ RETURN(0);
++ }
++
++ if (lsm)
++ lov_test_and_clear_async_rc(lsm);
++ lli->lli_async_rc = 0;
++
++ /* Ensure that dirty pages are flushed out with the right creds */
++ if (file->f_mode & FMODE_WRITE)
++ filemap_fdatawrite(file->f_mapping);
++
++ rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
++ RETURN(rc);
++}
++
++static int ll_intent_file_open(struct file *file, void *lmm,
++ int lmmsize, struct lookup_intent *itp)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
++ struct mdc_op_data data;
++ struct dentry *parent = file->f_dentry->d_parent;
++ const char *name = file->f_dentry->d_name.name;
++ const int len = file->f_dentry->d_name.len;
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ptlrpc_request *req;
++ int rc;
++ ENTRY;
++
++ if (!parent)
++ RETURN(-ENOENT);
++
++ ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
++ name, len, O_RDWR, NULL);
++
++ /* Usually we come here only for NFSD, and we want the open lock.
++ But we can also get here with pre-2.6.15 patchless kernels, and in
++ that case that lock is also ok */
++ /* We can also get here if there was cached open handle in revalidate_it
++ * but it disappeared while we were getting from there to ll_file_open.
++ * But this means this file was closed and immediately opened, which
++ * makes a good candidate for using OPEN lock */
++ /* If lmmsize & lmm are not 0, we are just setting stripe info
++ * parameters. No need for the open lock */
++ if (!lmm && !lmmsize)
++ itp->it_flags |= MDS_OPEN_LOCK;
++
++ rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
++ 0 /*unused */, &req, ll_mdc_blocking_ast, 0);
++ if (rc == -ESTALE) {
++ /* reason for keeping our own exit path - don't flood the log
++ * with -ESTALE error messages.
++ */
++ if (!it_disposition(itp, DISP_OPEN_OPEN) ||
++ it_open_error(DISP_OPEN_OPEN, itp))
++ GOTO(out, rc);
++ ll_release_openhandle(file->f_dentry, itp);
++ GOTO(out, rc);
++ }
++
++ if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
++ rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
++ CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
++ GOTO(out, rc);
++ }
++
++ if (itp->d.lustre.it_lock_mode)
++ mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
++ inode);
++
++ rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
++ req, DLM_REPLY_REC_OFF, NULL);
++out:
++ ptlrpc_req_finished(itp->d.lustre.it_data);
++ it_clear_disposition(itp, DISP_ENQ_COMPLETE);
++ ll_intent_drop_lock(itp);
++
++ RETURN(rc);
++}
++
++
++static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
++ struct obd_client_handle *och)
++{
++ struct ptlrpc_request *req = it->d.lustre.it_data;
++ struct mds_body *body;
++
++ LASSERT(och);
++
++ body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
++ LASSERT(body != NULL); /* reply already checked out */
++ /* and swabbed in mdc_enqueue */
++ LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
++
++ memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
++ och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
++ lli->lli_io_epoch = body->io_epoch;
++
++ mdc_set_open_replay_data(och, it->d.lustre.it_data);
++}
++
++int ll_local_open(struct file *file, struct lookup_intent *it,
++ struct ll_file_data *fd, struct obd_client_handle *och)
++{
++ ENTRY;
++
++ LASSERT(!LUSTRE_FPRIVATE(file));
++
++ LASSERT(fd != NULL);
++
++ if (och)
++ ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
++ LUSTRE_FPRIVATE(file) = fd;
++ ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
++ fd->fd_omode = it->it_flags;
++
++ RETURN(0);
++}
++
++/* Open a file, and (for the very first open) create objects on the OSTs at
++ * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
++ * creation or open until ll_lov_setstripe() ioctl is called. We grab
++ * lli_open_sem to ensure no other process will create objects, send the
++ * stripe MD to the MDS, or try to destroy the objects if that fails.
++ *
++ * If we already have the stripe MD locally then we don't request it in
++ * mdc_open(), by passing a lmm_size = 0.
++ *
++ * It is up to the application to ensure no other processes open this file
++ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
++ * used. We might be able to avoid races of that sort by getting lli_open_sem
++ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
++ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
++ */
++int ll_file_open(struct inode *inode, struct file *file)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lookup_intent *it, oit = { .it_op = IT_OPEN,
++ .it_flags = file->f_flags };
++ struct lov_stripe_md *lsm;
++ struct ptlrpc_request *req = NULL;
++ struct obd_client_handle **och_p;
++ __u64 *och_usecount;
++ struct ll_file_data *fd;
++ int rc = 0, opendir_set = 0;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
++ inode->i_generation, inode, file->f_flags);
++
++#ifdef HAVE_VFS_INTENT_PATCHES
++ it = file->f_it;
++#else
++ it = file->private_data; /* XXX: compat macro */
++ file->private_data = NULL; /* prevent ll_local_open assertion */
++#endif
++
++ fd = ll_file_data_get();
++ if (fd == NULL)
++ RETURN(-ENOMEM);
++
++ if (S_ISDIR(inode->i_mode)) {
++again:
++ spin_lock(&lli->lli_lock);
++ if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
++ LASSERT(lli->lli_sai == NULL);
++ lli->lli_opendir_key = fd;
++ lli->lli_opendir_pid = cfs_curproc_pid();
++ opendir_set = 1;
++ } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
++ lli->lli_opendir_key != NULL)) {
++ /* Two cases for this:
++ * (1) The same process open such directory many times.
++ * (2) The old process opened the directory, and exited
++ * before its children processes. Then new process
++ * with the same pid opens such directory before the
++ * old process's children processes exit.
++ * reset stat ahead for such cases. */
++ spin_unlock(&lli->lli_lock);
++ CDEBUG(D_INFO, "Conflict statahead for %.*s %lu/%u"
++ " reset it.\n", file->f_dentry->d_name.len,
++ file->f_dentry->d_name.name,
++ inode->i_ino, inode->i_generation);
++ ll_stop_statahead(inode, lli->lli_opendir_key);
++ goto again;
++ }
++ spin_unlock(&lli->lli_lock);
++ }
++
++ if (inode->i_sb->s_root == file->f_dentry) {
++ LUSTRE_FPRIVATE(file) = fd;
++ RETURN(0);
++ }
++
++ if (!it || !it->d.lustre.it_disposition) {
++ /* Convert f_flags into access mode. We cannot use file->f_mode,
++ * because everything but O_ACCMODE mask was stripped from it */
++ if ((oit.it_flags + 1) & O_ACCMODE)
++ oit.it_flags++;
++ if (file->f_flags & O_TRUNC)
++ oit.it_flags |= FMODE_WRITE;
++
++ /* The kernel only calls f_op->open from dentry_open. filp_open
++ * calls dentry_open after open_namei has checked permissions.
++ * Only nfsd_open calls dentry_open directly without checking
++ * permissions, which is why the code below is safe. */
++ if (oit.it_flags & FMODE_WRITE)
++ oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
++
++ /* We do not want O_EXCL here, presumably we opened the file
++ * already? XXX - NFS implications? */
++ oit.it_flags &= ~O_EXCL;
++
++ it = &oit;
++ }
++
++restart:
++ /* Let's see if we have file open on MDS already. */
++ if (it->it_flags & FMODE_WRITE) {
++ och_p = &lli->lli_mds_write_och;
++ och_usecount = &lli->lli_open_fd_write_count;
++ } else if (it->it_flags & FMODE_EXEC) {
++ och_p = &lli->lli_mds_exec_och;
++ och_usecount = &lli->lli_open_fd_exec_count;
++ } else {
++ och_p = &lli->lli_mds_read_och;
++ och_usecount = &lli->lli_open_fd_read_count;
++ }
++
++ LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
++ it->d.lustre.it_disposition);
++
++ down(&lli->lli_och_sem);
++ if (*och_p) { /* Open handle is present */
++ if (it_disposition(it, DISP_OPEN_OPEN)) {
++ /* Well, there's extra open request that we do not need,
++ let's close it somehow. This will decref request. */
++ rc = it_open_error(DISP_OPEN_OPEN, it);
++ if (rc) {
++ up(&lli->lli_och_sem);
++ ll_file_data_put(fd);
++ GOTO(out_openerr, rc);
++ }
++ ll_release_openhandle(file->f_dentry, it);
++ lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
++ LPROC_LL_OPEN);
++ }
++ (*och_usecount)++;
++
++ rc = ll_local_open(file, it, fd, NULL);
++
++ LASSERTF(rc == 0, "rc = %d\n", rc);
++ } else {
++ LASSERT(*och_usecount == 0);
++ if (!it->d.lustre.it_disposition) {
++ /* We cannot just request a lock handle now; the new ELC code
++ means that one of the other OPEN locks for this file could
++ be cancelled, and since the blocking AST handler would
++ attempt to grab och_sem as well, that would result
++ in a deadlock */
++ up(&lli->lli_och_sem);
++ rc = ll_intent_file_open(file, NULL, 0, it);
++ if (rc) {
++ ll_file_data_put(fd);
++ GOTO(out_openerr, rc);
++ }
++
++ mdc_set_lock_data(&it->d.lustre.it_lock_handle,
++ file->f_dentry->d_inode);
++ goto restart;
++ }
++
++ OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
++ if (!*och_p) {
++ ll_file_data_put(fd);
++ GOTO(out_och_free, rc = -ENOMEM);
++ }
++ (*och_usecount)++;
++ req = it->d.lustre.it_data;
++
++ /* mdc_intent_lock() didn't get a request ref if there was an
++ * open error, so don't do cleanup on the request here
++ * (bug 3430) */
++ /* XXX (green): Should not we bail out on any error here, not
++ * just open error? */
++ rc = it_open_error(DISP_OPEN_OPEN, it);
++ if (rc) {
++ ll_file_data_put(fd);
++ GOTO(out_och_free, rc);
++ }
++
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
++ rc = ll_local_open(file, it, fd, *och_p);
++ LASSERTF(rc == 0, "rc = %d\n", rc);
++ }
++ up(&lli->lli_och_sem);
++
++ /* Must do this outside lli_och_sem lock to prevent deadlock where
++ different kind of OPEN lock for this same inode gets cancelled
++ by ldlm_cancel_lru */
++ if (!S_ISREG(inode->i_mode))
++ GOTO(out, rc);
++
++ lsm = lli->lli_smd;
++ if (lsm == NULL) {
++ if (file->f_flags & O_LOV_DELAY_CREATE ||
++ !(file->f_mode & FMODE_WRITE)) {
++ CDEBUG(D_INODE, "object creation was delayed\n");
++ GOTO(out, rc);
++ }
++ }
++ file->f_flags &= ~O_LOV_DELAY_CREATE;
++ GOTO(out, rc);
++ out:
++ ptlrpc_req_finished(req);
++ if (req)
++ it_clear_disposition(it, DISP_ENQ_OPEN_REF);
++ if (rc == 0) {
++ ll_open_complete(inode);
++ } else {
++out_och_free:
++ if (*och_p) {
++ OBD_FREE(*och_p, sizeof (struct obd_client_handle));
++ *och_p = NULL; /* OBD_FREE writes some magic there */
++ (*och_usecount)--;
++ }
++ up(&lli->lli_och_sem);
++out_openerr:
++ if (opendir_set != 0)
++ ll_stop_statahead(inode, lli->lli_opendir_key);
++ }
++
++ return rc;
++}
++
++/* Fills the obdo with the attributes for the inode defined by lsm */
++int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
++ struct obdo *oa)
++{
++ struct ptlrpc_request_set *set;
++ struct obd_info oinfo = { { { 0 } } };
++ int rc;
++ ENTRY;
++
++ LASSERT(lsm != NULL);
++
++ memset(oa, 0, sizeof *oa);
++ oinfo.oi_md = lsm;
++ oinfo.oi_oa = oa;
++ oa->o_id = lsm->lsm_object_id;
++ oa->o_mode = S_IFREG;
++ oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
++ OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
++ OBD_MD_FLCTIME;
++
++ set = ptlrpc_prep_set();
++ if (set == NULL) {
++ rc = -ENOMEM;
++ } else {
++ rc = obd_getattr_async(exp, &oinfo, set);
++ if (rc == 0)
++ rc = ptlrpc_set_wait(set);
++ ptlrpc_set_destroy(set);
++ }
++ if (rc)
++ RETURN(rc);
++
++ oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
++ OBD_MD_FLCTIME | OBD_MD_FLSIZE);
++ RETURN(0);
++}
++
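++/* Map an extent DLM lock back to the index of the stripe it covers.
++ * Single-stripe files trivially map to stripe 0; otherwise the LOV is
++ * asked via obd_get_info(KEY_LOCK_TO_STRIPE). The lock's resource is
++ * then checked against the stripe object as a sanity test. Returns
++ * the stripe index, or -ELDLM_NO_LOCK_DATA on a mismatch. */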
++static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct obd_export *exp = ll_i2obdexp(inode);
++ struct {
++ char name[16];
++ struct ldlm_lock *lock;
++ } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
++ __u32 stripe, vallen = sizeof(stripe);
++ int rc;
++ ENTRY;
++
++ if (lsm->lsm_stripe_count == 1)
++ GOTO(check, stripe = 0);
++
++ /* get our offset in the lov */
++ rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
++ if (rc != 0) {
++ CERROR("obd_get_info: rc = %d\n", rc);
++ RETURN(rc);
++ }
++ LASSERT(stripe < lsm->lsm_stripe_count);
++
++check:
++ if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
++ lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){
++ LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
++ lsm->lsm_oinfo[stripe]->loi_id,
++ lsm->lsm_oinfo[stripe]->loi_gr);
++ RETURN(-ELDLM_NO_LOCK_DATA);
++ }
++
++ RETURN(stripe);
++}
++
++/* Get extra page reference to ensure it is not going away */
++void ll_pin_extent_cb(void *data)
++{
++ struct page *page = data;
++
++ page_cache_get(page);
++
++ return;
++}
++
++/* Flush the page from the page cache for an extent as it is cancelled.
++ * Page to remove is delivered as @data.
++ *
++ * No one can dirty the extent until we've finished our work and they cannot
++ * enqueue another lock. The DLM protects us from ll_file_read/write here,
++ * but other kernel actors could have pages locked.
++ *
++ * If @discard is set, there is no need to write the page if it is dirty.
++ *
++ * Called with the DLM lock held. */
++int ll_page_removal_cb(void *data, int discard)
++{
++ int rc;
++ struct page *page = data;
++ struct address_space *mapping;
++
++ ENTRY;
++
++ /* We have page reference already from ll_pin_page */
++ lock_page(page);
++
++ /* Already truncated by somebody */
++ if (!page->mapping)
++ GOTO(out, rc = 0);
++
++ mapping = page->mapping;
++
++ ll_teardown_mmaps(mapping,
++ (__u64)page->index << PAGE_CACHE_SHIFT,
++ ((__u64)page->index<<PAGE_CACHE_SHIFT)|
++ ~PAGE_CACHE_MASK);
++ LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
++ if (!discard && PageWriteback(page))
++ wait_on_page_writeback(page);
++
++ if (!discard && clear_page_dirty_for_io(page)) {
++ rc = ll_call_writepage(page->mapping->host, page);
++ /* either waiting for io to complete or reacquiring
++ * the lock that the failed writepage released */
++ lock_page(page);
++ wait_on_page_writeback(page);
++ if (rc < 0) {
++ CERROR("writepage inode %lu(%p) of page %p "
++ "failed: %d\n", mapping->host->i_ino,
++ mapping->host, page, rc);
++ if (rc == -ENOSPC)
++ set_bit(AS_ENOSPC, &mapping->flags);
++ else
++ set_bit(AS_EIO, &mapping->flags);
++ }
++ }
++ if (page->mapping != NULL) {
++ struct ll_async_page *llap = llap_cast_private(page);
++ // checking again to account for writeback's lock_page()
++ LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
++ if (llap)
++ ll_ra_accounting(llap, page->mapping);
++ ll_truncate_complete_page(page);
++ }
++ EXIT;
++out:
++ LASSERT(!PageWriteback(page));
++ unlock_page(page);
++ page_cache_release(page);
++
++ return 0;
++}
++
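++/* Extent lock cancellation callback: when a lock on this file goes
++ * away, recompute the known minimum size (KMS) for the affected
++ * stripe so stale size information is not trusted afterwards. */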
++int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
++ void *data, int flag)
++{
++ struct inode *inode;
++ struct ll_inode_info *lli;
++ struct lov_stripe_md *lsm;
++ int stripe;
++ __u64 kms;
++
++ ENTRY;
++
++ if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
++ LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
++ LBUG();
++ }
++
++ inode = ll_inode_from_lock(lock);
++ if (inode == NULL)
++ RETURN(0);
++ lli = ll_i2info(inode);
++ if (lli == NULL)
++ GOTO(iput, 0);
++ if (lli->lli_smd == NULL)
++ GOTO(iput, 0);
++ lsm = lli->lli_smd;
++
++ stripe = ll_lock_to_stripe_offset(inode, lock);
++ if (stripe < 0)
++ GOTO(iput, 0);
++
++ lov_stripe_lock(lsm);
++ lock_res_and_lock(lock);
++ kms = ldlm_extent_shift_kms(lock,
++ lsm->lsm_oinfo[stripe]->loi_kms);
++
++ if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
++ LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
++ lsm->lsm_oinfo[stripe]->loi_kms, kms);
++ lsm->lsm_oinfo[stripe]->loi_kms = kms;
++ unlock_res_and_lock(lock);
++ lov_stripe_unlock(lsm);
++ ll_try_done_writing(inode);
++ EXIT;
++iput:
++ iput(inode);
++
++ return 0;
++}
++
++#if 0
++int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
++{
++ /* XXX ALLOCATE - 160 bytes */
++ struct inode *inode = ll_inode_from_lock(lock);
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lustre_handle lockh = { 0 };
++ struct ost_lvb *lvb;
++ int stripe;
++ ENTRY;
++
++ if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
++ LDLM_FL_BLOCK_CONV)) {
++ LBUG(); /* not expecting any blocked async locks yet */
++ LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
++ "lock, returning");
++ ldlm_lock_dump(D_OTHER, lock, 0);
++ ldlm_reprocess_all(lock->l_resource);
++ RETURN(0);
++ }
++
++ LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
++
++ stripe = ll_lock_to_stripe_offset(inode, lock);
++ if (stripe < 0)
++ goto iput;
++
++ if (lock->l_lvb_len) {
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ __u64 kms;
++ lvb = lock->l_lvb_data;
++ lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
++
++ lock_res_and_lock(lock);
++ ll_inode_size_lock(inode, 1);
++ kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
++ kms = ldlm_extent_shift_kms(NULL, kms);
++ if (lsm->lsm_oinfo[stripe].loi_kms != kms)
++ LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
++ lsm->lsm_oinfo[stripe].loi_kms, kms);
++ lsm->lsm_oinfo[stripe].loi_kms = kms;
++ ll_inode_size_unlock(inode, 1);
++ unlock_res_and_lock(lock);
++ }
++
++iput:
++ iput(inode);
++ wake_up(&lock->l_waitq);
++
++ ldlm_lock2handle(lock, &lockh);
++ ldlm_lock_decref(&lockh, LCK_PR);
++ RETURN(0);
++}
++#endif
++
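++/* Glimpse callback: run on a client that holds a conflicting lock when
++ * another node issues a glimpse. The reply LVB is filled with this
++ * client's KMS for the stripe plus the inode times, letting the
++ * glimpsing node compute the file size without revoking locks. */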
++static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
++{
++ struct ptlrpc_request *req = reqp;
++ struct inode *inode = ll_inode_from_lock(lock);
++ struct ll_inode_info *lli;
++ struct lov_stripe_md *lsm;
++ struct ost_lvb *lvb;
++ int rc, stripe;
++ int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
++ ENTRY;
++
++ if (inode == NULL)
++ GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
++ lli = ll_i2info(inode);
++ if (lli == NULL)
++ GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
++ lsm = lli->lli_smd;
++ if (lsm == NULL)
++ GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
++
++ /* First, find out which stripe index this lock corresponds to. */
++ stripe = ll_lock_to_stripe_offset(inode, lock);
++ if (stripe < 0)
++ GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
++
++ rc = lustre_pack_reply(req, 2, size, NULL);
++ if (rc)
++ GOTO(iput, rc);
++
++ lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
++ lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
++ lvb->lvb_mtime = LTIME_S(inode->i_mtime);
++ lvb->lvb_atime = LTIME_S(inode->i_atime);
++ lvb->lvb_ctime = LTIME_S(inode->i_ctime);
++
++ LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
++ " atime "LPU64", mtime "LPU64", ctime "LPU64,
++ i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
++ lvb->lvb_atime, lvb->lvb_ctime);
++ iput:
++ iput(inode);
++
++ out:
++ /* These errors are normal races, so we don't want to fill the console
++ * with messages by calling ptlrpc_error() */
++ if (rc == -ELDLM_NO_LOCK_DATA)
++ lustre_pack_reply(req, 1, NULL, NULL);
++
++ req->rq_status = rc;
++ return rc;
++}
++
++int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
++ lstat_t *st)
++{
++ struct lustre_handle lockh = { 0 };
++ struct ldlm_enqueue_info einfo = { 0 };
++ struct obd_info oinfo = { { { 0 } } };
++ struct ost_lvb lvb;
++ int rc;
++
++ ENTRY;
++
++ einfo.ei_type = LDLM_EXTENT;
++ einfo.ei_mode = LCK_PR;
++ einfo.ei_cb_bl = osc_extent_blocking_cb;
++ einfo.ei_cb_cp = ldlm_completion_ast;
++ einfo.ei_cb_gl = ll_glimpse_callback;
++ einfo.ei_cbdata = NULL;
++
++ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
++ oinfo.oi_lockh = &lockh;
++ oinfo.oi_md = lsm;
++ oinfo.oi_flags = LDLM_FL_HAS_INTENT;
++
++ rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
++ if (rc == -ENOENT)
++ RETURN(rc);
++ if (rc != 0) {
++ CERROR("obd_enqueue returned rc %d, "
++ "returning -EIO\n", rc);
++ RETURN(rc > 0 ? -EIO : rc);
++ }
++
++ lov_stripe_lock(lsm);
++ memset(&lvb, 0, sizeof(lvb));
++ obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
++ st->st_size = lvb.lvb_size;
++ st->st_blocks = lvb.lvb_blocks;
++ st->st_mtime = lvb.lvb_mtime;
++ st->st_atime = lvb.lvb_atime;
++ st->st_ctime = lvb.lvb_ctime;
++ lov_stripe_unlock(lsm);
++
++ RETURN(rc);
++}
++
++/* NB: obd_merge_lvb will prefer locally cached writes if they extend the
++ * file (because it prefers KMS over RSS when larger) */
++int ll_glimpse_size(struct inode *inode, int ast_flags)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct lustre_handle lockh = { 0 };
++ struct ldlm_enqueue_info einfo = { 0 };
++ struct obd_info oinfo = { { { 0 } } };
++ struct ost_lvb lvb;
++ int rc;
++ ENTRY;
++
++ CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
++
++ if (!lli->lli_smd) {
++ CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
++ RETURN(0);
++ }
++
++ /* NOTE: this looks like a DLM lock request, but it may not be one.
++ * Due to the LDLM_FL_HAS_INTENT flag, this is a glimpse
++ * request that won't revoke any conflicting DLM locks held.
++ * Instead, ll_glimpse_callback() will be called on each client
++ * holding a DLM lock against this file, and the resulting size
++ * will be returned for each stripe. A DLM lock on [0, EOF] is
++ * acquired only if there were no conflicting locks. */
++ einfo.ei_type = LDLM_EXTENT;
++ einfo.ei_mode = LCK_PR;
++ einfo.ei_cb_bl = osc_extent_blocking_cb;
++ einfo.ei_cb_cp = ldlm_completion_ast;
++ einfo.ei_cb_gl = ll_glimpse_callback;
++ einfo.ei_cbdata = inode;
++
++ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
++ oinfo.oi_lockh = &lockh;
++ oinfo.oi_md = lli->lli_smd;
++ oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
++
++ rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
++ if (rc == -ENOENT)
++ RETURN(rc);
++ if (rc != 0) {
++ CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
++ RETURN(rc > 0 ? -EIO : rc);
++ }
++
++ ll_inode_size_lock(inode, 1);
++ inode_init_lvb(inode, &lvb);
++ rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
++ i_size_write(inode, lvb.lvb_size);
++ inode->i_blocks = lvb.lvb_blocks;
++ LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
++ LTIME_S(inode->i_atime) = lvb.lvb_atime;
++ LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
++ ll_inode_size_unlock(inode, 1);
++
++ CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
++ i_size_read(inode), (long long)inode->i_blocks);
++
++ RETURN(rc);
++}
++
++int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
++ struct lov_stripe_md *lsm, int mode,
++ ldlm_policy_data_t *policy, struct lustre_handle *lockh,
++ int ast_flags)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ost_lvb lvb;
++ struct ldlm_enqueue_info einfo = { 0 };
++ struct obd_info oinfo = { { { 0 } } };
++ int rc;
++ ENTRY;
++
++ LASSERT(!lustre_handle_is_used(lockh));
++ LASSERT(lsm != NULL);
++
++ /* don't drop the mmapped file to LRU */
++ if (mapping_mapped(inode->i_mapping))
++ ast_flags |= LDLM_FL_NO_LRU;
++
++ /* XXX phil: can we do this? won't it screw the file size up? */
++ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
++ (sbi->ll_flags & LL_SBI_NOLCK))
++ RETURN(0);
++
++ CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
++ inode->i_ino, policy->l_extent.start, policy->l_extent.end);
++
++ einfo.ei_type = LDLM_EXTENT;
++ einfo.ei_mode = mode;
++ einfo.ei_cb_bl = osc_extent_blocking_cb;
++ einfo.ei_cb_cp = ldlm_completion_ast;
++ einfo.ei_cb_gl = ll_glimpse_callback;
++ einfo.ei_cbdata = inode;
++
++ oinfo.oi_policy = *policy;
++ oinfo.oi_lockh = lockh;
++ oinfo.oi_md = lsm;
++ oinfo.oi_flags = ast_flags;
++
++ rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
++ *policy = oinfo.oi_policy;
++ if (rc > 0)
++ rc = -EIO;
++
++ ll_inode_size_lock(inode, 1);
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
++
++ if (policy->l_extent.start == 0 &&
++ policy->l_extent.end == OBD_OBJECT_EOF) {
++ /* vmtruncate()->ll_truncate() first sets the i_size and then
++ * the kms under both a DLM lock and the
++ * ll_inode_size_lock(). If we don't get the
++ * ll_inode_size_lock() here we can match the DLM lock and
++ * reset i_size from the kms before the truncating path has
++ * updated the kms. generic_file_write can then trust the
++ * stale i_size when doing appending writes and effectively
++ * cancel the result of the truncate. Getting the
++ * ll_inode_size_lock() after the enqueue maintains the DLM
++ * -> ll_inode_size_lock() acquiring order. */
++ i_size_write(inode, lvb.lvb_size);
++ CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
++ inode->i_ino, i_size_read(inode));
++ }
++
++ if (rc == 0) {
++ LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
++ LTIME_S(inode->i_atime) = lvb.lvb_atime;
++ LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
++ }
++ ll_inode_size_unlock(inode, 1);
++
++ RETURN(rc);
++}
++
++int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
++ struct lov_stripe_md *lsm, int mode,
++ struct lustre_handle *lockh)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ int rc;
++ ENTRY;
++
++ /* XXX phil: can we do this? won't it screw the file size up? */
++ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
++ (sbi->ll_flags & LL_SBI_NOLCK))
++ RETURN(0);
++
++ rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh);
++
++ RETURN(rc);
++}
++
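++/* Contention tracking for lockless I/O: a file is flagged contended
++ * when a tree lock request fails with -EUSERS, and the flag expires
++ * after ll_contention_time seconds. While set, reads and writes take
++ * the lockless path instead of acquiring DLM extent locks. */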
++static void ll_set_file_contended(struct inode *inode)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++
++ lli->lli_contention_time = cfs_time_current();
++ set_bit(LLI_F_CONTENDED, &lli->lli_flags);
++}
++
++void ll_clear_file_contended(struct inode *inode)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++
++ clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
++}
++
++static int ll_is_file_contended(struct file *file)
++{
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ ENTRY;
++
++ if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
++ CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
++ " osc connect flags = 0x"LPX64"\n",
++ sbi->ll_lco.lco_flags);
++ RETURN(0);
++ }
++ if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
++ RETURN(1);
++ if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
++ cfs_time_t cur_time = cfs_time_current();
++ cfs_time_t retry_time;
++
++ retry_time = cfs_time_add(
++ lli->lli_contention_time,
++ cfs_time_seconds(sbi->ll_contention_time));
++ if (cfs_time_after(cur_time, retry_time)) {
++ ll_clear_file_contended(inode);
++ RETURN(0);
++ }
++ RETURN(1);
++ }
++ RETURN(0);
++}
++
++static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
++ struct file *file, const struct iovec *iov,
++ unsigned long nr_segs,
++ loff_t start, loff_t end, int rw)
++{
++ int append;
++ int tree_locked = 0;
++ int rc;
++ struct inode * inode = file->f_dentry->d_inode;
++
++ append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
++
++ if (append || !ll_is_file_contended(file)) {
++ struct ll_lock_tree_node *node;
++ int ast_flags;
++
++ ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
++ if (file->f_flags & O_NONBLOCK)
++ ast_flags |= LDLM_FL_BLOCK_NOWAIT;
++ node = ll_node_from_inode(inode, start, end,
++ (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
++ if (IS_ERR(node)) {
++ rc = PTR_ERR(node);
++ GOTO(out, rc);
++ }
++ tree->lt_fd = LUSTRE_FPRIVATE(file);
++ rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
++ if (rc == 0)
++ tree_locked = 1;
++ else if (rc == -EUSERS)
++ ll_set_file_contended(inode);
++ else
++ GOTO(out, rc);
++ }
++ RETURN(tree_locked);
++out:
++ return rc;
++}
++
++/* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
++ */
++static size_t ll_file_get_iov_count(const struct iovec *iov,
++ unsigned long *nr_segs)
++{
++ size_t count = 0;
++ unsigned long seg;
++
++ for (seg = 0; seg < *nr_segs; seg++) {
++ const struct iovec *iv = &iov[seg];
++
++ /*
++ * If any segment has a negative length, or the cumulative
++ * length ever wraps negative then return -EINVAL.
++ */
++ count += iv->iov_len;
++ if (unlikely((ssize_t)(count|iv->iov_len) < 0))
++ return -EINVAL;
++ if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
++ continue;
++ if (seg == 0)
++ return -EFAULT;
++ *nr_segs = seg;
++ count -= iv->iov_len; /* This segment is no good */
++ break;
++ }
++ return count;
++}
++
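++/* Copy up to @size bytes worth of iovec segments into @iov_copy,
++ * skipping the first @*offset bytes of the first segment, and advance
++ * the caller's @iov_out/@nr_segs past the consumed segments so the
++ * next chunk of a split request continues where this one stopped. */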
++static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
++ unsigned long *nrsegs_copy,
++ struct iovec *iov_copy, size_t *offset,
++ size_t size)
++{
++ int i;
++ const struct iovec *iov = *iov_out;
++ for (i = 0; i < *nr_segs; i++) {
++ const struct iovec *iv = &iov[i];
++ struct iovec *ivc = &iov_copy[i];
++ *ivc = *iv;
++ if (i == 0) {
++ ivc->iov_len -= *offset;
++ ivc->iov_base += *offset;
++ }
++ if (ivc->iov_len >= size) {
++ ivc->iov_len = size;
++ if (i == 0)
++ *offset += size;
++ else
++ *offset = size;
++ break;
++ }
++ size -= ivc->iov_len;
++ }
++ *iov_out += i;
++ *nr_segs -= i;
++ *nrsegs_copy = i + 1;
++
++ return 0;
++}
++
++static int ll_reget_short_lock(struct page *page, int rw,
++ obd_off start, obd_off end,
++ void **cookie)
++{
++ struct ll_async_page *llap;
++ struct obd_export *exp;
++ struct inode *inode = page->mapping->host;
++
++ ENTRY;
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ RETURN(0);
++
++ llap = llap_cast_private(page);
++ if (llap == NULL)
++ RETURN(0);
++
++ RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
++ &llap->llap_cookie, rw, start, end,
++ cookie));
++}
++
++static void ll_release_short_lock(struct inode *inode, obd_off end,
++ void *cookie, int rw)
++{
++ struct obd_export *exp;
++ int rc;
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ return;
++
++ rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
++ cookie, rw);
++ if (rc < 0)
++ CERROR("unlock failed (%d)\n", rc);
++}
++
++static inline int ll_file_get_fast_lock(struct file *file,
++ obd_off ppos, obd_off end,
++ const struct iovec *iov,
++ unsigned long nr_segs,
++ void **cookie, int rw)
++{
++ int rc = 0, seg;
++ struct page *page;
++
++ ENTRY;
++
++ /* we would like this read request to be lock-free */
++ for (seg = 0; seg < nr_segs; seg++) {
++ const struct iovec *iv = &iov[seg];
++ if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
++ GOTO(out, rc);
++ }
++
++ page = find_lock_page(file->f_dentry->d_inode->i_mapping,
++ ppos >> CFS_PAGE_SHIFT);
++ if (page) {
++ if (ll_reget_short_lock(page, rw, ppos, end, cookie))
++ rc = 1;
++
++ unlock_page(page);
++ page_cache_release(page);
++ }
++
++out:
++ RETURN(rc);
++}
++
++static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
++ void *cookie, int rw)
++{
++ ll_release_short_lock(inode, end, cookie, rw);
++}
++
++enum ll_lock_style {
++ LL_LOCK_STYLE_NOLOCK = 0,
++ LL_LOCK_STYLE_FASTLOCK = 1,
++ LL_LOCK_STYLE_TREELOCK = 2
++};
++
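++/* Pick the cheapest locking style for this I/O: try a fast short lock
++ * piggy-backed on an already cached page first, then fall back to a
++ * full tree lock; a return of LL_LOCK_STYLE_NOLOCK tells the caller
++ * to use the lockless path. Returns an ll_lock_style or -errno. */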
++static inline int ll_file_get_lock(struct file *file, obd_off ppos,
++ obd_off end, const struct iovec *iov,
++ unsigned long nr_segs, void **cookie,
++ struct ll_lock_tree *tree, int rw)
++{
++ int rc;
++
++ ENTRY;
++
++ if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, cookie, rw))
++ RETURN(LL_LOCK_STYLE_FASTLOCK);
++
++ rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
++ ppos, end, rw);
++ /* rc: 1 for tree lock, 0 for no lock, <0 for error */
++ switch (rc) {
++ case 1:
++ RETURN(LL_LOCK_STYLE_TREELOCK);
++ case 0:
++ RETURN(LL_LOCK_STYLE_NOLOCK);
++ }
++
++ /* an error happened if we reached this point, rc = -errno here */
++ RETURN(rc);
++}
++
++static inline void ll_file_put_lock(struct inode *inode, obd_off end,
++ enum ll_lock_style lock_style,
++ void *cookie, struct ll_lock_tree *tree,
++ int rw)
++
++{
++ switch (lock_style) {
++ case LL_LOCK_STYLE_TREELOCK:
++ ll_tree_unlock(tree);
++ break;
++ case LL_LOCK_STYLE_FASTLOCK:
++ ll_file_put_fast_lock(inode, end, cookie, rw);
++ break;
++ default:
++ CERROR("invalid locking style (%d)\n", lock_style);
++ }
++}
++
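++/* Read from a file (through the page cache). Built as ll_file_readv
++ * or ll_file_aio_read depending on whether the kernel still provides
++ * the readv file operation (HAVE_FILE_READV). */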
++#ifdef HAVE_FILE_READV
++static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
++ unsigned long nr_segs, loff_t *ppos)
++{
++#else
++static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct file *file = iocb->ki_filp;
++ loff_t *ppos = &iocb->ki_pos;
++#endif
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ll_lock_tree tree;
++ struct ost_lvb lvb;
++ struct ll_ra_read bead;
++ int ra = 0;
++ obd_off end;
++ ssize_t retval, chunk, sum = 0;
++ int lock_style;
++ struct iovec *iov_copy = NULL;
++ unsigned long nrsegs_copy, nrsegs_orig = 0;
++ size_t count, iov_offset = 0;
++ __u64 kms;
++ void *cookie;
++ ENTRY;
++
++ count = ll_file_get_iov_count(iov, &nr_segs);
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
++ inode->i_ino, inode->i_generation, inode, count, *ppos);
++ /* "If nbyte is 0, read() will return 0 and have no other results."
++ * -- Single Unix Spec */
++ if (count == 0)
++ RETURN(0);
++
++ ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
++
++ if (!lsm) {
++ /* A read on a file with no objects should return zero-filled
++ * buffers up to the file size (we can get non-zero sizes with
++ * mknod + truncate and then opening the file for read; this
++ * seems to be a common pattern in the NFS case). Bug 6243 */
++ int notzeroed;
++ /* Since there are no objects on the OSTs, we have nothing to
++ * take a lock on and so are forced to access inode->i_size
++ * unguarded */
++
++ /* Read beyond end of file */
++ if (*ppos >= i_size_read(inode))
++ RETURN(0);
++
++ if (count > i_size_read(inode) - *ppos)
++ count = i_size_read(inode) - *ppos;
++ /* Make sure to correctly adjust the file position pointer for
++ * the EFAULT case */
++ for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
++ const struct iovec *iv = &iov[nrsegs_copy];
++
++ if (count < iv->iov_len)
++ chunk = count;
++ else
++ chunk = iv->iov_len;
++ notzeroed = clear_user(iv->iov_base, chunk);
++ sum += (chunk - notzeroed);
++ count -= (chunk - notzeroed);
++ if (notzeroed || !count)
++ break;
++ }
++ *ppos += sum;
++ if (!sum)
++ RETURN(-EFAULT);
++ RETURN(sum);
++ }
++
++repeat:
++ if (sbi->ll_max_rw_chunk != 0) {
++ /* first, find the end of the current stripe */
++ end = *ppos;
++ obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
++ (obd_off *)&end);
++
++ /* clamp the end if it is beyond the request */
++ if (end > *ppos + count - 1)
++ end = *ppos + count - 1;
++
++ /* and chunk shouldn't be too large even if striping is wide */
++ if (end - *ppos > sbi->ll_max_rw_chunk)
++ end = *ppos + sbi->ll_max_rw_chunk - 1;
++
++ chunk = end - *ppos + 1;
++ if ((count == chunk) && (iov_offset == 0)) {
++ if (iov_copy)
++ OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
++
++ iov_copy = (struct iovec *)iov;
++ nrsegs_copy = nr_segs;
++ } else {
++ if (!iov_copy) {
++ nrsegs_orig = nr_segs;
++ OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
++ if (!iov_copy)
++ GOTO(out, retval = -ENOMEM);
++ }
++
++ iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
++ &iov_offset, chunk);
++ }
++ } else {
++ end = *ppos + count - 1;
++ iov_copy = (struct iovec *)iov;
++ nrsegs_copy = nr_segs;
++ }
++
++ lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
++ iov_copy, nrsegs_copy, &cookie, &tree,
++ OBD_BRW_READ);
++ if (lock_style < 0)
++ GOTO(out, retval = lock_style);
++
++ ll_inode_size_lock(inode, 1);
++ /*
++ * Consistency guarantees: following possibilities exist for the
++ * relation between region being read and real file size at this
++ * moment:
++ *
++ * (A): the region is completely inside of the file;
++ *
++ * (B-x): x bytes of region are inside of the file, the rest is
++ * outside;
++ *
++ * (C): the region is completely outside of the file.
++ *
++ * This classification is stable under the DLM lock acquired by
++ * ll_tree_lock() above, because to change class another client has
++ * to take a DLM lock conflicting with our lock. Also, any updates to
++ * ->i_size by other threads on this client are serialized by
++ * ll_inode_size_lock(). This guarantees that short reads are handled
++ * correctly in the face of concurrent writes and truncates.
++ */
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
++ kms = lvb.lvb_size;
++ if (*ppos + count - 1 > kms) {
++ /* A glimpse is necessary to determine whether we return a
++ * short read (B) or some zeroes at the end of the buffer (C) */
++ ll_inode_size_unlock(inode, 1);
++ retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
++ if (retval) {
++ if (lock_style != LL_LOCK_STYLE_NOLOCK)
++ ll_file_put_lock(inode, end, lock_style,
++ cookie, &tree, OBD_BRW_READ);
++ goto out;
++ }
++ } else {
++ /* region is within kms and, hence, within real file size (A).
++ * We need to increase i_size to cover the read region so that
++ * generic_file_read() will do its job, but that doesn't mean
++ * the kms size is _correct_, it is only the _minimum_ size.
++ * If someone does a stat they will get the correct size which
++ * will always be >= the kms value here. b=11081 */
++ if (i_size_read(inode) < kms)
++ i_size_write(inode, kms);
++ ll_inode_size_unlock(inode, 1);
++ }
++
++ chunk = end - *ppos + 1;
++ CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
++ inode->i_ino, chunk, *ppos, i_size_read(inode));
++
++ /* turn off the kernel's read-ahead */
++ if (lock_style != LL_LOCK_STYLE_NOLOCK) {
++ file->f_ra.ra_pages = 0;
++ /* initialize read-ahead window once per syscall */
++ if (ra == 0) {
++ ra = 1;
++ bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
++ bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
++ ll_ra_read_in(file, &bead);
++ }
++
++ /* BUG: 5972 */
++ file_accessed(file);
++#ifdef HAVE_FILE_READV
++ retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
++#else
++ retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
++ *ppos);
++#endif
++ ll_file_put_lock(inode, end, lock_style, cookie,
++ &tree, OBD_BRW_READ);
++ } else {
++ retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy, ppos,
++ READ, chunk);
++ }
++ ll_rw_stats_tally(sbi, current->pid, file, count, 0);
++ if (retval > 0) {
++ count -= retval;
++ sum += retval;
++ if (retval == chunk && count > 0)
++ goto repeat;
++ }
++
++ out:
++ if (ra != 0)
++ ll_ra_read_ex(file, &bead);
++ retval = (sum > 0) ? sum : retval;
++
++ if (iov_copy && iov_copy != iov)
++ OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
++
++ RETURN(retval);
++}
++
++static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
++ loff_t *ppos)
++{
++ struct iovec local_iov = { .iov_base = (void __user *)buf,
++ .iov_len = count };
++#ifdef HAVE_FILE_READV
++ return ll_file_readv(file, &local_iov, 1, ppos);
++#else
++ struct kiocb kiocb;
++ ssize_t ret;
++
++ init_sync_kiocb(&kiocb, file);
++ kiocb.ki_pos = *ppos;
++ kiocb.ki_left = count;
++
++ ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
++ *ppos = kiocb.ki_pos;
++ return ret;
++#endif
++}
++
++/*
++ * Write to a file (through the page cache).
++ */
++#ifdef HAVE_FILE_WRITEV
++static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
++ unsigned long nr_segs, loff_t *ppos)
++{
++#else /* AIO stuff */
++static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
++ unsigned long nr_segs, loff_t pos)
++{
++ struct file *file = iocb->ki_filp;
++ loff_t *ppos = &iocb->ki_pos;
++#endif
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
++ struct ll_lock_tree tree;
++ loff_t maxbytes = ll_file_maxbytes(inode);
++ loff_t lock_start, lock_end, end;
++ ssize_t retval, chunk, sum = 0;
++ int tree_locked;
++ struct iovec *iov_copy = NULL;
++ unsigned long nrsegs_copy, nrsegs_orig = 0;
++ size_t count, iov_offset = 0;
++ ENTRY;
++
++ count = ll_file_get_iov_count(iov, &nr_segs);
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
++ inode->i_ino, inode->i_generation, inode, count, *ppos);
++
++ SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
++
++ /* POSIX, but surprised the VFS doesn't check this already */
++ if (count == 0)
++ RETURN(0);
++
++ /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
++ * called on the file, don't fail the below assertion (bug 2388). */
++ if (file->f_flags & O_LOV_DELAY_CREATE &&
++ ll_i2info(inode)->lli_smd == NULL)
++ RETURN(-EBADF);
++
++ LASSERT(ll_i2info(inode)->lli_smd != NULL);
++
++ down(&ll_i2info(inode)->lli_write_sem);
++
++repeat:
++ chunk = 0; /* just to fix gcc's warning */
++ end = *ppos + count - 1;
++
++ if (file->f_flags & O_APPEND) {
++ lock_start = 0;
++ lock_end = OBD_OBJECT_EOF;
++ iov_copy = (struct iovec *)iov;
++ nrsegs_copy = nr_segs;
++ } else if (sbi->ll_max_rw_chunk != 0) {
++ /* first, find the end of the current stripe */
++ end = *ppos;
++ obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
++ (obd_off *)&end);
++
++ /* clamp the end if it is beyond the request */
++ if (end > *ppos + count - 1)
++ end = *ppos + count - 1;
++
++ /* and chunk shouldn't be too large even if striping is wide */
++ if (end - *ppos > sbi->ll_max_rw_chunk)
++ end = *ppos + sbi->ll_max_rw_chunk - 1;
++ lock_start = *ppos;
++ lock_end = end;
++ chunk = end - *ppos + 1;
++ if ((count == chunk) && (iov_offset == 0)) {
++ if (iov_copy)
++ OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
++
++ iov_copy = (struct iovec *)iov;
++ nrsegs_copy = nr_segs;
++ } else {
++ if (!iov_copy) {
++ nrsegs_orig = nr_segs;
++ OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
++ if (!iov_copy)
++ GOTO(out, retval = -ENOMEM);
++ }
++ iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
++ &iov_offset, chunk);
++ }
++ } else {
++ lock_start = *ppos;
++ lock_end = end;
++ iov_copy = (struct iovec *)iov;
++ nrsegs_copy = nr_segs;
++ }
++
++ tree_locked = ll_file_get_tree_lock_iov(&tree, file, iov_copy,
++ nrsegs_copy,
++ (obd_off)lock_start,
++ (obd_off)lock_end,
++ OBD_BRW_WRITE);
++ if (tree_locked < 0)
++ GOTO(out, retval = tree_locked);
++
++ /* This is OK; generic_file_write() will overwrite this under i_sem
++ * if it races with a local truncate, it just makes our maxbytes
++ * checking easier. The i_size value gets updated in ll_extent_lock()
++ * as a consequence of the [0,EOF] extent lock we requested above. */
++ if (file->f_flags & O_APPEND) {
++ *ppos = i_size_read(inode);
++ end = *ppos + count - 1;
++ }
++
++ if (*ppos >= maxbytes) {
++ send_sig(SIGXFSZ, current, 0);
++ GOTO(out_unlock, retval = -EFBIG);
++ }
++ if (end > maxbytes - 1)
++ end = maxbytes - 1;
++
++ /* generic_file_write handles O_APPEND after getting i_mutex */
++ chunk = end - *ppos + 1;
++ CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
++ inode->i_ino, chunk, *ppos);
++ if (tree_locked)
++#ifdef HAVE_FILE_WRITEV
++ retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
++#else
++ retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
++ *ppos);
++#endif
++ else
++ retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy,
++ ppos, WRITE, chunk);
++ ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
++
++out_unlock:
++ if (tree_locked)
++ ll_tree_unlock(&tree);
++
++out:
++ if (retval > 0) {
++ count -= retval;
++ sum += retval;
++ if (retval == chunk && count > 0)
++ goto repeat;
++ }
++
++ up(&ll_i2info(inode)->lli_write_sem);
++
++ if (iov_copy && iov_copy != iov)
++ OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
++
++ retval = (sum > 0) ? sum : retval;
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
++ retval > 0 ? retval : 0);
++ RETURN(retval);
++}
++
++static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
++ loff_t *ppos)
++{
++ struct iovec local_iov = { .iov_base = (void __user *)buf,
++ .iov_len = count };
++
++#ifdef HAVE_FILE_WRITEV
++ return ll_file_writev(file, &local_iov, 1, ppos);
++#else
++ struct kiocb kiocb;
++ ssize_t ret;
++
++ init_sync_kiocb(&kiocb, file);
++ kiocb.ki_pos = *ppos;
++ kiocb.ki_left = count;
++
++ ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
++ *ppos = kiocb.ki_pos;
++
++ return ret;
++#endif
++}
++
++/*
++ * Send file content (through pagecache) somewhere with helper
++ */
++static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
++ read_actor_t actor, void *target)
++{
++ struct inode *inode = in_file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct ll_lock_tree tree;
++ struct ll_lock_tree_node *node;
++ struct ost_lvb lvb;
++ struct ll_ra_read bead;
++ int rc;
++ ssize_t retval;
++ __u64 kms;
++ ENTRY;
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
++ inode->i_ino, inode->i_generation, inode, count, *ppos);
++
++ /* "If nbyte is 0, read() will return 0 and have no other results."
++ * -- Single Unix Spec */
++ if (count == 0)
++ RETURN(0);
++
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
++ /* turn off the kernel's read-ahead */
++ in_file->f_ra.ra_pages = 0;
++
++ /* File with no objects, nothing to lock */
++ if (!lsm)
++ RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
++
++ node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
++ if (IS_ERR(node))
++ RETURN(PTR_ERR(node));
++
++ tree.lt_fd = LUSTRE_FPRIVATE(in_file);
++ rc = ll_tree_lock(&tree, node, NULL, count,
++ in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
++ if (rc != 0)
++ RETURN(rc);
++
++ ll_clear_file_contended(inode);
++ ll_inode_size_lock(inode, 1);
++ /*
++ * Consistency guarantees: following possibilities exist for the
++ * relation between region being read and real file size at this
++ * moment:
++ *
++ * (A): the region is completely inside of the file;
++ *
++ * (B-x): x bytes of region are inside of the file, the rest is
++ * outside;
++ *
++ * (C): the region is completely outside of the file.
++ *
++ * This classification is stable under the DLM lock acquired by
++ * ll_tree_lock() above, because to change class another client has
++ * to take a DLM lock conflicting with our lock. Also, any updates to
++ * ->i_size by other threads on this client are serialized by
++ * ll_inode_size_lock(). This guarantees that short reads are handled
++ * correctly in the face of concurrent writes and truncates.
++ */
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
++ kms = lvb.lvb_size;
++ if (*ppos + count - 1 > kms) {
++ /* A glimpse is necessary to determine whether we return a
++ * short read (B) or some zeroes at the end of the buffer (C) */
++ ll_inode_size_unlock(inode, 1);
++ retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
++ if (retval)
++ goto out;
++ } else {
++ /* region is within kms and, hence, within real file size (A) */
++ i_size_write(inode, kms);
++ ll_inode_size_unlock(inode, 1);
++ }
++
++ CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
++ inode->i_ino, count, *ppos, i_size_read(inode));
++
++ bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
++ bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
++ ll_ra_read_in(in_file, &bead);
++ /* BUG: 5972 */
++ file_accessed(in_file);
++ retval = generic_file_sendfile(in_file, ppos, count, actor, target);
++ ll_ra_read_ex(in_file, &bead);
++
++ out:
++ ll_tree_unlock(&tree);
++ RETURN(retval);
++}
++
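++/* LL_IOC_RECREATE_OBJ: privileged ioctl that asks the OSC to recreate
++ * an OST object for this file with the given object id and OST index
++ * (OBD_FL_RECREATE_OBJS), based on the existing stripe metadata. */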
++static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
++ unsigned long arg)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct obd_export *exp = ll_i2obdexp(inode);
++ struct ll_recreate_obj ucreatp;
++ struct obd_trans_info oti = { 0 };
++ struct obdo *oa = NULL;
++ int lsm_size;
++ int rc = 0;
++ struct lov_stripe_md *lsm, *lsm2;
++ ENTRY;
++
++ if (!cfs_capable(CFS_CAP_SYS_ADMIN))
++ RETURN(-EPERM);
++
++ rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
++ sizeof(struct ll_recreate_obj));
++ if (rc) {
++ RETURN(-EFAULT);
++ }
++ OBDO_ALLOC(oa);
++ if (oa == NULL)
++ RETURN(-ENOMEM);
++
++ down(&lli->lli_size_sem);
++ lsm = lli->lli_smd;
++ if (lsm == NULL)
++ GOTO(out, rc = -ENOENT);
++ lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
++ (lsm->lsm_stripe_count));
++
++ OBD_ALLOC(lsm2, lsm_size);
++ if (lsm2 == NULL)
++ GOTO(out, rc = -ENOMEM);
++
++ oa->o_id = ucreatp.lrc_id;
++ oa->o_nlink = ucreatp.lrc_ost_idx;
++ oa->o_flags |= OBD_FL_RECREATE_OBJS;
++ oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
++ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
++ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
++
++ memcpy(lsm2, lsm, lsm_size);
++ rc = obd_create(exp, oa, &lsm2, &oti);
++
++ OBD_FREE(lsm2, lsm_size);
++ GOTO(out, rc);
++out:
++ up(&lli->lli_size_sem);
++ OBDO_FREE(oa);
++ return rc;
++}
++
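++/* Set the striping EA by opening the file with the supplied
++ * lov_user_md attached to the open intent; fails with -EEXIST if
++ * stripe objects already exist for this inode. */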
++int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
++ int flags, struct lov_user_md *lum,
++ int lum_size)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm;
++ struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
++ int rc = 0;
++ ENTRY;
++
++ down(&lli->lli_size_sem);
++ lsm = lli->lli_smd;
++ if (lsm) {
++ up(&lli->lli_size_sem);
++ CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
++ inode->i_ino);
++ RETURN(-EEXIST);
++ }
++
++ rc = ll_intent_file_open(file, lum, lum_size, &oit);
++ if (rc)
++ GOTO(out, rc);
++ if (it_disposition(&oit, DISP_LOOKUP_NEG))
++ GOTO(out_req_free, rc = -ENOENT);
++ rc = oit.d.lustre.it_status;
++ if (rc < 0)
++ GOTO(out_req_free, rc);
++
++ ll_release_openhandle(file->f_dentry, &oit);
++
++ out:
++ up(&lli->lli_size_sem);
++ ll_intent_release(&oit);
++ RETURN(rc);
++out_req_free:
++ ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
++ goto out;
++}
++
++int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
++ struct lov_mds_md **lmmp, int *lmm_size,
++ struct ptlrpc_request **request)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ll_fid fid;
++ struct mds_body *body;
++ struct lov_mds_md *lmm = NULL;
++ struct ptlrpc_request *req = NULL;
++ int rc, lmmsize;
++
++ ll_inode2fid(&fid, inode);
++
++ rc = ll_get_max_mdsize(sbi, &lmmsize);
++ if (rc)
++ RETURN(rc);
++
++ rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
++ filename, strlen(filename) + 1,
++ OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
++ lmmsize, &req);
++ if (rc < 0) {
++ CDEBUG(D_INFO, "mdc_getattr_name failed "
++ "on %s: rc %d\n", filename, rc);
++ GOTO(out, rc);
++ }
++
++ body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
++ sizeof(*body));
++ LASSERT(body != NULL); /* checked by mdc_getattr_name */
++ /* swabbed by mdc_getattr_name */
++ LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
++
++ lmmsize = body->eadatasize;
++
++ if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
++ lmmsize == 0) {
++ GOTO(out, rc = -ENODATA);
++ }
++
++ lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
++ lmmsize);
++ LASSERT(lmm != NULL);
++ LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
++
++ if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC)) &&
++ (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
++ GOTO(out, rc = -EPROTO);
++ }
++ /*
++ * This is coming from the MDS, so is probably in
++ * little endian. We convert it to host endian before
++ * passing it to userspace.
++ */
++ if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
++ if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC)) {
++ lustre_swab_lov_user_md((struct lov_user_md *)lmm);
++ /* if the function was called for a directory, avoid
++ * swabbing non-existent lsm objects */
++ if (S_ISREG(body->mode))
++ lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
++ } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
++ lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
++ }
++ }
++
++ if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
++ struct lov_stripe_md *lsm;
++ struct lov_user_md_join *lmj;
++ int lmj_size, i, aindex = 0;
++
++ rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
++ if (rc < 0)
++ GOTO(out, rc = -ENOMEM);
++ rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
++ if (rc)
++ GOTO(out_free_memmd, rc);
++
++ lmj_size = sizeof(struct lov_user_md_join) +
++ lsm->lsm_stripe_count *
++ sizeof(struct lov_user_ost_data_join);
++ OBD_ALLOC(lmj, lmj_size);
++ if (!lmj)
++ GOTO(out_free_memmd, rc = -ENOMEM);
++
++ memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
++ for (i = 0; i < lsm->lsm_stripe_count; i++) {
++ struct lov_extent *lex =
++ &lsm->lsm_array->lai_ext_array[aindex];
++
++ if (lex->le_loi_idx + lex->le_stripe_count <= i)
++ aindex ++;
++ CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
++ LPU64" len %d\n", aindex, i,
++ lex->le_start, (int)lex->le_len);
++ lmj->lmm_objects[i].l_extent_start =
++ lex->le_start;
++
++ if ((int)lex->le_len == -1)
++ lmj->lmm_objects[i].l_extent_end = -1;
++ else
++ lmj->lmm_objects[i].l_extent_end =
++ lex->le_start + lex->le_len;
++ lmj->lmm_objects[i].l_object_id =
++ lsm->lsm_oinfo[i]->loi_id;
++ lmj->lmm_objects[i].l_object_gr =
++ lsm->lsm_oinfo[i]->loi_gr;
++ lmj->lmm_objects[i].l_ost_gen =
++ lsm->lsm_oinfo[i]->loi_ost_gen;
++ lmj->lmm_objects[i].l_ost_idx =
++ lsm->lsm_oinfo[i]->loi_ost_idx;
++ }
++ lmm = (struct lov_mds_md *)lmj;
++ lmmsize = lmj_size;
++out_free_memmd:
++ obd_free_memmd(sbi->ll_osc_exp, &lsm);
++ }
++out:
++ *lmmp = lmm;
++ *lmm_size = lmmsize;
++ *request = req;
++ return rc;
++}
++
++static int ll_lov_setea(struct inode *inode, struct file *file,
++ unsigned long arg)
++{
++ int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
++ struct lov_user_md *lump;
++ int lum_size = sizeof(struct lov_user_md) +
++ sizeof(struct lov_user_ost_data);
++ int rc;
++ ENTRY;
++
++ if (!cfs_capable(CFS_CAP_SYS_ADMIN))
++ RETURN(-EPERM);
++
++ OBD_ALLOC(lump, lum_size);
++ if (lump == NULL) {
++ RETURN(-ENOMEM);
++ }
++ rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
++ if (rc) {
++ OBD_FREE(lump, lum_size);
++ RETURN(-EFAULT);
++ }
++
++ rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
++
++ OBD_FREE(lump, lum_size);
++ RETURN(rc);
++}
++
++static int ll_lov_setstripe(struct inode *inode, struct file *file,
++ unsigned long arg)
++{
++ struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
++ int rc;
++ int flags = FMODE_WRITE;
++ ENTRY;
++
++ /* Bug 1152: copy properly when this is no longer true */
++ LASSERT(sizeof(lum) == sizeof(*lump));
++ LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
++ rc = copy_from_user(&lum, lump, sizeof(lum));
++ if (rc)
++ RETURN(-EFAULT);
++
++ rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
++ if (rc == 0) {
++ put_user(0, &lump->lmm_stripe_count);
++ rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
++ 0, ll_i2info(inode)->lli_smd, lump);
++ }
++ RETURN(rc);
++}
++
++static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
++{
++ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
++
++ if (!lsm)
++ RETURN(-ENODATA);
++
++ return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
++ (void *)arg);
++}
++
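++/* Take a [0, EOF] group extent lock with the given group id and stash
++ * its handle in the file descriptor; while held, ordinary extent
++ * locking is bypassed via LL_FILE_IGNORE_LOCK. */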
++static int ll_get_grouplock(struct inode *inode, struct file *file,
++ unsigned long arg)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ ldlm_policy_data_t policy = { .l_extent = { .start = 0,
++ .end = OBD_OBJECT_EOF}};
++ struct lustre_handle lockh = { 0 };
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ int flags = 0, rc;
++ ENTRY;
++
++ if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
++ RETURN(-EINVAL);
++ }
++
++ policy.l_extent.gid = arg;
++ if (file->f_flags & O_NONBLOCK)
++ flags = LDLM_FL_BLOCK_NOWAIT;
++
++ rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
++ if (rc)
++ RETURN(rc);
++
++ fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
++ fd->fd_gid = arg;
++ memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
++
++ RETURN(0);
++}
++
++static int ll_put_grouplock(struct inode *inode, struct file *file,
++ unsigned long arg)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ int rc;
++ ENTRY;
++
++ if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
++ /* Ugh, it's already unlocked. */
++ RETURN(-EINVAL);
++ }
++
++ if (fd->fd_gid != arg) /* Ugh? Unlocking with a different gid? */
++ RETURN(-EINVAL);
++
++ fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
++
++ rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
++ if (rc)
++ RETURN(rc);
++
++ fd->fd_gid = 0;
++ memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
++
++ RETURN(0);
++}
++
++#if LUSTRE_FIX >= 50
++static int join_sanity_check(struct inode *head, struct inode *tail)
++{
++ ENTRY;
++ if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
++ CERROR("server do not support join \n");
++ RETURN(-EINVAL);
++ }
++ if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
++ CERROR("tail ino %lu and ino head %lu must be regular\n",
++ head->i_ino, tail->i_ino);
++ RETURN(-EINVAL);
++ }
++ if (head->i_ino == tail->i_ino) {
++ CERROR("file %lu can not be joined to itself \n", head->i_ino);
++ RETURN(-EINVAL);
++ }
++ if (i_size_read(head) % JOIN_FILE_ALIGN) {
++ CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
++ RETURN(-EINVAL);
++ }
++ RETURN(0);
++}
++
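++/* Ask the MDS to join the tail file onto the head file by sending an
++ * open intent with O_JOIN_FILE; the head's current size is passed in
++ * the op data so the server knows where the tail's data begins. */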
++static int join_file(struct inode *head_inode, struct file *head_filp,
++ struct file *tail_filp)
++{
++ struct dentry *tail_dentry = tail_filp->f_dentry;
++ struct lookup_intent oit = {.it_op = IT_OPEN,
++ .it_flags = head_filp->f_flags|O_JOIN_FILE};
++ struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
++ ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
++
++ struct lustre_handle lockh;
++ struct mdc_op_data *op_data;
++ int rc;
++ loff_t data;
++ ENTRY;
++
++ tail_dentry = tail_filp->f_dentry;
++
++ OBD_ALLOC_PTR(op_data);
++ if (op_data == NULL) {
++ RETURN(-ENOMEM);
++ }
++
++ data = i_size_read(head_inode);
++ ll_prepare_mdc_op_data(op_data, head_inode,
++ tail_dentry->d_parent->d_inode,
++ tail_dentry->d_name.name,
++ tail_dentry->d_name.len, 0, &data);
++ rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
++ op_data, &lockh, NULL, 0, 0);
++
++ if (rc < 0)
++ GOTO(out, rc);
++
++ rc = oit.d.lustre.it_status;
++
++ if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
++ rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
++ ptlrpc_req_finished((struct ptlrpc_request *)
++ oit.d.lustre.it_data);
++ GOTO(out, rc);
++ }
++
++ if (oit.d.lustre.it_lock_mode) { /* If we got a lock, release
++ * it right away */
++ ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
++ oit.d.lustre.it_lock_mode = 0;
++ }
++ ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
++ it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
++ ll_release_openhandle(head_filp->f_dentry, &oit);
++out:
++ if (op_data)
++ OBD_FREE_PTR(op_data);
++ ll_intent_release(&oit);
++ RETURN(rc);
++}
++
++static int ll_file_join(struct inode *head, struct file *filp,
++ char *filename_tail)
++{
++ struct inode *tail = NULL, *first = NULL, *second = NULL;
++ struct dentry *tail_dentry;
++ struct file *tail_filp, *first_filp, *second_filp;
++ struct ll_lock_tree first_tree, second_tree;
++ struct ll_lock_tree_node *first_node, *second_node;
++ struct ll_inode_info *hlli = ll_i2info(head), *tlli;
++ int rc = 0, cleanup_phase = 0;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
++ head->i_ino, head->i_generation, head, filename_tail);
++
++ tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
++ if (IS_ERR(tail_filp)) {
++ CERROR("Can not open tail file %s", filename_tail);
++ rc = PTR_ERR(tail_filp);
++ GOTO(cleanup, rc);
++ }
++ tail = igrab(tail_filp->f_dentry->d_inode);
++
++ tlli = ll_i2info(tail);
++ tail_dentry = tail_filp->f_dentry;
++ LASSERT(tail_dentry);
++ cleanup_phase = 1;
++
++ /* reorder the inodes for lock ordering */
++ first = head->i_ino > tail->i_ino ? head : tail;
++ second = head->i_ino > tail->i_ino ? tail : head;
++ first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
++ second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
++
++ CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
++ head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
++ first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
++ if (IS_ERR(first_node)){
++ rc = PTR_ERR(first_node);
++ GOTO(cleanup, rc);
++ }
++ first_tree.lt_fd = first_filp->private_data;
++ rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
++ if (rc != 0)
++ GOTO(cleanup, rc);
++ cleanup_phase = 2;
++
++ second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
++ if (IS_ERR(second_node)){
++ rc = PTR_ERR(second_node);
++ GOTO(cleanup, rc);
++ }
++ second_tree.lt_fd = second_filp->private_data;
++ rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
++ if (rc != 0)
++ GOTO(cleanup, rc);
++ cleanup_phase = 3;
++
++ rc = join_sanity_check(head, tail);
++ if (rc)
++ GOTO(cleanup, rc);
++
++ rc = join_file(head, filp, tail_filp);
++ if (rc)
++ GOTO(cleanup, rc);
++cleanup:
++ switch (cleanup_phase) {
++ case 3:
++ ll_tree_unlock(&second_tree);
++ obd_cancel_unused(ll_i2obdexp(second),
++ ll_i2info(second)->lli_smd, 0, NULL);
++ case 2:
++ ll_tree_unlock(&first_tree);
++ obd_cancel_unused(ll_i2obdexp(first),
++ ll_i2info(first)->lli_smd, 0, NULL);
++ case 1:
++ filp_close(tail_filp, 0);
++ if (tail)
++ iput(tail);
++ if (head && rc == 0) {
++ obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
++ &hlli->lli_smd);
++ hlli->lli_smd = NULL;
++ }
++ case 0:
++ break;
++ default:
++ CERROR("invalid cleanup_phase %d\n", cleanup_phase);
++ LBUG();
++ }
++ RETURN(rc);
++}
++#endif /* LUSTRE_FIX >= 50 */
++
++/**
++ * Close inode open handle
++ *
++ * \param dentry [in] dentry which contains the inode
++ * \param it [in,out] intent which contains open info and result
++ *
++ * \retval 0 success
++ * \retval <0 failure
++ */
++int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
++{
++ struct inode *inode = dentry->d_inode;
++ struct obd_client_handle *och;
++ int rc;
++ ENTRY;
++
++ LASSERT(inode);
++
++ /* Root? Do nothing. */
++ if (dentry->d_inode->i_sb->s_root == dentry)
++ RETURN(0);
++
++ /* No open handle to close? Move away */
++ if (!it_disposition(it, DISP_OPEN_OPEN))
++ RETURN(0);
++
++ LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
++
++ OBD_ALLOC(och, sizeof(*och));
++ if (!och)
++ GOTO(out, rc = -ENOMEM);
++
++ ll_och_fill(ll_i2info(inode), it, och);
++
++ rc = ll_close_inode_openhandle(inode, och);
++
++ OBD_FREE(och, sizeof(*och));
++ out:
++ /* this one is in place of ll_file_open */
++ if (it_disposition(it, DISP_ENQ_OPEN_REF))
++ ptlrpc_req_finished(it->d.lustre.it_data);
++ it_clear_disposition(it, DISP_ENQ_OPEN_REF);
++ RETURN(rc);
++}
++
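++/* Fill in @fiemap for this inode by forwarding the request to the OSC
++ * via obd_get_info(KEY_FIEMAP); @num_bytes is the size of the whole
++ * fiemap buffer, including the preallocated extent array. */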
++int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
++ int num_bytes)
++{
++ struct obd_export *exp = ll_i2obdexp(inode);
++ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
++ struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
++ int vallen = num_bytes;
++ int rc;
++ ENTRY;
++
++ /* If the stripe_count > 1 and the application does not understand
++ * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
++ */
++ if (lsm->lsm_stripe_count > 1 &&
++ !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
++ return -EOPNOTSUPP;
++
++ fm_key.oa.o_id = lsm->lsm_object_id;
++ fm_key.oa.o_valid = OBD_MD_FLID;
++
++ obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
++
++ /* If the file size is 0, there are no objects to map */
++ if (fm_key.oa.o_size == 0) {
++ fiemap->fm_mapped_extents = 0;
++ RETURN(0);
++ }
++
++ memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
++
++ rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
++ if (rc)
++ CERROR("obd_get_info failed: rc = %d\n", rc);
++
++ RETURN(rc);
++}
++
++int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ int flags;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
++ inode->i_generation, inode, cmd);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
++
++ /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
++ if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
++ RETURN(-ENOTTY);
++
++ switch(cmd) {
++ case LL_IOC_GETFLAGS:
++ /* Get the current value of the file flags */
++ return put_user(fd->fd_flags, (int *)arg);
++ case LL_IOC_SETFLAGS:
++ case LL_IOC_CLRFLAGS:
++ /* Set or clear specific file flags */
++ /* XXX This probably needs checks to ensure the flags are
++ * not abused, and to handle any flag side effects.
++ */
++ if (get_user(flags, (int *) arg))
++ RETURN(-EFAULT);
++
++ if (cmd == LL_IOC_SETFLAGS) {
++ if ((flags & LL_FILE_IGNORE_LOCK) &&
++ !(file->f_flags & O_DIRECT)) {
++ CERROR("%s: unable to disable locking on "
++ "non-O_DIRECT file\n", current->comm);
++ RETURN(-EINVAL);
++ }
++
++ fd->fd_flags |= flags;
++ } else {
++ fd->fd_flags &= ~flags;
++ }
++ RETURN(0);
++ case LL_IOC_LOV_SETSTRIPE:
++ RETURN(ll_lov_setstripe(inode, file, arg));
++ case LL_IOC_LOV_SETEA:
++ RETURN(ll_lov_setea(inode, file, arg));
++ case LL_IOC_LOV_GETSTRIPE:
++ RETURN(ll_lov_getstripe(inode, arg));
++ case LL_IOC_RECREATE_OBJ:
++ RETURN(ll_lov_recreate_obj(inode, file, arg));
++ case EXT3_IOC_FIEMAP: {
++ struct ll_user_fiemap *fiemap_s;
++ size_t num_bytes, ret_bytes;
++ unsigned int extent_count;
++ int rc = 0;
++
++ /* Get the extent count so we can calculate the size of
++ * required fiemap buffer */
++ if (get_user(extent_count,
++ &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
++ RETURN(-EFAULT);
++ num_bytes = sizeof(*fiemap_s) + (extent_count *
++ sizeof(struct ll_fiemap_extent));
++ OBD_VMALLOC(fiemap_s, num_bytes);
++ if (fiemap_s == NULL)
++ RETURN(-ENOMEM);
++
++ if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
++ sizeof(*fiemap_s)))
++ GOTO(error, rc = -EFAULT);
++
++ if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
++ fiemap_s->fm_flags = fiemap_s->fm_flags &
++ ~LUSTRE_FIEMAP_FLAGS_COMPAT;
++ if (copy_to_user((char *)arg, fiemap_s,
++ sizeof(*fiemap_s)))
++ GOTO(error, rc = -EFAULT);
++
++ GOTO(error, rc = -EBADR);
++ }
++
++ /* If fm_extent_count is non-zero, read the first extent, since
++ * it carries the end offset and device from the previous
++ * fiemap call. */
++ if (extent_count) {
++ if (copy_from_user(&fiemap_s->fm_extents[0],
++ (char __user *)arg + sizeof(*fiemap_s),
++ sizeof(struct ll_fiemap_extent)))
++ GOTO(error, rc = -EFAULT);
++ }
++
++ if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
++ int rc;
++
++ rc = filemap_fdatawrite(inode->i_mapping);
++ if (rc)
++ GOTO(error, rc);
++ }
++
++ rc = ll_fiemap(inode, fiemap_s, num_bytes);
++ if (rc)
++ GOTO(error, rc);
++
++ ret_bytes = sizeof(struct ll_user_fiemap);
++
++ if (extent_count != 0)
++ ret_bytes += (fiemap_s->fm_mapped_extents *
++ sizeof(struct ll_fiemap_extent));
++
++ if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
++ rc = -EFAULT;
++
++error:
++ OBD_VFREE(fiemap_s, num_bytes);
++ RETURN(rc);
++ }
++ case EXT3_IOC_GETFLAGS:
++ case EXT3_IOC_SETFLAGS:
++ RETURN(ll_iocontrol(inode, file, cmd, arg));
++ case EXT3_IOC_GETVERSION_OLD:
++ case EXT3_IOC_GETVERSION:
++ RETURN(put_user(inode->i_generation, (int *)arg));
++ case LL_IOC_JOIN: {
++#if LUSTRE_FIX >= 50
++ /* Allow file join in beta builds for debugging */
++ char *ftail;
++ int rc;
++
++ ftail = getname((const char *)arg);
++ if (IS_ERR(ftail))
++ RETURN(PTR_ERR(ftail));
++ rc = ll_file_join(inode, file, ftail);
++ putname(ftail);
++ RETURN(rc);
++#else
++ CWARN("file join is not supported in this version of Lustre\n");
++ RETURN(-ENOTTY);
++#endif
++ }
++ case LL_IOC_GROUP_LOCK:
++ RETURN(ll_get_grouplock(inode, file, arg));
++ case LL_IOC_GROUP_UNLOCK:
++ RETURN(ll_put_grouplock(inode, file, arg));
++ case IOC_OBD_STATFS:
++ RETURN(ll_obd_statfs(inode, (void *)arg));
++ case OBD_IOC_GETNAME_OLD:
++ case OBD_IOC_GETNAME: {
++ struct obd_device *obd =
++ class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
++ if (!obd)
++ RETURN(-EFAULT);
++ if (copy_to_user((void *)arg, obd->obd_name,
++ strlen(obd->obd_name) + 1))
++ RETURN (-EFAULT);
++ RETURN(0);
++ }
++
++ /* We need to special case any other ioctls we want to handle,
++ * to send them to the MDS/OST as appropriate and to properly
++ * network encode the arg field.
++ case EXT3_IOC_SETVERSION_OLD:
++ case EXT3_IOC_SETVERSION:
++ */
++ default: {
++ int err;
++
++ if (LLIOC_STOP ==
++ ll_iocontrol_call(inode, file, cmd, arg, &err))
++ RETURN(err);
++
++ RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
++ (void *)arg));
++ }
++ }
++}
++
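For context, the EXT3_IOC_FIEMAP case above implements the usual two-call FIEMAP protocol: a first call with fm_extent_count == 0 only fills in fm_mapped_extents, and a second call with a buffer sized for that many extents fetches them. Below is a minimal userspace sketch of that protocol against the handler above; the helper name is hypothetical, error handling is abbreviated, and struct ll_user_fiemap plus the flag/ioctl definitions are assumed to come from lustre/lustre_user.h.

/* Hypothetical helper: print the extents of a (possibly striped) file
 * using the two-call protocol served by ll_file_ioctl() above.
 * FIEMAP_FLAG_DEVICE_ORDER is set because ll_fiemap() above rejects
 * multi-stripe files without it. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>   /* assumed location of the definitions */

int dump_extents(const char *path)
{
        struct ll_user_fiemap *fm;
        void *tmp;
        unsigned int i, n;
        int rc = -1, fd = open(path, O_RDONLY);

        if (fd < 0)
                return -1;
        fm = calloc(1, sizeof(*fm));
        if (fm == NULL)
                goto out;

        /* call 1: fm_extent_count == 0, the handler only returns the count */
        fm->fm_length = ~0ULL;                   /* map the whole file */
        fm->fm_flags = FIEMAP_FLAG_DEVICE_ORDER; /* required if stripes > 1 */
        if (ioctl(fd, EXT3_IOC_FIEMAP, fm) < 0)
                goto out;
        n = fm->fm_mapped_extents;

        /* call 2: allocate room for n extents and fetch them */
        tmp = realloc(fm, sizeof(*fm) + n * sizeof(struct ll_fiemap_extent));
        if (tmp == NULL)
                goto out;
        fm = tmp;
        memset(fm->fm_extents, 0, n * sizeof(struct ll_fiemap_extent));
        fm->fm_extent_count = n;
        if (ioctl(fd, EXT3_IOC_FIEMAP, fm) < 0)
                goto out;

        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("extent %u: logical=%llu len=%llu\n", i,
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_length);
        rc = 0;
out:
        free(fm);
        close(fd);
        return rc;
}
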
++loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
++{
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ loff_t retval;
++ ENTRY;
++ retval = offset + ((origin == 2) ? i_size_read(inode) :
++ (origin == 1) ? file->f_pos : 0);
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
++ inode->i_ino, inode->i_generation, inode, retval, retval,
++ origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
++
++ if (origin == 2) { /* SEEK_END */
++ int nonblock = 0, rc;
++
++ if (file->f_flags & O_NONBLOCK)
++ nonblock = LDLM_FL_BLOCK_NOWAIT;
++
++ if (lsm != NULL) {
++ rc = ll_glimpse_size(inode, nonblock);
++ if (rc != 0)
++ RETURN(rc);
++ }
++
++ ll_inode_size_lock(inode, 0);
++ offset += i_size_read(inode);
++ ll_inode_size_unlock(inode, 0);
++ } else if (origin == 1) { /* SEEK_CUR */
++ offset += file->f_pos;
++ }
++
++ retval = -EINVAL;
++ if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
++ if (offset != file->f_pos) {
++ file->f_pos = offset;
++ file->f_version = 0;
++ }
++ retval = offset;
++ }
++
++ RETURN(retval);
++}
++
++int ll_fsync(struct file *file, struct dentry *dentry, int data)
++{
++ struct inode *inode = dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct ll_fid fid;
++ struct ptlrpc_request *req;
++ int rc, err;
++ ENTRY;
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
++ inode->i_generation, inode);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
++
++ /* fsync's caller has already called _fdata{sync,write}, we want
++ * that IO to finish before calling the osc and mdc sync methods */
++ rc = filemap_fdatawait(inode->i_mapping);
++
++ /* catch async errors that were recorded back when async writeback
++ * failed for pages in this mapping. */
++ err = lli->lli_async_rc;
++ lli->lli_async_rc = 0;
++ if (rc == 0)
++ rc = err;
++ if (lsm) {
++ err = lov_test_and_clear_async_rc(lsm);
++ if (rc == 0)
++ rc = err;
++ }
++
++ ll_inode2fid(&fid, inode);
++ err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
++ if (!rc)
++ rc = err;
++ if (!err)
++ ptlrpc_req_finished(req);
++
++ if (data && lsm) {
++ struct obdo *oa;
++
++ OBDO_ALLOC(oa);
++ if (!oa)
++ RETURN(rc ? rc : -ENOMEM);
++
++ oa->o_id = lsm->lsm_object_id;
++ oa->o_valid = OBD_MD_FLID;
++ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
++ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
++
++ err = obd_sync(ll_i2sbi(inode)->ll_osc_exp, oa, lsm,
++ 0, OBD_OBJECT_EOF);
++ if (!rc)
++ rc = err;
++ OBDO_FREE(oa);
++ }
++
++ RETURN(rc);
++}
++
++int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
++{
++ struct inode *inode = file->f_dentry->d_inode;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ldlm_res_id res_id =
++ { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} };
++ struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
++ ldlm_flock_completion_ast, NULL, file_lock };
++ struct lustre_handle lockh = {0};
++ ldlm_policy_data_t flock;
++ int flags = 0;
++ int rc;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
++ inode->i_ino, file_lock);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
++
++ if (file_lock->fl_flags & FL_FLOCK) {
++ LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
++ /* set missing params for flock() calls */
++ file_lock->fl_end = OFFSET_MAX;
++ file_lock->fl_pid = current->tgid;
++ }
++ flock.l_flock.pid = file_lock->fl_pid;
++ flock.l_flock.start = file_lock->fl_start;
++ flock.l_flock.end = file_lock->fl_end;
++
++ switch (file_lock->fl_type) {
++ case F_RDLCK:
++ einfo.ei_mode = LCK_PR;
++ break;
++ case F_UNLCK:
++ /* An unlock request may or may not have any relation to
++ * existing locks so we may not be able to pass a lock handle
++ * via a normal ldlm_lock_cancel() request. The request may even
++ * unlock a byte range in the middle of an existing lock. In
++ * order to process an unlock request we need all of the same
++ * information that is given with a normal read or write record
++ * lock request. To avoid creating another ldlm unlock (cancel)
++ * message we'll treat a LCK_NL flock request as an unlock. */
++ einfo.ei_mode = LCK_NL;
++ break;
++ case F_WRLCK:
++ einfo.ei_mode = LCK_PW;
++ break;
++ default:
++ CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
++ RETURN (-EINVAL);
++ }
++
++ switch (cmd) {
++ case F_SETLKW:
++#ifdef F_SETLKW64
++ case F_SETLKW64:
++#endif
++ flags = 0;
++ break;
++ case F_SETLK:
++#ifdef F_SETLK64
++ case F_SETLK64:
++#endif
++ flags = LDLM_FL_BLOCK_NOWAIT;
++ break;
++ case F_GETLK:
++#ifdef F_GETLK64
++ case F_GETLK64:
++#endif
++ flags = LDLM_FL_TEST_LOCK;
++ /* Save the old mode so that if the mode in the lock changes we
++ * can decrement the appropriate reader or writer refcount. */
++ file_lock->fl_type = einfo.ei_mode;
++ break;
++ default:
++ CERROR("unknown fcntl lock command: %d\n", cmd);
++ RETURN (-EINVAL);
++ }
++
++ CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
++ "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
++ flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
++
++ rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
++ &flock, &flags, NULL, 0, NULL, &lockh, 0);
++ if ((file_lock->fl_flags & FL_FLOCK) &&
++ (rc == 0 || file_lock->fl_type == F_UNLCK))
++ ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
++#ifdef HAVE_F_OP_FLOCK
++ if ((file_lock->fl_flags & FL_POSIX) &&
++ (rc == 0 || file_lock->fl_type == F_UNLCK) &&
++ !(flags & LDLM_FL_TEST_LOCK))
++ posix_lock_file_wait(file, file_lock);
++#endif
++
++ RETURN(rc);
++}
++
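The fcntl/flock cases above translate POSIX lock requests into LDLM flock enqueues: F_RDLCK maps to LCK_PR, F_WRLCK to LCK_PW, F_UNLCK to LCK_NL, with F_SETLK adding LDLM_FL_BLOCK_NOWAIT and F_GETLK adding LDLM_FL_TEST_LOCK. A small userspace sketch of the path this serves, assuming a client mounted with -o flock (the helper name is illustrative):

/* Hypothetical helper: take and drop a whole-file write lock through
 * fcntl(2); on a Lustre client this reaches ll_file_flock() above and
 * enqueues an LDLM_FLOCK lock of mode LCK_PW. */
#include <fcntl.h>
#include <unistd.h>

int lock_whole_file(int fd)
{
        struct flock fl = {
                .l_type   = F_WRLCK,   /* -> LCK_PW */
                .l_whence = SEEK_SET,
                .l_start  = 0,
                .l_len    = 0,         /* 0 means "to EOF" */
        };
        int rc = fcntl(fd, F_SETLKW, &fl);  /* blocking: flags == 0 */

        if (rc == 0) {
                fl.l_type = F_UNLCK;   /* -> LCK_NL, i.e. unlock */
                rc = fcntl(fd, F_SETLK, &fl);
        }
        return rc;
}
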
++int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
++{
++ ENTRY;
++
++ RETURN(-ENOSYS);
++}
++
++int ll_have_md_lock(struct inode *inode, __u64 bits)
++{
++ struct lustre_handle lockh;
++ struct ldlm_res_id res_id = { .name = {0} };
++ struct obd_device *obddev;
++ ldlm_policy_data_t policy = { .l_inodebits = {bits}};
++ int flags;
++ ENTRY;
++
++ if (!inode)
++ RETURN(0);
++
++ obddev = ll_i2mdcexp(inode)->exp_obd;
++ res_id.name[0] = inode->i_ino;
++ res_id.name[1] = inode->i_generation;
++
++ CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
++
++ flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
++ if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
++ &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
++ RETURN(1);
++ }
++
++ RETURN(0);
++}
++
++static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
++ if (rc == -ENOENT) { /* Already unlinked. Just update nlink
++ * and return success */
++ inode->i_nlink = 0;
++ /* This path cannot be hit for regular files except in
++ * case of obscure races, so there is no need to
++ * validate size. */
++ if (!S_ISREG(inode->i_mode) &&
++ !S_ISDIR(inode->i_mode))
++ return 0;
++ }
++
++ if (rc) {
++ CERROR("failure %d inode %lu\n", rc, inode->i_ino);
++ return -abs(rc);
++ }
++
++ return 0;
++}
++
++int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
++{
++ struct inode *inode = dentry->d_inode;
++ struct ptlrpc_request *req = NULL;
++ struct obd_export *exp;
++ int rc;
++ ENTRY;
++
++ if (!inode) {
++ CERROR("REPORT THIS LINE TO PETER\n");
++ RETURN(0);
++ }
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
++ inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
++
++ exp = ll_i2mdcexp(inode);
++
++ if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
++ struct lookup_intent oit = { .it_op = IT_GETATTR };
++ struct mdc_op_data op_data;
++
++ /* Call getattr by fid, so do not provide name at all. */
++ ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
++ dentry->d_inode, NULL, 0, 0, NULL);
++ rc = mdc_intent_lock(exp, &op_data, NULL, 0,
++ /* we are not interested in name
++ based lookup */
++ &oit, 0, &req,
++ ll_mdc_blocking_ast, 0);
++ if (rc < 0) {
++ rc = ll_inode_revalidate_fini(inode, rc);
++ GOTO (out, rc);
++ }
++
++ rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
++ if (rc != 0) {
++ ll_intent_release(&oit);
++ GOTO(out, rc);
++ }
++
++ /* Unlinked? Unhash dentry, so it is not picked up later by
++ do_lookup() -> ll_revalidate_it(). We cannot use d_drop
++ here to preserve get_cwd functionality on 2.6.
++ Bug 10503 */
++ if (!dentry->d_inode->i_nlink) {
++ spin_lock(&ll_lookup_lock);
++ spin_lock(&dcache_lock);
++ ll_drop_dentry(dentry);
++ spin_unlock(&dcache_lock);
++ spin_unlock(&ll_lookup_lock);
++ }
++
++ ll_lookup_finish_locks(&oit, dentry);
++ } else if (!ll_have_md_lock(dentry->d_inode,
++ MDS_INODELOCK_UPDATE|MDS_INODELOCK_LOOKUP)) {
++ struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
++ struct ll_fid fid;
++ obd_valid valid = OBD_MD_FLGETATTR;
++ int ealen = 0;
++
++ if (S_ISREG(inode->i_mode)) {
++ rc = ll_get_max_mdsize(sbi, &ealen);
++ if (rc)
++ RETURN(rc);
++ valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
++ }
++ ll_inode2fid(&fid, inode);
++ rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
++ if (rc) {
++ rc = ll_inode_revalidate_fini(inode, rc);
++ RETURN(rc);
++ }
++
++ rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
++ NULL);
++ if (rc)
++ GOTO(out, rc);
++ }
++
++ /* if object not yet allocated, don't validate size */
++ if (ll_i2info(inode)->lli_smd == NULL)
++ GOTO(out, rc = 0);
++
++ /* ll_glimpse_size will prefer locally cached writes if they extend
++ * the file */
++ rc = ll_glimpse_size(inode, 0);
++
++out:
++ ptlrpc_req_finished(req);
++ RETURN(rc);
++}
++
++int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
++ struct lookup_intent *it, struct kstat *stat)
++{
++ struct inode *inode = de->d_inode;
++ int res = 0;
++
++ res = ll_inode_revalidate_it(de, it);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
++
++ if (res)
++ return res;
++
++ stat->dev = inode->i_sb->s_dev;
++ stat->ino = inode->i_ino;
++ stat->mode = inode->i_mode;
++ stat->nlink = inode->i_nlink;
++ stat->uid = inode->i_uid;
++ stat->gid = inode->i_gid;
++ stat->rdev = kdev_t_to_nr(inode->i_rdev);
++ stat->atime = inode->i_atime;
++ stat->mtime = inode->i_mtime;
++ stat->ctime = inode->i_ctime;
++#ifdef HAVE_INODE_BLKSIZE
++ stat->blksize = inode->i_blksize;
++#else
++ stat->blksize = 1<<inode->i_blkbits;
++#endif
++
++ ll_inode_size_lock(inode, 0);
++ stat->size = i_size_read(inode);
++ stat->blocks = inode->i_blocks;
++ ll_inode_size_unlock(inode, 0);
++
++ return 0;
++}
++int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
++{
++ struct lookup_intent it = { .it_op = IT_GETATTR };
++
++ return ll_getattr_it(mnt, de, &it, stat);
++}
++
++static
++int lustre_check_acl(struct inode *inode, int mask)
++{
++#ifdef CONFIG_FS_POSIX_ACL
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct posix_acl *acl;
++ int rc;
++ ENTRY;
++
++ spin_lock(&lli->lli_lock);
++ acl = posix_acl_dup(lli->lli_posix_acl);
++ spin_unlock(&lli->lli_lock);
++
++ if (!acl)
++ RETURN(-EAGAIN);
++
++ rc = posix_acl_permission(inode, acl, mask);
++ posix_acl_release(acl);
++
++ RETURN(rc);
++#else
++ return -EAGAIN;
++#endif
++}
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
++int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
++{
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
++ inode->i_ino, inode->i_generation, inode, mask);
++
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
++ return generic_permission(inode, mask, lustre_check_acl);
++}
++#else
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
++#else
++int ll_inode_permission(struct inode *inode, int mask)
++#endif
++{
++ int mode = inode->i_mode;
++ int rc;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
++ inode->i_ino, inode->i_generation, inode, mask);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
++
++ if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
++ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
++ return -EROFS;
++ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
++ return -EACCES;
++ if (current->fsuid == inode->i_uid) {
++ mode >>= 6;
++ } else if (1) {
++ if (((mode >> 3) & mask & S_IRWXO) != mask)
++ goto check_groups;
++ rc = lustre_check_acl(inode, mask);
++ if (rc == -EAGAIN)
++ goto check_groups;
++ if (rc == -EACCES)
++ goto check_capabilities;
++ return rc;
++ } else {
++check_groups:
++ if (in_group_p(inode->i_gid))
++ mode >>= 3;
++ }
++ if ((mode & mask & S_IRWXO) == mask)
++ return 0;
++
++check_capabilities:
++ if (!(mask & MAY_EXEC) ||
++ (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
++ if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
++ return 0;
++
++ if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
++ (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
++ return 0;
++
++ return -EACCES;
++}
++#endif
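To read the legacy permission path above concretely: an owner match shifts mode right by 6 so the rwx bits are tested directly; for a non-owner, if the group bits (mode >> 3) cannot possibly grant the request the code jumps straight to the group-membership check, otherwise it consults the cached POSIX ACL via lustre_check_acl(), falling back to the group check on -EAGAIN (no ACL cached) and to the CFS_CAP_DAC_* capability overrides on -EACCES or when the mode bits ultimately deny access.
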
++
++/* -o localflock - only provides locally consistent flock locks */
++struct file_operations ll_file_operations = {
++ .read = ll_file_read,
++#ifdef HAVE_FILE_READV
++ .readv = ll_file_readv,
++#else
++ .aio_read = ll_file_aio_read,
++#endif
++ .write = ll_file_write,
++#ifdef HAVE_FILE_WRITEV
++ .writev = ll_file_writev,
++#else
++ .aio_write = ll_file_aio_write,
++#endif
++ .ioctl = ll_file_ioctl,
++ .open = ll_file_open,
++ .release = ll_file_release,
++ .mmap = ll_file_mmap,
++ .llseek = ll_file_seek,
++ .sendfile = ll_file_sendfile,
++ .fsync = ll_fsync,
++};
++
++struct file_operations ll_file_operations_flock = {
++ .read = ll_file_read,
++#ifdef HAVE_FILE_READV
++ .readv = ll_file_readv,
++#else
++ .aio_read = ll_file_aio_read,
++#endif
++ .write = ll_file_write,
++#ifdef HAVE_FILE_WRITEV
++ .writev = ll_file_writev,
++#else
++ .aio_write = ll_file_aio_write,
++#endif
++ .ioctl = ll_file_ioctl,
++ .open = ll_file_open,
++ .release = ll_file_release,
++ .mmap = ll_file_mmap,
++ .llseek = ll_file_seek,
++ .sendfile = ll_file_sendfile,
++ .fsync = ll_fsync,
++#ifdef HAVE_F_OP_FLOCK
++ .flock = ll_file_flock,
++#endif
++ .lock = ll_file_flock
++};
++
++/* These are for -o noflock - to return ENOSYS on flock calls */
++struct file_operations ll_file_operations_noflock = {
++ .read = ll_file_read,
++#ifdef HAVE_FILE_READV
++ .readv = ll_file_readv,
++#else
++ .aio_read = ll_file_aio_read,
++#endif
++ .write = ll_file_write,
++#ifdef HAVE_FILE_WRITEV
++ .writev = ll_file_writev,
++#else
++ .aio_write = ll_file_aio_write,
++#endif
++ .ioctl = ll_file_ioctl,
++ .open = ll_file_open,
++ .release = ll_file_release,
++ .mmap = ll_file_mmap,
++ .llseek = ll_file_seek,
++ .sendfile = ll_file_sendfile,
++ .fsync = ll_fsync,
++#ifdef HAVE_F_OP_FLOCK
++ .flock = ll_file_noflock,
++#endif
++ .lock = ll_file_noflock
++};
++
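Note that these three operation tables line up with the client flock mount options: per the comments above, -o flock selects ll_file_operations_flock (cluster-coherent locks via the LDLM path in ll_file_flock()), -o localflock selects ll_file_operations (locally consistent flock locks only), and -o noflock selects ll_file_operations_noflock, whose ll_file_noflock() fails lock requests with -ENOSYS.
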
++struct inode_operations ll_file_inode_operations = {
++#ifdef HAVE_VFS_INTENT_PATCHES
++ .setattr_raw = ll_setattr_raw,
++#endif
++ .setattr = ll_setattr,
++ .truncate = ll_truncate,
++ .getattr = ll_getattr,
++ .permission = ll_inode_permission,
++ .setxattr = ll_setxattr,
++ .getxattr = ll_getxattr,
++ .listxattr = ll_listxattr,
++ .removexattr = ll_removexattr,
++};
++
++/* dynamic ioctl number support routines */
++static struct llioc_ctl_data {
++ struct rw_semaphore ioc_sem;
++ struct list_head ioc_head;
++} llioc = {
++ __RWSEM_INITIALIZER(llioc.ioc_sem),
++ CFS_LIST_HEAD_INIT(llioc.ioc_head)
++};
++
++
++struct llioc_data {
++ struct list_head iocd_list;
++ unsigned int iocd_size;
++ llioc_callback_t iocd_cb;
++ unsigned int iocd_count;
++ unsigned int iocd_cmd[0];
++};
++
++void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
++{
++ unsigned int size;
++ struct llioc_data *in_data = NULL;
++ ENTRY;
++
++ if (cb == NULL || cmd == NULL ||
++ count > LLIOC_MAX_CMD || count < 0)
++ RETURN(NULL);
++
++ size = sizeof(*in_data) + count * sizeof(unsigned int);
++ OBD_ALLOC(in_data, size);
++ if (in_data == NULL)
++ RETURN(NULL);
++
++ memset(in_data, 0, sizeof(*in_data));
++ in_data->iocd_size = size;
++ in_data->iocd_cb = cb;
++ in_data->iocd_count = count;
++ memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
++
++ down_write(&llioc.ioc_sem);
++ list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
++ up_write(&llioc.ioc_sem);
++
++ RETURN(in_data);
++}
++
++void ll_iocontrol_unregister(void *magic)
++{
++ struct llioc_data *tmp;
++
++ if (magic == NULL)
++ return;
++
++ down_write(&llioc.ioc_sem);
++ list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
++ if (tmp == magic) {
++ unsigned int size = tmp->iocd_size;
++
++ list_del(&tmp->iocd_list);
++ up_write(&llioc.ioc_sem);
++
++ OBD_FREE(tmp, size);
++ return;
++ }
++ }
++ up_write(&llioc.ioc_sem);
++
++ CWARN("didn't find iocontrol register block with magic: %p\n", magic);
++}
++
++EXPORT_SYMBOL(ll_iocontrol_register);
++EXPORT_SYMBOL(ll_iocontrol_unregister);
++
++enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg, int *rcp)
++{
++ enum llioc_iter ret = LLIOC_CONT;
++ struct llioc_data *data;
++ int rc = -EINVAL, i;
++
++ down_read(&llioc.ioc_sem);
++ list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
++ for (i = 0; i < data->iocd_count; i++) {
++ if (cmd != data->iocd_cmd[i])
++ continue;
++
++ ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
++ break;
++ }
++
++ if (ret == LLIOC_STOP)
++ break;
++ }
++ up_read(&llioc.ioc_sem);
++
++ if (rcp)
++ *rcp = rc;
++ return ret;
++}
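
The llioc registry above lets other modules hook additional ioctl numbers into ll_file_ioctl()'s default case without patching the switch statement. A hedged sketch of a consumer, matching the llioc_callback_t contract documented in llite_internal.h below (the command number 0xC00C and all names are made up for illustration):

/* Hypothetical module registering one extra ioctl command with llite.
 * Assumes the declarations from llite_internal.h. */
#include <linux/module.h>

static void *my_magic;
static unsigned int my_cmds[] = { 0xC00C };   /* illustrative cmd number */

static enum llioc_iter my_ioctl_cb(struct inode *inode, struct file *file,
                                   unsigned int cmd, unsigned long arg,
                                   void *magic, int *rcp)
{
        if (magic != my_magic)  /* collision: some other registration */
                return LLIOC_CONT;

        *rcp = 0;               /* handle cmd here and set the result */
        return LLIOC_STOP;      /* ll_file_ioctl() will return *rcp */
}

static int __init my_init(void)
{
        my_magic = ll_iocontrol_register(my_ioctl_cb, 1, my_cmds);
        return my_magic != NULL ? 0 : -ENOMEM;
}

static void __exit my_exit(void)
{
        ll_iocontrol_unregister(my_magic);
}

module_init(my_init);
module_exit(my_exit);
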
+diff -urNad lustre~/lustre/llite/llite_internal.h lustre/lustre/llite/llite_internal.h
+--- lustre~/lustre/llite/llite_internal.h 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/llite_internal.h 2009-03-12 11:02:51.000000000 +0100
+@@ -647,7 +647,7 @@
+ struct lookup_intent *it, struct kstat *stat);
+ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+ struct ll_file_data *ll_file_data_get(void);
+-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++#ifndef HAVE_INODE_PERMISION_2ARGS
+ int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
+ #else
+ int ll_inode_permission(struct inode *inode, int mask);
+@@ -727,9 +727,6 @@
+ /* llite/llite_nfs.c */
+ extern struct export_operations lustre_export_operations;
+ __u32 get_uuid2int(const char *name, int len);
+-struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
+- int fhtype, int parent);
+-int ll_dentry_to_fh(struct dentry *, __u32 *datap, int *lenp, int need_parent);
+
+ /* llite/special.c */
+ extern struct inode_operations ll_special_inode_operations;
+diff -urNad lustre~/lustre/llite/llite_internal.h.orig lustre/lustre/llite/llite_internal.h.orig
+--- lustre~/lustre/llite/llite_internal.h.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/llite/llite_internal.h.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,1027 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ */
++
++#ifndef LLITE_INTERNAL_H
++#define LLITE_INTERNAL_H
++
++#ifdef CONFIG_FS_POSIX_ACL
++# include <linux/fs.h>
++#ifdef HAVE_XATTR_ACL
++# include <linux/xattr_acl.h>
++#endif
++#ifdef HAVE_LINUX_POSIX_ACL_XATTR_H
++# include <linux/posix_acl_xattr.h>
++#endif
++#endif
++
++#include <lustre_debug.h>
++#include <lustre_ver.h>
++#include <linux/lustre_version.h>
++#include <lustre_disk.h> /* for s2sbi */
++
++#ifndef HAVE_LE_TYPES
++typedef __u16 __le16;
++typedef __u32 __le32;
++#endif
++
++/*
++struct lustre_intent_data {
++ __u64 it_lock_handle[2];
++ __u32 it_disposition;
++ __u32 it_status;
++ __u32 it_lock_mode;
++ }; */
++
++/* If there is no FMODE_EXEC defined, make it to match nothing */
++#ifndef FMODE_EXEC
++#define FMODE_EXEC 0
++#endif
++
++#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
++#define LUSTRE_FPRIVATE(file) ((file)->private_data)
++
++#ifdef HAVE_VFS_INTENT_PATCHES
++static inline struct lookup_intent *ll_nd2it(struct nameidata *nd)
++{
++ return &nd->intent;
++}
++#endif
++
++/*
++ * Directory entries are currently in the same format as ext2/ext3, but will
++ * be changed in the future to accommodate FIDs
++ */
++#define LL_DIR_NAME_LEN (255)
++#define LL_DIR_PAD (4)
++
++struct ll_dir_entry {
++ /* number of inode, referenced by this entry */
++ __le32 lde_inode;
++ /* total record length, multiple of LL_DIR_PAD */
++ __le16 lde_rec_len;
++ /* length of name */
++ __u8 lde_name_len;
++ /* file type: regular, directory, device, etc. */
++ __u8 lde_file_type;
++ /* name. NOT NUL-terminated */
++ char lde_name[LL_DIR_NAME_LEN];
++};
++
++struct ll_dentry_data {
++ int lld_cwd_count;
++ int lld_mnt_count;
++ struct obd_client_handle lld_cwd_och;
++ struct obd_client_handle lld_mnt_och;
++#ifndef HAVE_VFS_INTENT_PATCHES
++ struct lookup_intent *lld_it;
++#endif
++ unsigned int lld_sa_generation;
++};
++
++#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
++
++extern struct file_operations ll_pgcache_seq_fops;
++
++#define LLI_INODE_MAGIC 0x111d0de5
++#define LLI_INODE_DEAD 0xdeadd00d
++#define LLI_F_HAVE_OST_SIZE_LOCK 0
++#define LLI_F_HAVE_MDS_SIZE_LOCK 1
++#define LLI_F_CONTENDED 2
++#define LLI_F_SRVLOCK 3
++
++struct ll_inode_info {
++ int lli_inode_magic;
++ struct semaphore lli_size_sem; /* protect open and change size */
++ void *lli_size_sem_owner;
++ struct semaphore lli_write_sem;
++ struct lov_stripe_md *lli_smd;
++ char *lli_symlink_name;
++ __u64 lli_maxbytes;
++ __u64 lli_io_epoch;
++ unsigned long lli_flags;
++ cfs_time_t lli_contention_time;
++
++ /* this lock protects s_d_w and p_w_ll and mmap_cnt */
++ spinlock_t lli_lock;
++#ifdef HAVE_CLOSE_THREAD
++ struct list_head lli_pending_write_llaps;
++ struct list_head lli_close_item;
++ int lli_send_done_writing;
++#endif
++ atomic_t lli_mmap_cnt;
++
++ /* for writepage() only to communicate to fsync */
++ int lli_async_rc;
++
++ struct posix_acl *lli_posix_acl;
++
++ struct list_head lli_dead_list;
++
++ struct semaphore lli_och_sem; /* Protects access to och pointers
++ and their usage counters */
++ /* We need all three because every inode may be opened in different
++ modes */
++ struct obd_client_handle *lli_mds_read_och;
++ __u64 lli_open_fd_read_count;
++ struct obd_client_handle *lli_mds_write_och;
++ __u64 lli_open_fd_write_count;
++ struct obd_client_handle *lli_mds_exec_och;
++ __u64 lli_open_fd_exec_count;
++ struct inode lli_vfs_inode;
++
++ /* metadata stat-ahead */
++ /*
++ * "opendir_pid" is the token when lookup/revalid -- I am the owner of
++ * dir statahead.
++ */
++ pid_t lli_opendir_pid;
++ /*
++ * since parent and child threads can share the same @file struct,
++ * "opendir_key" is the token used at dir close to handle the case of
++ * the parent exiting before the child -- its holder should clean up
++ * the dir readahead. */
++ void *lli_opendir_key;
++ struct ll_statahead_info *lli_sai;
++};
++
++/*
++ * Locking to guarantee consistency of non-atomic updates to long long i_size,
++ * consistency between file size and KMS, and consistency within
++ * ->lli_smd->lsm_oinfo[]'s.
++ *
++ * Implemented by ->lli_size_sem and ->lsm_sem, nested in that order.
++ */
++
++void ll_inode_size_lock(struct inode *inode, int lock_lsm);
++void ll_inode_size_unlock(struct inode *inode, int unlock_lsm);
++
++// FIXME: replace the name of this with LL_I to conform to kernel stuff
++// static inline struct ll_inode_info *LL_I(struct inode *inode)
++static inline struct ll_inode_info *ll_i2info(struct inode *inode)
++{
++ return container_of(inode, struct ll_inode_info, lli_vfs_inode);
++}
++
++/* default to about 40meg of readahead on a given system. That much tied
++ * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
++#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - CFS_PAGE_SHIFT))
++
++/* default to read-ahead full files smaller than 2MB on the second read */
++#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - CFS_PAGE_SHIFT))
++
++enum ra_stat {
++ RA_STAT_HIT = 0,
++ RA_STAT_MISS,
++ RA_STAT_DISTANT_READPAGE,
++ RA_STAT_MISS_IN_WINDOW,
++ RA_STAT_FAILED_GRAB_PAGE,
++ RA_STAT_FAILED_MATCH,
++ RA_STAT_DISCARDED,
++ RA_STAT_ZERO_LEN,
++ RA_STAT_ZERO_WINDOW,
++ RA_STAT_EOF,
++ RA_STAT_MAX_IN_FLIGHT,
++ RA_STAT_WRONG_GRAB_PAGE,
++ _NR_RA_STAT,
++};
++
++struct ll_ra_info {
++ unsigned long ra_cur_pages;
++ unsigned long ra_max_pages;
++ unsigned long ra_max_read_ahead_whole_pages;
++ unsigned long ra_stats[_NR_RA_STAT];
++};
++
++/* LL_HIST_MAX=32 causes an overflow */
++#define LL_HIST_MAX 28
++#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
++#define LL_PROCESS_HIST_MAX 10
++struct per_process_info {
++ pid_t pid;
++ struct obd_histogram pp_r_hist;
++ struct obd_histogram pp_w_hist;
++};
++
++/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
++struct ll_rw_extents_info {
++ struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
++};
++
++#define LL_OFFSET_HIST_MAX 100
++struct ll_rw_process_info {
++ pid_t rw_pid;
++ int rw_op;
++ loff_t rw_range_start;
++ loff_t rw_range_end;
++ loff_t rw_last_file_pos;
++ loff_t rw_offset;
++ size_t rw_smallest_extent;
++ size_t rw_largest_extent;
++ struct file *rw_last_file;
++};
++
++
++enum stats_track_type {
++ STATS_TRACK_ALL = 0, /* track all processes */
++ STATS_TRACK_PID, /* track process with this pid */
++ STATS_TRACK_PPID, /* track processes with this ppid */
++ STATS_TRACK_GID, /* track processes with this gid */
++ STATS_TRACK_LAST,
++};
++
++/* flags for sbi->ll_flags */
++#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */
++#define LL_SBI_DATA_CHECKSUM 0x02 /* checksum each page on the wire */
++#define LL_SBI_FLOCK 0x04
++#define LL_SBI_USER_XATTR 0x08 /* support user xattr */
++#define LL_SBI_ACL 0x10 /* support ACL */
++#define LL_SBI_JOIN 0x20 /* support JOIN */
++#define LL_SBI_LOCALFLOCK 0x40 /* Local flocks support by kernel */
++#define LL_SBI_LRU_RESIZE 0x80 /* support lru resize */
++#define LL_SBI_LLITE_CHECKSUM 0x100 /* checksum each page in memory */
++
++/* default value for ll_sb_info->contention_time */
++#define SBI_DEFAULT_CONTENTION_SECONDS 60
++/* default value for lockless_truncate_enable */
++#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
++
++struct ll_sb_info {
++ struct list_head ll_list;
++ /* this protects pglist and ra_info. It isn't safe to
++ * grab from interrupt contexts */
++ spinlock_t ll_lock;
++ spinlock_t ll_pp_extent_lock; /* Lock for pp_extent entries */
++ spinlock_t ll_process_lock; /* Lock for ll_rw_process_info */
++ struct obd_uuid ll_sb_uuid;
++ struct obd_export *ll_mdc_exp;
++ struct obd_export *ll_osc_exp;
++ struct proc_dir_entry *ll_proc_root;
++ obd_id ll_rootino; /* number of root inode */
++
++ int ll_flags;
++ struct list_head ll_conn_chain; /* per-conn chain of SBs */
++ struct lustre_client_ocd ll_lco;
++
++ struct list_head ll_orphan_dentry_list; /*please don't ask -p*/
++ struct ll_close_queue *ll_lcq;
++
++ struct lprocfs_stats *ll_stats; /* lprocfs stats counter */
++
++ unsigned long ll_async_page_max;
++ unsigned long ll_async_page_count;
++ unsigned long ll_pglist_gen;
++ struct list_head ll_pglist; /* all pages (llap_pglist_item) */
++
++ unsigned ll_contention_time; /* seconds */
++ unsigned ll_lockless_truncate_enable; /* true/false */
++
++ struct ll_ra_info ll_ra_info;
++ unsigned int ll_namelen;
++ struct file_operations *ll_fop;
++
++#ifdef HAVE_EXPORT___IGET
++ struct list_head ll_deathrow; /* inodes to be destroyed (b1443) */
++ spinlock_t ll_deathrow_lock;
++#endif
++ /* =0 - hold lock over whole read/write
++ * >0 - max. chunk to be read/written w/o lock re-acquiring */
++ unsigned long ll_max_rw_chunk;
++
++ /* Statistics */
++ struct ll_rw_extents_info ll_rw_extents_info;
++ int ll_extent_process_count;
++ struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
++ unsigned int ll_offset_process_count;
++ struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
++ unsigned int ll_rw_offset_entry_count;
++ enum stats_track_type ll_stats_track_type;
++ int ll_stats_track_id;
++ int ll_rw_stats_on;
++ dev_t ll_sdev_orig; /* save s_dev before assign for
++ * clustered NFS */
++
++ /* metadata stat-ahead */
++ unsigned int ll_sa_max; /* max statahead RPCs */
++ unsigned int ll_sa_wrong; /* statahead thread stopped for
++ * low hit ratio */
++ unsigned int ll_sa_total; /* statahead thread started
++ * count */
++ unsigned long long ll_sa_blocked; /* ls count waiting for
++ * statahead */
++ unsigned long long ll_sa_cached; /* ls count got in cache */
++ unsigned long long ll_sa_hit; /* hit count */
++ unsigned long long ll_sa_miss; /* miss count */
++};
++
++#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024)
++
++struct ll_ra_read {
++ pgoff_t lrr_start;
++ pgoff_t lrr_count;
++ struct task_struct *lrr_reader;
++ struct list_head lrr_linkage;
++};
++
++/*
++ * per file-descriptor read-ahead data.
++ */
++struct ll_readahead_state {
++ spinlock_t ras_lock;
++ /*
++ * index of the last page that read(2) needed and that wasn't in the
++ * cache. Used by ras_update() to detect seeks.
++ *
++ * XXX nikita: if access seeks into cached region, Lustre doesn't see
++ * this.
++ */
++ unsigned long ras_last_readpage;
++ /*
++ * number of pages read after last read-ahead window reset. As window
++ * is reset on each seek, this is effectively a number of consecutive
++ * accesses. Maybe ->ras_accessed_in_window is a better name.
++ *
++ * XXX nikita: window is also reset (by ras_update()) when Lustre
++ * believes that memory pressure evicts read-ahead pages. In that
++ * case, it probably doesn't make sense to expand window to
++ * PTLRPC_MAX_BRW_PAGES on the third access.
++ */
++ unsigned long ras_consecutive_pages;
++ /*
++ * number of read requests after the last read-ahead window reset.
++ * As the window is reset on each seek, this is effectively the number
++ * of consecutive read requests and is used to trigger read-ahead.
++ */
++ unsigned long ras_consecutive_requests;
++ /*
++ * Parameters of current read-ahead window. Handled by
++ * ras_update(). On the initial access to the file or after a seek,
++ * window is reset to 0. After 3 consecutive accesses, window is
++ * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
++ * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
++ */
++ unsigned long ras_window_start, ras_window_len;
++ /*
++ * Where next read-ahead should start at. This lies within read-ahead
++ * window. Read-ahead window is read in pieces rather than at once
++ * because: 1. lustre limits total number of pages under read-ahead by
++ * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages
++ * not covered by DLM lock.
++ */
++ unsigned long ras_next_readahead;
++ /*
++ * Total number of ll_file_read requests issued, reads originating
++ * due to mmap are not counted in this total. This value is used to
++ * trigger full file read-ahead after multiple reads to a small file.
++ */
++ unsigned long ras_requests;
++ /*
++ * Page index with respect to the current request; this value
++ * will not be accurate when dealing with reads issued via mmap.
++ */
++ unsigned long ras_request_index;
++ /*
++ * list of struct ll_ra_read's, one per read(2) call currently in
++ * progress against this file descriptor. Used by read-ahead code,
++ * protected by ->ras_lock.
++ */
++ struct list_head ras_read_beads;
++ /*
++ * The following 3 items are used for detecting the stride I/O
++ * mode.
++ * In stride I/O mode,
++ * ...............|-----data-----|****gap*****|--------|******|....
++ * offset |-stride_pages-|-stride_gap-|
++ * ras_stride_offset = offset;
++ * ras_stride_length = stride_pages + stride_gap;
++ * ras_stride_pages = stride_pages;
++ * Note: all three items are counted in pages.
++ */
++ unsigned long ras_stride_length;
++ unsigned long ras_stride_pages;
++ pgoff_t ras_stride_offset;
++ /*
++ * number of consecutive stride requests; similar to
++ * ras_consecutive_requests, but used for stride I/O mode.
++ * Note: stride read-ahead is only enabled once more than 2
++ * consecutive stride requests have been detected.
++ */
++ unsigned long ras_consecutive_stride_requests;
++};
++
++extern cfs_mem_cache_t *ll_file_data_slab;
++struct lustre_handle;
++struct ll_file_data {
++ struct ll_readahead_state fd_ras;
++ int fd_omode;
++ struct lustre_handle fd_cwlockh;
++ unsigned long fd_gid;
++ __u32 fd_flags;
++};
++
++struct lov_stripe_md;
++
++extern spinlock_t inode_lock;
++
++extern struct proc_dir_entry *proc_lustre_fs_root;
++
++static inline struct inode *ll_info2i(struct ll_inode_info *lli)
++{
++ return &lli->lli_vfs_inode;
++}
++
++struct it_cb_data {
++ struct inode *icbd_parent;
++ struct dentry **icbd_childp;
++ obd_id hash;
++};
++
++void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
++
++#define LLAP_MAGIC 98764321
++
++extern cfs_mem_cache_t *ll_async_page_slab;
++extern size_t ll_async_page_slab_size;
++struct ll_async_page {
++ int llap_magic;
++ /* only trust these if the page lock is providing exclusion */
++ unsigned int llap_write_queued:1,
++ llap_defer_uptodate:1,
++ llap_origin:3,
++ llap_ra_used:1,
++ llap_ignore_quota:1,
++ llap_nocache:1,
++ llap_lockless_io_page:1;
++ void *llap_cookie;
++ struct page *llap_page;
++ struct list_head llap_pending_write;
++ struct list_head llap_pglist_item;
++ /* checksum for paranoid I/O debugging */
++ __u32 llap_checksum;
++};
++
++/*
++ * enumeration of llap_from_page() call-sites. Used to export statistics in
++ * /proc/fs/lustre/llite/fsN/dump_page_cache.
++ */
++enum {
++ LLAP_ORIGIN_UNKNOWN = 0,
++ LLAP_ORIGIN_READPAGE,
++ LLAP_ORIGIN_READAHEAD,
++ LLAP_ORIGIN_COMMIT_WRITE,
++ LLAP_ORIGIN_WRITEPAGE,
++ LLAP_ORIGIN_REMOVEPAGE,
++ LLAP_ORIGIN_LOCKLESS_IO,
++ LLAP__ORIGIN_MAX,
++};
++extern char *llap_origins[];
++
++#ifdef HAVE_REGISTER_CACHE
++#define ll_register_cache(cache) register_cache(cache)
++#define ll_unregister_cache(cache) unregister_cache(cache)
++#else
++#define ll_register_cache(cache) do {} while (0)
++#define ll_unregister_cache(cache) do {} while (0)
++#endif
++
++void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
++void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
++struct ll_ra_read *ll_ra_read_get(struct file *f);
++
++/* llite/lproc_llite.c */
++#ifdef LPROCFS
++int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
++ struct super_block *sb, char *osc, char *mdc);
++void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
++void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
++void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
++#else
++static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
++ struct super_block *sb, char *osc, char *mdc){return 0;}
++static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
++static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {}
++static void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
++{
++ memset(lvars, 0, sizeof(*lvars));
++}
++#endif
++
++
++/* llite/dir.c */
++extern struct file_operations ll_dir_operations;
++extern struct inode_operations ll_dir_inode_operations;
++
++struct page *ll_get_dir_page(struct inode *dir, unsigned long n);
++
++static inline unsigned ll_dir_rec_len(unsigned name_len)
++{
++ return (name_len + 8 + LL_DIR_PAD - 1) & ~(LL_DIR_PAD - 1);
++}
++
++static inline struct ll_dir_entry *ll_entry_at(void *base, unsigned offset)
++{
++ return (struct ll_dir_entry *)((char *)base + offset);
++}
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ll_dir_entry *ll_dir_next_entry(struct ll_dir_entry *p)
++{
++ return ll_entry_at(p, le16_to_cpu(p->lde_rec_len));
++}
++
++static inline void ll_put_page(struct page *page)
++{
++ kunmap(page);
++ page_cache_release(page);
++}
++
++static inline unsigned long dir_pages(struct inode *inode)
++{
++ return (inode->i_size + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
++}
++
++int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir);
++struct inode *ll_iget(struct super_block *sb, ino_t hash,
++ struct lustre_md *lic);
++int ll_mdc_cancel_unused(struct lustre_handle *, struct inode *, int flags,
++ void *opaque);
++int ll_mdc_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
++ void *data, int flag);
++int ll_prepare_mdc_op_data(struct mdc_op_data *,
++ struct inode *i1, struct inode *i2,
++ const char *name, int namelen, int mode, void *data);
++#ifndef HAVE_VFS_INTENT_PATCHES
++struct lookup_intent *ll_convert_intent(struct open_intent *oit,
++ int lookup_flags);
++#endif
++void ll_pin_extent_cb(void *data);
++int ll_page_removal_cb(void *data, int discard);
++int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
++ void *data, int flag);
++int lookup_it_finish(struct ptlrpc_request *request, int offset,
++ struct lookup_intent *it, void *data);
++void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
++
++/* llite/rw.c */
++int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
++int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
++int ll_writepage(struct page *page);
++void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa);
++int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc);
++int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction);
++extern struct cache_definition ll_cache_definition;
++void ll_removepage(struct page *page);
++int ll_readpage(struct file *file, struct page *page);
++struct ll_async_page *llap_cast_private(struct page *page);
++void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
++void ll_ra_accounting(struct ll_async_page *llap,struct address_space *mapping);
++void ll_truncate(struct inode *inode);
++int ll_file_punch(struct inode *, loff_t, int);
++ssize_t ll_file_lockless_io(struct file *, const struct iovec *,
++ unsigned long, loff_t *, int, ssize_t);
++void ll_clear_file_contended(struct inode*);
++int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
++
++/* llite/file.c */
++extern struct file_operations ll_file_operations;
++extern struct file_operations ll_file_operations_flock;
++extern struct file_operations ll_file_operations_noflock;
++extern struct inode_operations ll_file_inode_operations;
++extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *);
++extern int ll_have_md_lock(struct inode *inode, __u64 bits);
++int ll_region_mapped(unsigned long addr, size_t count);
++int ll_extent_lock(struct ll_file_data *, struct inode *,
++ struct lov_stripe_md *, int mode, ldlm_policy_data_t *,
++ struct lustre_handle *, int ast_flags);
++int ll_extent_unlock(struct ll_file_data *, struct inode *,
++ struct lov_stripe_md *, int mode, struct lustre_handle *);
++int ll_file_open(struct inode *inode, struct file *file);
++int ll_file_release(struct inode *inode, struct file *file);
++int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
++int ll_glimpse_ioctl(struct ll_sb_info *sbi,
++ struct lov_stripe_md *lsm, lstat_t *st);
++int ll_glimpse_size(struct inode *inode, int ast_flags);
++int ll_local_open(struct file *file,
++ struct lookup_intent *it, struct ll_file_data *fd,
++ struct obd_client_handle *och);
++int ll_release_openhandle(struct dentry *, struct lookup_intent *);
++int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
++ struct file *file);
++int ll_mdc_real_close(struct inode *inode, int flags);
++extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
++ *file, size_t count, int rw);
++int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
++ struct lookup_intent *it, struct kstat *stat);
++int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
++struct ll_file_data *ll_file_data_get(void);
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
++#else
++int ll_inode_permission(struct inode *inode, int mask);
++#endif
++int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
++ int flags, struct lov_user_md *lum,
++ int lum_size);
++int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
++ struct lov_mds_md **lmm, int *lmm_size,
++ struct ptlrpc_request **request);
++int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
++ int set_default);
++int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
++ int *lmm_size, struct ptlrpc_request **request);
++int ll_fsync(struct file *file, struct dentry *dentry, int data);
++int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
++ int num_bytes);
++
++/* llite/dcache.c */
++/* llite/namei.c */
++/**
++ * protect race ll_find_aliases vs ll_revalidate_it vs ll_unhash_aliases
++ */
++extern spinlock_t ll_lookup_lock;
++extern struct dentry_operations ll_d_ops;
++void ll_intent_drop_lock(struct lookup_intent *);
++void ll_intent_release(struct lookup_intent *);
++extern void ll_set_dd(struct dentry *de);
++int ll_drop_dentry(struct dentry *dentry);
++void ll_unhash_aliases(struct inode *);
++void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
++void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
++int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name);
++int revalidate_it_finish(struct ptlrpc_request *request, int offset,
++ struct lookup_intent *it, struct dentry *de);
++
++/* llite/llite_lib.c */
++extern struct super_operations lustre_super_operations;
++
++char *ll_read_opt(const char *opt, char *data);
++void ll_lli_init(struct ll_inode_info *lli);
++int ll_fill_super(struct super_block *sb);
++void ll_put_super(struct super_block *sb);
++void ll_kill_super(struct super_block *sb);
++struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
++void ll_clear_inode(struct inode *inode);
++int ll_setattr_raw(struct inode *inode, struct iattr *attr);
++int ll_setattr(struct dentry *de, struct iattr *attr);
++#ifndef HAVE_STATFS_DENTRY_PARAM
++int ll_statfs(struct super_block *sb, struct kstatfs *sfs);
++#else
++int ll_statfs(struct dentry *de, struct kstatfs *sfs);
++#endif
++int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
++ __u64 max_age, __u32 flags);
++void ll_update_inode(struct inode *inode, struct lustre_md *md);
++void ll_read_inode2(struct inode *inode, void *opaque);
++int ll_iocontrol(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg);
++#ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
++void ll_umount_begin(struct vfsmount *vfsmnt, int flags);
++#else
++void ll_umount_begin(struct super_block *sb);
++#endif
++int ll_remount_fs(struct super_block *sb, int *flags, char *data);
++int ll_show_options(struct seq_file *seq, struct vfsmount *vfs);
++int ll_prep_inode(struct obd_export *exp, struct inode **inode,
++ struct ptlrpc_request *req, int offset, struct super_block *);
++void lustre_dump_dentry(struct dentry *, int recur);
++void lustre_dump_inode(struct inode *);
++struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
++ struct list_head *list);
++int ll_obd_statfs(struct inode *inode, void *arg);
++int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
++int ll_process_config(struct lustre_cfg *lcfg);
++
++/* llite/llite_nfs.c */
++extern struct export_operations lustre_export_operations;
++__u32 get_uuid2int(const char *name, int len);
++struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
++ int fhtype, int parent);
++int ll_dentry_to_fh(struct dentry *, __u32 *datap, int *lenp, int need_parent);
++
++/* llite/special.c */
++extern struct inode_operations ll_special_inode_operations;
++extern struct file_operations ll_special_chr_inode_fops;
++extern struct file_operations ll_special_chr_file_fops;
++extern struct file_operations ll_special_blk_inode_fops;
++extern struct file_operations ll_special_fifo_inode_fops;
++extern struct file_operations ll_special_fifo_file_fops;
++extern struct file_operations ll_special_sock_inode_fops;
++
++/* llite/symlink.c */
++extern struct inode_operations ll_fast_symlink_inode_operations;
++
++/* llite/llite_close.c */
++struct ll_close_queue {
++ spinlock_t lcq_lock;
++ struct list_head lcq_list;
++ wait_queue_head_t lcq_waitq;
++ struct completion lcq_comp;
++};
++
++#ifdef HAVE_CLOSE_THREAD
++void llap_write_pending(struct inode *inode, struct ll_async_page *llap);
++void llap_write_complete(struct inode *inode, struct ll_async_page *llap);
++void ll_open_complete(struct inode *inode);
++int ll_is_inode_dirty(struct inode *inode);
++void ll_try_done_writing(struct inode *inode);
++void ll_queue_done_writing(struct inode *inode);
++#else
++static inline void llap_write_pending(struct inode *inode,
++ struct ll_async_page *llap) { return; };
++static inline void llap_write_complete(struct inode *inode,
++ struct ll_async_page *llap) { return; };
++static inline void ll_open_complete(struct inode *inode) { return; };
++static inline int ll_is_inode_dirty(struct inode *inode) { return 0; };
++static inline void ll_try_done_writing(struct inode *inode) { return; };
++static inline void ll_queue_done_writing(struct inode *inode) { return; };
++//static inline void ll_close_thread_shutdown(struct ll_close_queue *lcq) { return; };
++//static inline int ll_close_thread_start(struct ll_close_queue **lcq_ret) { return 0; };
++#endif
++void ll_close_thread_shutdown(struct ll_close_queue *lcq);
++int ll_close_thread_start(struct ll_close_queue **lcq_ret);
++
++/* llite/llite_mmap.c */
++typedef struct rb_root rb_root_t;
++typedef struct rb_node rb_node_t;
++
++struct ll_lock_tree_node;
++struct ll_lock_tree {
++ rb_root_t lt_root;
++ struct list_head lt_locked_list;
++ struct ll_file_data *lt_fd;
++};
++
++int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
++int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
++struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
++ __u64 end, ldlm_mode_t mode);
++int ll_tree_lock(struct ll_lock_tree *tree,
++ struct ll_lock_tree_node *first_node,
++ const char *buf, size_t count, int ast_flags);
++int ll_tree_lock_iov(struct ll_lock_tree *tree,
++ struct ll_lock_tree_node *first_node,
++ const struct iovec *iov, unsigned long nr_segs,
++ int ast_flags);
++int ll_tree_unlock(struct ll_lock_tree *tree);
++
++#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi)
++
++static inline __u64 ll_ts2u64(struct timespec *time)
++{
++ __u64 t = time->tv_sec;
++ return t;
++}
++
++/* don't need an addref as the sb_info should be holding one */
++static inline struct obd_export *ll_s2obdexp(struct super_block *sb)
++{
++ return ll_s2sbi(sb)->ll_osc_exp;
++}
++
++/* don't need an addref as the sb_info should be holding one */
++static inline struct obd_export *ll_s2mdcexp(struct super_block *sb)
++{
++ return ll_s2sbi(sb)->ll_mdc_exp;
++}
++
++static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
++{
++ struct obd_device *obd = sbi->ll_mdc_exp->exp_obd;
++ if (obd == NULL)
++ LBUG();
++ return &obd->u.cli;
++}
++
++// FIXME: replace the name of this with LL_SB to conform to kernel stuff
++static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
++{
++ return ll_s2sbi(inode->i_sb);
++}
++
++static inline struct obd_export *ll_i2obdexp(struct inode *inode)
++{
++ return ll_s2obdexp(inode->i_sb);
++}
++
++static inline struct obd_export *ll_i2mdcexp(struct inode *inode)
++{
++ return ll_s2mdcexp(inode->i_sb);
++}
++
++static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
++{
++ mdc_pack_fid(fid, inode->i_ino, inode->i_generation,
++ inode->i_mode & S_IFMT);
++}
++
++static inline int ll_mds_max_easize(struct super_block *sb)
++{
++ return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
++}
++
++static inline __u64 ll_file_maxbytes(struct inode *inode)
++{
++ return ll_i2info(inode)->lli_maxbytes;
++}
++
++/* llite/xattr.c */
++int ll_setxattr(struct dentry *dentry, const char *name,
++ const void *value, size_t size, int flags);
++ssize_t ll_getxattr(struct dentry *dentry, const char *name,
++ void *buffer, size_t size);
++ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
++int ll_removexattr(struct dentry *dentry, const char *name);
++
++/* statahead.c */
++
++#define LL_SA_RPC_MIN 2
++#define LL_SA_RPC_DEF 32
++#define LL_SA_RPC_MAX 8192
++
++/* per inode struct, for dir only */
++struct ll_statahead_info {
++ struct inode *sai_inode;
++ unsigned int sai_generation; /* generation for statahead */
++ atomic_t sai_refcount; /* when access this struct, hold
++ * refcount */
++ unsigned int sai_sent; /* stat requests sent count */
++ unsigned int sai_replied; /* stat requests which received
++ * reply */
++ unsigned int sai_max; /* max ahead of lookup */
++ unsigned int sai_index; /* index of statahead entry */
++ unsigned int sai_index_next; /* index for the next statahead
++ * entry to be stated */
++ unsigned int sai_hit; /* hit count */
++ unsigned int sai_miss; /* miss count:
++ * for "ls -al" case, it includes
++ * hidden dentry miss;
++ * for "ls -l" case, it does not
++ * include hidden dentry miss.
++ * "sai_miss_hidden" is used for
++ * the latter case.
++ */
++ unsigned int sai_consecutive_miss; /* consecutive miss */
++ unsigned int sai_miss_hidden;/* "ls -al", but first dentry
++ * is not a hidden one */
++ unsigned int sai_skip_hidden;/* skipped hidden dentry count */
++ unsigned int sai_ls_all:1; /* "ls -al", do stat-ahead for
++ * hidden entries */
++ cfs_waitq_t sai_waitq; /* stat-ahead wait queue */
++ struct ptlrpc_thread sai_thread; /* stat-ahead thread */
++ struct list_head sai_entries_sent; /* entries sent out */
++ struct list_head sai_entries_received; /* entries returned */
++ struct list_head sai_entries_stated; /* entries stated */
++};
++
++int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup);
++void ll_statahead_exit(struct dentry *dentry, int result);
++void ll_stop_statahead(struct inode *inode, void *key);
++
++static inline
++void ll_statahead_mark(struct dentry *dentry)
++{
++ struct ll_inode_info *lli = ll_i2info(dentry->d_parent->d_inode);
++ struct ll_dentry_data *ldd = ll_d2d(dentry);
++
++ /* not the same process, don't mark */
++ if (lli->lli_opendir_pid != cfs_curproc_pid())
++ return;
++
++ spin_lock(&lli->lli_lock);
++ if (likely(lli->lli_sai != NULL && ldd != NULL))
++ ldd->lld_sa_generation = lli->lli_sai->sai_generation;
++ spin_unlock(&lli->lli_lock);
++}
++
++static inline
++int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(dir);
++ struct ll_inode_info *lli = ll_i2info(dir);
++ struct ll_dentry_data *ldd = ll_d2d(*dentryp);
++
++ if (sbi->ll_sa_max == 0)
++ return -ENOTSUPP;
++
++ /* not the same process, don't statahead */
++ if (lli->lli_opendir_pid != cfs_curproc_pid())
++ return -EBADF;
++
++ /*
++ * When "ls" a dentry, the system trigger more than once "revalidate" or
++ * "lookup", for "getattr", for "getxattr", and maybe for others.
++ * Under patchless client mode, the operation intent is not accurate,
++ * it maybe misguide the statahead thread. For example:
++ * The "revalidate" call for "getattr" and "getxattr" of a dentry maybe
++ * have the same operation intent -- "IT_GETATTR".
++ * In fact, one dentry should has only one chance to interact with the
++ * statahead thread, otherwise the statahead windows will be confused.
++ * The solution is as following:
++ * Assign "lld_sa_generation" with "sai_generation" when a dentry
++ * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR"
++ * will bypass interacting with statahead thread for checking:
++ * "lld_sa_generation == lli_sai->sai_generation"
++ */
++ if (ldd && lli->lli_sai &&
++ ldd->lld_sa_generation == lli->lli_sai->sai_generation)
++ return -EAGAIN;
++
++ return do_statahead_enter(dir, dentryp, lookup);
++}
++
++static void inline ll_dops_init(struct dentry *de, int block)
++{
++ struct ll_dentry_data *lld = ll_d2d(de);
++
++ if (lld == NULL && block != 0) {
++ ll_set_dd(de);
++ lld = ll_d2d(de);
++ }
++
++ if (lld != NULL)
++ lld->lld_sa_generation = 0;
++
++ de->d_op = &ll_d_ops;
++}
++
++/* llite ioctl registration support routines */
++#ifdef __KERNEL__
++enum llioc_iter {
++ LLIOC_CONT = 0,
++ LLIOC_STOP
++};
++
++#define LLIOC_MAX_CMD 256
++
++/*
++ * Rules to write a callback function:
++ *
++ * Parameters:
++ * @magic: The dynamic ioctl call routine will feed this value with the
++ * pointer returned by ll_iocontrol_register. Callback functions should
++ * use this data to check for a potential collision of ioctl cmds. If a
++ * collision is found, the callback function should return LLIOC_CONT.
++ * @rcp: The result of the ioctl command.
++ *
++ * Return values:
++ * If @magic matches the pointer returned by ll_iocontrol_register, the
++ * callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
++ */
++typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
++ struct file *file, unsigned int cmd, unsigned long arg,
++ void *magic, int *rcp);
++
++enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg, int *rcp);
++
++/* export functions */
++/* Register an ioctl block dynamically for a regular file.
++ *
++ * @cmd: the array of ioctl commands
++ * @count: number of commands in the @cmd array
++ * @cb: callback function; it will be called if an ioctl command is found to
++ * belong to the command list @cmd.
++ *
++ * Return value:
++ * A magic pointer is returned on success;
++ * otherwise, NULL is returned.
++ */
++void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
++void ll_iocontrol_unregister(void *magic);
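
To make the LLIOC_CONT/LLIOC_STOP contract above concrete, here is a small user-space sketch of a registration table and dispatch loop. The entry struct, ioc_call() and my_cb() are hypothetical stand-ins; only the return-value convention is taken from the header:

#include <stdio.h>

enum llioc_iter { LLIOC_CONT = 0, LLIOC_STOP };

typedef enum llioc_iter (*cb_t)(unsigned int cmd, void *magic, int *rcp);

struct ioc_entry {
        void        *magic;   /* cookie handed back at registration time */
        unsigned int cmd;     /* one command, instead of an array        */
        cb_t         cb;
};

static struct ioc_entry my_entry;

static enum llioc_iter my_cb(unsigned int cmd, void *magic, int *rcp)
{
        if (magic != my_entry.magic)  /* cmd collision: not our registration */
                return LLIOC_CONT;
        *rcp = 0;                     /* command handled, result 0 */
        return LLIOC_STOP;
}

/* walk the registered entries until one claims the command */
static int ioc_call(struct ioc_entry *e, int n, unsigned int cmd)
{
        int rc = -25;   /* -ENOTTY if nobody claims the command */

        for (int i = 0; i < n; i++)
                if (e[i].cmd == cmd &&
                    e[i].cb(cmd, e[i].magic, &rc) == LLIOC_STOP)
                        break;
        return rc;
}

int main(void)
{
        my_entry = (struct ioc_entry){ &my_entry, 0x1234, my_cb };
        printf("rc = %d\n", ioc_call(&my_entry, 1, 0x1234)); /* rc = 0 */
        return 0;
}
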
++
++#endif
++
++#endif /* LLITE_INTERNAL_H */
+diff -urNad lustre~/lustre/llite/llite_lib.c lustre/lustre/llite/llite_lib.c
+--- lustre~/lustre/llite/llite_lib.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/llite_lib.c 2009-03-12 11:02:51.000000000 +0100
+@@ -1346,7 +1346,7 @@
+ rc = vmtruncate(inode, new_size);
+ clear_bit(LLI_F_SRVLOCK, &lli->lli_flags);
+ if (rc != 0) {
+- LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
++ LASSERT(SEM_COUNT(&lli->lli_size_sem) <= 0);
+ ll_inode_size_unlock(inode, 0);
+ }
+ }
+diff -urNad lustre~/lustre/llite/llite_lib.c.orig lustre/lustre/llite/llite_lib.c.orig
+--- lustre~/lustre/llite/llite_lib.c.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/llite/llite_lib.c.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,2232 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ *
++ * lustre/llite/llite_lib.c
++ *
++ * Lustre Light Super operations
++ */
++
++#define DEBUG_SUBSYSTEM S_LLITE
++
++#include <linux/module.h>
++#include <linux/types.h>
++#include <linux/random.h>
++#include <linux/version.h>
++#include <linux/mm.h>
++
++#include <lustre_lite.h>
++#include <lustre_ha.h>
++#include <lustre_dlm.h>
++#include <lprocfs_status.h>
++#include <lustre_disk.h>
++#include <lustre_param.h>
++#include <lustre_cache.h>
++#include "llite_internal.h"
++
++cfs_mem_cache_t *ll_file_data_slab;
++
++LIST_HEAD(ll_super_blocks);
++spinlock_t ll_sb_lock = SPIN_LOCK_UNLOCKED;
++
++extern struct address_space_operations ll_aops;
++extern struct address_space_operations ll_dir_aops;
++
++#ifndef log2
++#define log2(n) ffz(~(n))
++#endif
++
++
++static struct ll_sb_info *ll_init_sbi(void)
++{
++ struct ll_sb_info *sbi = NULL;
++ unsigned long pages;
++ struct sysinfo si;
++ class_uuid_t uuid;
++ int i;
++ ENTRY;
++
++ OBD_ALLOC(sbi, sizeof(*sbi));
++ if (!sbi)
++ RETURN(NULL);
++
++ spin_lock_init(&sbi->ll_lock);
++ spin_lock_init(&sbi->ll_lco.lco_lock);
++ spin_lock_init(&sbi->ll_pp_extent_lock);
++ spin_lock_init(&sbi->ll_process_lock);
++ sbi->ll_rw_stats_on = 0;
++ INIT_LIST_HEAD(&sbi->ll_pglist);
++
++ si_meminfo(&si);
++ pages = si.totalram - si.totalhigh;
++ if (pages >> (20 - CFS_PAGE_SHIFT) < 512) {
++#ifdef HAVE_BGL_SUPPORT
++ sbi->ll_async_page_max = pages / 4;
++#else
++ sbi->ll_async_page_max = pages / 2;
++#endif
++ } else {
++ sbi->ll_async_page_max = (pages / 4) * 3;
++ }
++ sbi->ll_ra_info.ra_max_pages = min(pages / 32,
++ SBI_DEFAULT_READAHEAD_MAX);
++ sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
++ SBI_DEFAULT_READAHEAD_WHOLE_MAX;
++ sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS;
++ sbi->ll_lockless_truncate_enable = SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE;
++ INIT_LIST_HEAD(&sbi->ll_conn_chain);
++ INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
++
++ ll_generate_random_uuid(uuid);
++ class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
++ CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
++
++ spin_lock(&ll_sb_lock);
++ list_add_tail(&sbi->ll_list, &ll_super_blocks);
++ spin_unlock(&ll_sb_lock);
++
++#ifdef ENABLE_CHECKSUM
++ sbi->ll_flags |= LL_SBI_DATA_CHECKSUM;
++#endif
++#ifdef ENABLE_LLITE_CHECKSUM
++ sbi->ll_flags |= LL_SBI_LLITE_CHECKSUM;
++#endif
++
++#ifdef HAVE_LRU_RESIZE_SUPPORT
++ sbi->ll_flags |= LL_SBI_LRU_RESIZE;
++#endif
++
++#ifdef HAVE_EXPORT___IGET
++ INIT_LIST_HEAD(&sbi->ll_deathrow);
++ spin_lock_init(&sbi->ll_deathrow_lock);
++#endif
++ for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
++ spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].pp_r_hist.oh_lock);
++ spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].pp_w_hist.oh_lock);
++ }
++
++ /* metadata statahead is enabled by default */
++ sbi->ll_sa_max = LL_SA_RPC_DEF;
++
++ RETURN(sbi);
++}
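
The cache-sizing heuristic in ll_init_sbi() above can be checked with a standalone demo. The page shift and memory size below are made-up example values; hosts with less than 512 MiB of low memory cap the async page cache at half of it (a quarter with BGL support), while larger hosts use three quarters:

#include <stdio.h>

int main(void)
{
        const int page_shift = 12;             /* assume 4 KiB pages */
        unsigned long pages = 65536;           /* 256 MiB of low memory */
        unsigned long max;

        if (pages >> (20 - page_shift) < 512)  /* low memory in MiB < 512? */
                max = pages / 2;               /* small host: half */
        else
                max = (pages / 4) * 3;         /* big host: 3/4 */

        printf("async_page_max = %lu pages\n", max); /* 32768 */
        return 0;
}
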
++
++void ll_free_sbi(struct super_block *sb)
++{
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ ENTRY;
++
++ if (sbi != NULL) {
++ spin_lock(&ll_sb_lock);
++ list_del(&sbi->ll_list);
++ spin_unlock(&ll_sb_lock);
++ OBD_FREE(sbi, sizeof(*sbi));
++ }
++ EXIT;
++}
++
++static struct dentry_operations ll_d_root_ops = {
++#ifdef DCACHE_LUSTRE_INVALID
++ .d_compare = ll_dcompare,
++#endif
++};
++
++static int client_common_fill_super(struct super_block *sb,
++ char *mdc, char *osc)
++{
++ struct inode *root = 0;
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ struct obd_device *obd;
++ struct ll_fid rootfid;
++ struct obd_statfs osfs;
++ struct ptlrpc_request *request = NULL;
++ struct lustre_handle osc_conn = {0, };
++ struct lustre_handle mdc_conn = {0, };
++ struct lustre_md md;
++ struct obd_connect_data *data = NULL;
++ int err, checksum;
++ ENTRY;
++
++ obd = class_name2obd(mdc);
++ if (!obd) {
++ CERROR("MDC %s: not setup or attached\n", mdc);
++ RETURN(-EINVAL);
++ }
++
++ OBD_ALLOC(data, sizeof(*data));
++ if (data == NULL)
++ RETURN(-ENOMEM);
++
++ if (proc_lustre_fs_root) {
++ err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
++ osc, mdc);
++ if (err < 0)
++ CERROR("could not register mount in /proc/fs/lustre\n");
++ }
++
++ /* indicate the features supported by this client */
++ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS |
++ OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH |
++ OBD_CONNECT_CANCELSET | OBD_CONNECT_AT;
++#ifdef HAVE_LRU_RESIZE_SUPPORT
++ if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
++ data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
++#endif
++#ifdef CONFIG_FS_POSIX_ACL
++ data->ocd_connect_flags |= OBD_CONNECT_ACL;
++#endif
++ data->ocd_ibits_known = MDS_INODELOCK_FULL;
++ data->ocd_version = LUSTRE_VERSION_CODE;
++
++ if (sb->s_flags & MS_RDONLY)
++ data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
++ if (sbi->ll_flags & LL_SBI_USER_XATTR)
++ data->ocd_connect_flags |= OBD_CONNECT_XATTR;
++
++#ifdef HAVE_MS_FLOCK_LOCK
++ /* force vfs to use lustre handler for flock() calls - bug 10743 */
++ sb->s_flags |= MS_FLOCK_LOCK;
++#endif
++
++ if (sbi->ll_flags & LL_SBI_FLOCK)
++ sbi->ll_fop = &ll_file_operations_flock;
++ else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
++ sbi->ll_fop = &ll_file_operations;
++ else
++ sbi->ll_fop = &ll_file_operations_noflock;
++
++
++ err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data, &sbi->ll_mdc_exp);
++ if (err == -EBUSY) {
++ LCONSOLE_ERROR_MSG(0x14f, "An MDT (mdc %s) is performing "
++ "recovery, of which this client is not a "
++ "part. Please wait for recovery to complete,"
++ " abort, or time out.\n", mdc);
++ GOTO(out, err);
++ } else if (err) {
++ CERROR("cannot connect to %s: rc = %d\n", mdc, err);
++ GOTO(out, err);
++ }
++
++ err = obd_statfs(obd, &osfs, cfs_time_current_64() - HZ, 0);
++ if (err)
++ GOTO(out_mdc, err);
++
++ /* MDC connect is surely finished by now because we actually sent
++ * a statfs RPC, otherwise obd_connect() is asynchronous. */
++ *data = class_exp2cliimp(sbi->ll_mdc_exp)->imp_connect_data;
++
++ LASSERT(osfs.os_bsize);
++ sb->s_blocksize = osfs.os_bsize;
++ sb->s_blocksize_bits = log2(osfs.os_bsize);
++ sb->s_magic = LL_SUPER_MAGIC;
++
++ /* for bug 11559. in $LINUX/fs/read_write.c, function do_sendfile():
++ * retval = in_file->f_op->sendfile(...);
++ * if (*ppos > max)
++ * retval = -EOVERFLOW;
++ *
++ * it will check if *ppos is greater than max. However, max equals
++ * s_maxbytes, which is a negative integer on an x86_64 box since loff_t
++ * has been defined as a signed long long integer in the linux kernel. */
++#if BITS_PER_LONG == 64
++ sb->s_maxbytes = PAGE_CACHE_MAXBYTES >> 1;
++#else
++ sb->s_maxbytes = PAGE_CACHE_MAXBYTES;
++#endif
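
A quick demonstration of why the 64-bit halving above is needed. The constant here is a stand-in for PAGE_CACHE_MAXBYTES with the sign bit set, not the real kernel definition; once the top bit is occupied, a signed loff_t comparison such as "*ppos > max" sees a negative max, and shifting right by one keeps the value positive:

#include <stdio.h>

int main(void)
{
        unsigned long long raw = ~0ULL;            /* all 64 bits set */
        long long as_loff = (long long)raw;        /* negative! */
        long long halved = (long long)(raw >> 1);  /* positive */

        printf("as_loff = %lld\n", as_loff);  /* -1 */
        printf("halved  = %lld\n", halved);   /* 9223372036854775807 */
        return 0;
}
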
++ sbi->ll_namelen = osfs.os_namelen;
++ sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
++
++ if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
++ !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
++ LCONSOLE_INFO("Disabling user_xattr feature because "
++ "it is not supported on the server\n");
++ sbi->ll_flags &= ~LL_SBI_USER_XATTR;
++ }
++
++ if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
++#ifdef MS_POSIXACL
++ sb->s_flags |= MS_POSIXACL;
++#endif
++ sbi->ll_flags |= LL_SBI_ACL;
++ } else
++ sbi->ll_flags &= ~LL_SBI_ACL;
++
++ if (data->ocd_connect_flags & OBD_CONNECT_JOIN)
++ sbi->ll_flags |= LL_SBI_JOIN;
++
++ obd = class_name2obd(osc);
++ if (!obd) {
++ CERROR("OSC %s: not setup or attached\n", osc);
++ GOTO(out_mdc, err = -ENODEV);
++ }
++
++ data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT |
++ OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
++ OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT |
++ OBD_CONNECT_TRUNCLOCK;
++
++ if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
++ /* OBD_CONNECT_CKSUM should always be set, even if checksums are
++ * disabled by default, because it can still be enabled on the
++ * fly via /proc. As a consequence, we still need to come to an
++ * agreement on the supported algorithms at connect time */
++ data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
++
++ if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
++ data->ocd_cksum_types = OBD_CKSUM_ADLER;
++ else
++ /* send the list of supported checksum types */
++ data->ocd_cksum_types = OBD_CKSUM_ALL;
++ }
++
++#ifdef HAVE_LRU_RESIZE_SUPPORT
++ if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
++ data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
++#endif
++
++ CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
++ "ocd_grant: %d\n", data->ocd_connect_flags,
++ data->ocd_version, data->ocd_grant);
++
++ obd->obd_upcall.onu_owner = &sbi->ll_lco;
++ obd->obd_upcall.onu_upcall = ll_ocd_update;
++ data->ocd_brw_size = PTLRPC_MAX_BRW_PAGES << CFS_PAGE_SHIFT;
++
++ obd_register_lock_cancel_cb(obd, ll_extent_lock_cancel_cb);
++ obd_register_page_removal_cb(obd, ll_page_removal_cb, ll_pin_extent_cb);
++
++
++ err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, data, &sbi->ll_osc_exp);
++ if (err == -EBUSY) {
++ LCONSOLE_ERROR_MSG(0x150, "An OST (osc %s) is performing "
++ "recovery, of which this client is not a "
++ "part. Please wait for recovery to "
++ "complete, abort, or time out.\n", osc);
++ GOTO(out, err); // need clear cb?
++ } else if (err) {
++ CERROR("cannot connect to %s: rc = %d\n", osc, err);
++ GOTO(out_cb, err);
++ }
++ spin_lock(&sbi->ll_lco.lco_lock);
++ sbi->ll_lco.lco_flags = data->ocd_connect_flags;
++ sbi->ll_lco.lco_mdc_exp = sbi->ll_mdc_exp;
++ sbi->ll_lco.lco_osc_exp = sbi->ll_osc_exp;
++ spin_unlock(&sbi->ll_lco.lco_lock);
++
++ err = mdc_init_ea_size(sbi->ll_mdc_exp, sbi->ll_osc_exp);
++ if (err) {
++ CERROR("cannot set max EA and cookie sizes: rc = %d\n", err);
++ GOTO(out_osc, err);
++ }
++
++ err = obd_prep_async_page(sbi->ll_osc_exp, NULL, NULL, NULL,
++ 0, NULL, NULL, NULL, 0, NULL);
++ if (err < 0) {
++ LCONSOLE_ERROR_MSG(0x151, "There are no OSTs in this "
++ "filesystem. There must be at least one "
++ "active OST for a client to start.\n");
++ GOTO(out_osc, err);
++ }
++
++ if (!ll_async_page_slab) {
++ ll_async_page_slab_size =
++ size_round(sizeof(struct ll_async_page)) + err;
++ ll_async_page_slab = cfs_mem_cache_create("ll_async_page",
++ ll_async_page_slab_size,
++ 0, 0);
++ if (!ll_async_page_slab)
++ GOTO(out_osc, err = -ENOMEM);
++ }
++
++ err = mdc_getstatus(sbi->ll_mdc_exp, &rootfid);
++ if (err) {
++ CERROR("cannot mds_connect: rc = %d\n", err);
++ GOTO(out_osc, err);
++ }
++ CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id);
++ sbi->ll_rootino = rootfid.id;
++
++ sb->s_op = &lustre_super_operations;
++#if THREAD_SIZE >= 8192
++ /* Disable the NFS export because of stack overflow
++ * when THREAD_SIZE < 8192. Please refer to 17630. */
++ sb->s_export_op = &lustre_export_operations;
++#endif
++
++ /* make root inode
++ * XXX: move this to after cbd setup? */
++ err = mdc_getattr(sbi->ll_mdc_exp, &rootfid,
++ OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS |
++ (sbi->ll_flags & LL_SBI_ACL ? OBD_MD_FLACL : 0),
++ 0, &request);
++ if (err) {
++ CERROR("mdc_getattr failed for root: rc = %d\n", err);
++ GOTO(out_osc, err);
++ }
++
++ err = mdc_req2lustre_md(request, REPLY_REC_OFF, sbi->ll_osc_exp, &md);
++ if (err) {
++ CERROR("failed to understand root inode md: rc = %d\n",err);
++ ptlrpc_req_finished (request);
++ GOTO(out_osc, err);
++ }
++
++ LASSERT(sbi->ll_rootino != 0);
++ root = ll_iget(sb, sbi->ll_rootino, &md);
++
++ ptlrpc_req_finished(request);
++
++ if (root == NULL || is_bad_inode(root)) {
++ mdc_free_lustre_md(sbi->ll_osc_exp, &md);
++ CERROR("lustre_lite: bad iget4 for root\n");
++ GOTO(out_root, err = -EBADF);
++ }
++
++ err = ll_close_thread_start(&sbi->ll_lcq);
++ if (err) {
++ CERROR("cannot start close thread: rc %d\n", err);
++ GOTO(out_root, err);
++ }
++
++ checksum = sbi->ll_flags & LL_SBI_DATA_CHECKSUM;
++ err = obd_set_info_async(sbi->ll_osc_exp, sizeof(KEY_CHECKSUM),
++ KEY_CHECKSUM, sizeof(checksum),
++ &checksum, NULL);
++
++ /* making vm readahead 0 for 2.4.x. In the case of 2.6.x,
++ backing dev info assigned to inode mapping is used for
++ determining maximal readahead. */
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \
++ !defined(KERNEL_HAS_AS_MAX_READAHEAD)
++ /* bug 2805 - set VM readahead to zero */
++ vm_max_readahead = vm_min_readahead = 0;
++#endif
++
++ sb->s_root = d_alloc_root(root);
++ if (data != NULL)
++ OBD_FREE(data, sizeof(*data));
++ sb->s_root->d_op = &ll_d_root_ops;
++
++ sbi->ll_sdev_orig = sb->s_dev;
++ /* We set sb->s_dev equal on all lustre clients in order to support
++ * NFS export clustering. NFSD requires that the FSID be the same
++ * on all clients. */
++ /* s_dev is also used in lt_compare() to compare two fs, but that is
++ * only a node-local comparison. */
++ sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid,
++ strlen(sbi2mdc(sbi)->cl_target_uuid.uuid));
++
++ RETURN(err);
++
++out_root:
++ if (root)
++ iput(root);
++out_osc:
++ obd_disconnect(sbi->ll_osc_exp);
++ sbi->ll_osc_exp = NULL;
++out_cb:
++ obd = class_name2obd(osc);
++ obd_unregister_lock_cancel_cb(obd, ll_extent_lock_cancel_cb);
++ obd_unregister_page_removal_cb(obd, ll_page_removal_cb);
++out_mdc:
++ obd_disconnect(sbi->ll_mdc_exp);
++ sbi->ll_mdc_exp = NULL;
++out:
++ if (data != NULL)
++ OBD_FREE(data, sizeof(*data));
++ lprocfs_unregister_mountpoint(sbi);
++ RETURN(err);
++}
++
++int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
++{
++ int size, rc;
++
++ *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL);
++ size = sizeof(int);
++ rc = obd_get_info(sbi->ll_mdc_exp, sizeof(KEY_MAX_EASIZE),
++ KEY_MAX_EASIZE, &size, lmmsize, NULL);
++ if (rc)
++ CERROR("Get max mdsize error rc %d \n", rc);
++
++ RETURN(rc);
++}
++
++void ll_dump_inode(struct inode *inode)
++{
++ struct list_head *tmp;
++ int dentry_count = 0;
++
++ LASSERT(inode != NULL);
++
++ list_for_each(tmp, &inode->i_dentry)
++ dentry_count++;
++
++ CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
++ inode, ll_i2mdcexp(inode)->exp_obd->obd_name, inode->i_ino,
++ inode->i_mode, atomic_read(&inode->i_count), dentry_count);
++}
++
++void lustre_dump_dentry(struct dentry *dentry, int recur)
++{
++ struct list_head *tmp;
++ int subdirs = 0;
++
++ LASSERT(dentry != NULL);
++
++ list_for_each(tmp, &dentry->d_subdirs)
++ subdirs++;
++
++ CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
++ " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
++ dentry->d_name.len, dentry->d_name.name,
++ dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
++ dentry->d_parent, dentry->d_inode, atomic_read(&dentry->d_count),
++ dentry->d_flags, dentry->d_fsdata, subdirs);
++ if (dentry->d_inode != NULL)
++ ll_dump_inode(dentry->d_inode);
++
++ if (recur == 0)
++ return;
++
++ list_for_each(tmp, &dentry->d_subdirs) {
++ struct dentry *d = list_entry(tmp, struct dentry, d_child);
++ lustre_dump_dentry(d, recur - 1);
++ }
++}
++
++#ifdef HAVE_EXPORT___IGET
++static void prune_dir_dentries(struct inode *inode)
++{
++ struct dentry *dentry, *prev = NULL;
++
++ /* due to lustre-specific logic, a directory
++ * can have several dentries - a bug from the VFS POV */
++restart:
++ spin_lock(&dcache_lock);
++ if (!list_empty(&inode->i_dentry)) {
++ dentry = list_entry(inode->i_dentry.prev,
++ struct dentry, d_alias);
++ /* in order to prevent infinite loops we
++ * break if previous dentry is busy */
++ if (dentry != prev) {
++ prev = dentry;
++ dget_locked(dentry);
++ spin_unlock(&dcache_lock);
++
++ /* try to kill all child dentries */
++ shrink_dcache_parent(dentry);
++ dput(dentry);
++
++ /* now try to get rid of current dentry */
++ d_prune_aliases(inode);
++ goto restart;
++ }
++ }
++ spin_unlock(&dcache_lock);
++}
++
++static void prune_deathrow_one(struct ll_inode_info *lli)
++{
++ struct inode *inode = ll_info2i(lli);
++
++ /* first, try to drop any dentries - they hold a ref on the inode */
++ if (S_ISDIR(inode->i_mode))
++ prune_dir_dentries(inode);
++ else
++ d_prune_aliases(inode);
++
++
++ /* if somebody still uses it, leave it */
++ LASSERT(atomic_read(&inode->i_count) > 0);
++ if (atomic_read(&inode->i_count) > 1)
++ goto out;
++
++ CDEBUG(D_INODE, "inode %lu/%u(%d) looks like a good candidate for prune\n",
++ inode->i_ino,inode->i_generation, atomic_read(&inode->i_count));
++
++ /* seems nobody uses it anymore */
++ inode->i_nlink = 0;
++
++out:
++ iput(inode);
++ return;
++}
++
++static void prune_deathrow(struct ll_sb_info *sbi, int try)
++{
++ struct ll_inode_info *lli;
++ int empty;
++
++ do {
++ if (need_resched() && try)
++ break;
++
++ if (try) {
++ if (!spin_trylock(&sbi->ll_deathrow_lock))
++ break;
++ } else {
++ spin_lock(&sbi->ll_deathrow_lock);
++ }
++
++ empty = 1;
++ lli = NULL;
++ if (!list_empty(&sbi->ll_deathrow)) {
++ lli = list_entry(sbi->ll_deathrow.next,
++ struct ll_inode_info,
++ lli_dead_list);
++ list_del_init(&lli->lli_dead_list);
++ if (!list_empty(&sbi->ll_deathrow))
++ empty = 0;
++ }
++ spin_unlock(&sbi->ll_deathrow_lock);
++
++ if (lli)
++ prune_deathrow_one(lli);
++
++ } while (empty == 0);
++}
++#else /* !HAVE_EXPORT___IGET */
++#define prune_deathrow(sbi, try) do {} while (0)
++#endif /* HAVE_EXPORT___IGET */
++
++void client_common_put_super(struct super_block *sb)
++{
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ ENTRY;
++
++ ll_close_thread_shutdown(sbi->ll_lcq);
++
++ lprocfs_unregister_mountpoint(sbi);
++
++ /* destroy inodes in deathrow */
++ prune_deathrow(sbi, 0);
++
++ list_del(&sbi->ll_conn_chain);
++
++ /* callbacks are cleared after disconnecting each target */
++ obd_disconnect(sbi->ll_osc_exp);
++ sbi->ll_osc_exp = NULL;
++
++ obd_disconnect(sbi->ll_mdc_exp);
++ sbi->ll_mdc_exp = NULL;
++
++ EXIT;
++}
++
++void ll_kill_super(struct super_block *sb)
++{
++ struct ll_sb_info *sbi;
++
++ ENTRY;
++
++ /* sb not initialized? */
++ if (!(sb->s_flags & MS_ACTIVE))
++ return;
++
++ sbi = ll_s2sbi(sb);
++ /* we need to restore the s_dev that was changed for clustered NFS before
++ * put_super, because new kernels cache s_dev; changing sb->s_dev in
++ * put_super does not affect the real device removal */
++ if (sbi)
++ sb->s_dev = sbi->ll_sdev_orig;
++ EXIT;
++}
++
++char *ll_read_opt(const char *opt, char *data)
++{
++ char *value;
++ char *retval;
++ ENTRY;
++
++ CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
++ if (strncmp(opt, data, strlen(opt)))
++ RETURN(NULL);
++ if ((value = strchr(data, '=')) == NULL)
++ RETURN(NULL);
++
++ value++;
++ OBD_ALLOC(retval, strlen(value) + 1);
++ if (!retval) {
++ CERROR("out of memory!\n");
++ RETURN(NULL);
++ }
++
++ memcpy(retval, value, strlen(value)+1);
++ CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval);
++ RETURN(retval);
++}
++
++static inline int ll_set_opt(const char *opt, char *data, int fl)
++{
++ if (strncmp(opt, data, strlen(opt)) != 0)
++ return(0);
++ else
++ return(fl);
++}
++
++/* non-client-specific mount options are parsed in lmd_parse */
++static int ll_options(char *options, int *flags)
++{
++ int tmp;
++ char *s1 = options, *s2;
++ ENTRY;
++
++ if (!options)
++ RETURN(0);
++
++ CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
++
++ while (*s1) {
++ CDEBUG(D_SUPER, "next opt=%s\n", s1);
++ tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
++ if (tmp) {
++ *flags &= ~tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
++ if (tmp) {
++ *flags &= ~tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
++ if (tmp) {
++ /* Ignore deprecated mount option. The client will
++ * always try to mount with ACL support; whether it
++ * is used depends on whether the server supports it. */
++ LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
++ "mount option 'acl'.\n");
++ goto next;
++ }
++ tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
++ if (tmp) {
++ LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
++ "mount option 'noacl'.\n");
++ goto next;
++ }
++
++ tmp = ll_set_opt("checksum", s1, LL_SBI_DATA_CHECKSUM);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("nochecksum", s1, LL_SBI_DATA_CHECKSUM);
++ if (tmp) {
++ *flags &= ~tmp;
++ goto next;
++ }
++
++ tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
++ if (tmp) {
++ *flags |= tmp;
++ goto next;
++ }
++ tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
++ if (tmp) {
++ *flags &= ~tmp;
++ goto next;
++ }
++ LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
++ s1);
++ RETURN(-EINVAL);
++
++next:
++ /* Find next opt */
++ s2 = strchr(s1, ',');
++ if (s2 == NULL)
++ break;
++ s1 = s2 + 1;
++ }
++ RETURN(0);
++}
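
The prefix-matching in ll_set_opt()/ll_options() above can be modeled in user space. The flag values below are illustrative (not the real LL_SBI_* constants) and the parser is simplified to strtok() over comma-separated tokens, but the set/clear behaviour for "no"-prefixed variants matches the patch:

#include <stdio.h>
#include <string.h>

#define SBI_FLOCK      0x01   /* illustrative values, not the real ones */
#define SBI_USER_XATTR 0x02

static int set_opt(const char *opt, const char *data, int fl)
{
        return strncmp(opt, data, strlen(opt)) ? 0 : fl;
}

static int parse(char *options, int *flags)
{
        for (char *s = strtok(options, ","); s; s = strtok(NULL, ",")) {
                int tmp;
                if ((tmp = set_opt("flock", s, SBI_FLOCK)))
                        *flags |= tmp;
                else if ((tmp = set_opt("nouser_xattr", s, SBI_USER_XATTR)))
                        *flags &= ~tmp;   /* "no" variant clears the flag */
                else if ((tmp = set_opt("user_xattr", s, SBI_USER_XATTR)))
                        *flags |= tmp;
                else
                        return -22;       /* -EINVAL: unknown option */
        }
        return 0;
}

int main(void)
{
        char opts[] = "flock,nouser_xattr";
        int flags = SBI_USER_XATTR;       /* pretend it was on by default */

        if (parse(opts, &flags) == 0)
                printf("flags = 0x%x\n", flags); /* 0x1: flock on, xattr off */
        return 0;
}
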
++
++void ll_lli_init(struct ll_inode_info *lli)
++{
++ lli->lli_inode_magic = LLI_INODE_MAGIC;
++ sema_init(&lli->lli_size_sem, 1);
++ sema_init(&lli->lli_write_sem, 1);
++ lli->lli_flags = 0;
++ lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
++ spin_lock_init(&lli->lli_lock);
++ sema_init(&lli->lli_och_sem, 1);
++ lli->lli_mds_read_och = lli->lli_mds_write_och = NULL;
++ lli->lli_mds_exec_och = NULL;
++ lli->lli_open_fd_read_count = lli->lli_open_fd_write_count = 0;
++ lli->lli_open_fd_exec_count = 0;
++ INIT_LIST_HEAD(&lli->lli_dead_list);
++#ifdef HAVE_CLOSE_THREAD
++ INIT_LIST_HEAD(&lli->lli_pending_write_llaps);
++#endif
++}
++
++/* COMPAT_146 */
++#define MDCDEV "mdc_dev"
++static int old_lustre_process_log(struct super_block *sb, char *newprofile,
++ struct config_llog_instance *cfg)
++{
++ struct lustre_sb_info *lsi = s2lsi(sb);
++ struct obd_device *obd;
++ struct lustre_handle mdc_conn = {0, };
++ struct obd_export *exp;
++ char *ptr, *mdt, *profile;
++ char niduuid[10] = "mdtnid0";
++ class_uuid_t uuid;
++ struct obd_uuid mdc_uuid;
++ struct llog_ctxt *ctxt;
++ struct obd_connect_data ocd = { 0 };
++ lnet_nid_t nid;
++ int i, rc = 0, recov_bk = 1, failnodes = 0;
++ ENTRY;
++
++ ll_generate_random_uuid(uuid);
++ class_uuid_unparse(uuid, &mdc_uuid);
++ CDEBUG(D_HA, "generated uuid: %s\n", mdc_uuid.uuid);
++
++ /* Figure out the old mdt and profile name from new-style profile
++ ("lustre" from "mds/lustre-client") */
++ mdt = newprofile;
++ profile = strchr(mdt, '/');
++ if (profile == NULL) {
++ CDEBUG(D_CONFIG, "Can't find MDT name in %s\n", newprofile);
++ GOTO(out, rc = -EINVAL);
++ }
++ *profile = '\0';
++ profile++;
++ ptr = strrchr(profile, '-');
++ if (ptr == NULL) {
++ CDEBUG(D_CONFIG, "Can't find client name in %s\n", newprofile);
++ GOTO(out, rc = -EINVAL);
++ }
++ *ptr = '\0';
++
++ LCONSOLE_WARN("This looks like an old mount command; I will try to "
++ "contact MDT '%s' for profile '%s'\n", mdt, profile);
++
++ /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
++ i = 0;
++ ptr = lsi->lsi_lmd->lmd_dev;
++ while (class_parse_nid(ptr, &nid, &ptr) == 0) {
++ rc = do_lcfg(MDCDEV, nid, LCFG_ADD_UUID, niduuid, 0,0,0);
++ i++;
++ /* Stop at the first failover nid */
++ if (*ptr == ':')
++ break;
++ }
++ if (i == 0) {
++ CERROR("No valid MDT nids found.\n");
++ GOTO(out, rc = -EINVAL);
++ }
++ failnodes++;
++
++ rc = do_lcfg(MDCDEV, 0, LCFG_ATTACH, LUSTRE_MDC_NAME,mdc_uuid.uuid,0,0);
++ if (rc < 0)
++ GOTO(out_del_uuid, rc);
++
++ rc = do_lcfg(MDCDEV, 0, LCFG_SETUP, mdt, niduuid, 0, 0);
++ if (rc < 0) {
++ LCONSOLE_ERROR_MSG(0x153, "I couldn't establish a connection "
++ "with the MDT. Check that the MDT host NID "
++ "is correct and the networks are up.\n");
++ GOTO(out_detach, rc);
++ }
++
++ obd = class_name2obd(MDCDEV);
++ if (obd == NULL)
++ GOTO(out_cleanup, rc = -EINVAL);
++
++ /* Add any failover nids */
++ while (*ptr == ':') {
++ /* New failover node */
++ sprintf(niduuid, "mdtnid%d", failnodes);
++ i = 0;
++ while (class_parse_nid(ptr, &nid, &ptr) == 0) {
++ i++;
++ rc = do_lcfg(MDCDEV, nid, LCFG_ADD_UUID, niduuid,0,0,0);
++ if (rc)
++ CERROR("Add uuid for %s failed %d\n",
++ libcfs_nid2str(nid), rc);
++ if (*ptr == ':')
++ break;
++ }
++ if (i > 0) {
++ rc = do_lcfg(MDCDEV, 0, LCFG_ADD_CONN, niduuid, 0, 0,0);
++ if (rc)
++ CERROR("Add conn for %s failed %d\n",
++ libcfs_nid2str(nid), rc);
++ failnodes++;
++ } else {
++ /* at ":/fsname" */
++ break;
++ }
++ }
++
++ /* Try all connections, but only once. */
++ rc = obd_set_info_async(obd->obd_self_export,
++ sizeof(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP,
++ sizeof(recov_bk), &recov_bk, NULL);
++ if (rc)
++ GOTO(out_cleanup, rc);
++
++ /* If we don't have this then an ACL MDS will refuse the connection */
++ ocd.ocd_connect_flags = OBD_CONNECT_ACL;
++
++ rc = obd_connect(&mdc_conn, obd, &mdc_uuid, &ocd, &exp);
++ if (rc) {
++ CERROR("cannot connect to %s: rc = %d\n", mdt, rc);
++ GOTO(out_cleanup, rc);
++ }
++
++ ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
++
++ cfg->cfg_flags |= CFG_F_COMPAT146;
++
++#if 1
++ rc = class_config_parse_llog(ctxt, profile, cfg);
++#else
++ /*
++ * For debugging, it's useful to just dump the log
++ */
++ rc = class_config_dump_llog(ctxt, profile, cfg);
++#endif
++ llog_ctxt_put(ctxt);
++ switch (rc) {
++ case 0: {
++ /* Set the caller's profile name to the old-style */
++ memcpy(newprofile, profile, strlen(profile) + 1);
++ break;
++ }
++ case -EINVAL:
++ LCONSOLE_ERROR_MSG(0x154, "%s: The configuration '%s' could not"
++ " be read from the MDT '%s'. Make sure this"
++ " client and the MDT are running compatible "
++ "versions of Lustre.\n",
++ obd->obd_name, profile, mdt);
++ /* fall through */
++ default:
++ LCONSOLE_ERROR_MSG(0x155, "%s: The configuration '%s' could not"
++ " be read from the MDT '%s'. This may be "
++ "the result of communication errors between "
++ "the client and the MDT, or if the MDT is "
++ "not running.\n", obd->obd_name, profile,
++ mdt);
++ break;
++ }
++
++ /* We don't so much care about errors in cleaning up the config llog
++ * connection, as we have already read the config by this point. */
++ obd_disconnect(exp);
++
++out_cleanup:
++ do_lcfg(MDCDEV, 0, LCFG_CLEANUP, 0, 0, 0, 0);
++
++out_detach:
++ do_lcfg(MDCDEV, 0, LCFG_DETACH, 0, 0, 0, 0);
++
++out_del_uuid:
++ /* class_add_uuid adds a nid even if the same uuid exists; we may
++ delete any copy here, so they had all better match. */
++ for (i = 0; i < failnodes; i++) {
++ sprintf(niduuid, "mdtnid%d", i);
++ do_lcfg(MDCDEV, 0, LCFG_DEL_UUID, niduuid, 0, 0, 0);
++ }
++ /* class_import_put will get rid of the additional connections */
++out:
++ RETURN(rc);
++}
++/* end COMPAT_146 */
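
The string surgery at the top of old_lustre_process_log() is easy to miss, so here it is extracted into a runnable user-space demo: a new-style profile like "mds/lustre-client" splits into the old MDT name "mds" and the old-style profile "lustre":

#include <stdio.h>
#include <string.h>

int main(void)
{
        char newprofile[] = "mds/lustre-client";
        char *mdt = newprofile;
        char *profile = strchr(mdt, '/');
        char *ptr;

        if (profile == NULL)
                return 1;           /* no MDT name in the profile */
        *profile++ = '\0';          /* mdt = "mds" */

        ptr = strrchr(profile, '-');
        if (ptr == NULL)
                return 1;           /* no client suffix */
        *ptr = '\0';                /* profile = "lustre" */

        printf("mdt = %s, profile = %s\n", mdt, profile);
        return 0;
}
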
++
++int ll_fill_super(struct super_block *sb)
++{
++ struct lustre_profile *lprof;
++ struct lustre_sb_info *lsi = s2lsi(sb);
++ struct ll_sb_info *sbi;
++ char *osc = NULL, *mdc = NULL;
++ char *profilenm = get_profile_name(sb);
++ struct config_llog_instance cfg = {0, };
++ char ll_instance[sizeof(sb) * 2 + 1];
++ int err;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
++
++ cfs_module_get();
++
++ /* client additional sb info */
++ lsi->lsi_llsbi = sbi = ll_init_sbi();
++ if (!sbi) {
++ cfs_module_put();
++ RETURN(-ENOMEM);
++ }
++
++ err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
++ if (err)
++ GOTO(out_free, err);
++
++ /* Generate a string unique to this super, in case some joker tries
++ to mount the same fs at two mount points.
++ Use the address of the super itself.*/
++ sprintf(ll_instance, "%p", sb);
++ cfg.cfg_instance = ll_instance;
++ cfg.cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
++ cfg.cfg_sb = sb;
++
++ /* set up client obds */
++ if (strchr(profilenm, '/') != NULL) /* COMPAT_146 */
++ err = -EINVAL; /* skip error messages, use old config code */
++ else
++ err = lustre_process_log(sb, profilenm, &cfg);
++ /* COMPAT_146 */
++ if (err < 0) {
++ char *oldname;
++ int rc, oldnamelen;
++ oldnamelen = strlen(profilenm) + 1;
++ /* Temp storage for 1.4.6 profile name */
++ OBD_ALLOC(oldname, oldnamelen);
++ if (oldname) {
++ memcpy(oldname, profilenm, oldnamelen);
++ rc = old_lustre_process_log(sb, oldname, &cfg);
++ if (rc >= 0) {
++ /* That worked - update the profile name
++ permanently */
++ err = rc;
++ OBD_FREE(lsi->lsi_lmd->lmd_profile,
++ strlen(lsi->lsi_lmd->lmd_profile) + 1);
++ OBD_ALLOC(lsi->lsi_lmd->lmd_profile,
++ strlen(oldname) + 1);
++ if (!lsi->lsi_lmd->lmd_profile) {
++ OBD_FREE(oldname, oldnamelen);
++ GOTO(out_free, err = -ENOMEM);
++ }
++ memcpy(lsi->lsi_lmd->lmd_profile, oldname,
++ strlen(oldname) + 1);
++ profilenm = get_profile_name(sb);
++ /* Don't ever try to recover the MGS */
++ rc = ptlrpc_set_import_active(
++ lsi->lsi_mgc->u.cli.cl_import, 0);
++ }
++ OBD_FREE(oldname, oldnamelen);
++ }
++ }
++ /* end COMPAT_146 */
++ if (err < 0) {
++ CERROR("Unable to process log: %d\n", err);
++ GOTO(out_free, err);
++ }
++
++ lprof = class_get_profile(profilenm);
++ if (lprof == NULL) {
++ LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be"
++ " read from the MGS. Does that filesystem "
++ "exist?\n", profilenm);
++ GOTO(out_free, err = -EINVAL);
++ }
++ CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
++ lprof->lp_mdc, lprof->lp_osc);
++
++ OBD_ALLOC(osc, strlen(lprof->lp_osc) +
++ strlen(ll_instance) + 2);
++ if (!osc)
++ GOTO(out_free, err = -ENOMEM);
++ sprintf(osc, "%s-%s", lprof->lp_osc, ll_instance);
++
++ OBD_ALLOC(mdc, strlen(lprof->lp_mdc) +
++ strlen(ll_instance) + 2);
++ if (!mdc)
++ GOTO(out_free, err = -ENOMEM);
++ sprintf(mdc, "%s-%s", lprof->lp_mdc, ll_instance);
++
++ /* connections, registrations, sb setup */
++ err = client_common_fill_super(sb, mdc, osc);
++
++out_free:
++ if (mdc)
++ OBD_FREE(mdc, strlen(mdc) + 1);
++ if (osc)
++ OBD_FREE(osc, strlen(osc) + 1);
++ if (err)
++ ll_put_super(sb);
++ else
++ LCONSOLE_WARN("Client %s has started\n", profilenm);
++
++ RETURN(err);
++} /* ll_fill_super */
++
++
++void ll_put_super(struct super_block *sb)
++{
++ struct config_llog_instance cfg;
++ char ll_instance[sizeof(sb) * 2 + 1];
++ struct obd_device *obd;
++ struct lustre_sb_info *lsi = s2lsi(sb);
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ char *profilenm = get_profile_name(sb);
++ int force = 1, next;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
++
++ sprintf(ll_instance, "%p", sb);
++ cfg.cfg_instance = ll_instance;
++ lustre_end_log(sb, NULL, &cfg);
++
++ if (sbi->ll_mdc_exp) {
++ obd = class_exp2obd(sbi->ll_mdc_exp);
++ if (obd)
++ force = obd->obd_force;
++ }
++
++ /* We need to set force before the lov_disconnect in
++ lustre_common_put_super, since l_d cleans up the OSCs as well. */
++ if (force) {
++ next = 0;
++ while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
++ &next)) != NULL) {
++ obd->obd_force = force;
++ }
++ }
++
++ if (sbi->ll_lcq) {
++ /* Only if client_common_fill_super succeeded */
++ client_common_put_super(sb);
++ }
++
++ next = 0;
++ while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
++ class_manual_cleanup(obd);
++ }
++
++ if (profilenm)
++ class_del_profile(profilenm);
++
++ ll_free_sbi(sb);
++ lsi->lsi_llsbi = NULL;
++
++ lustre_common_put_super(sb);
++
++ LCONSOLE_WARN("client %s umount complete\n", ll_instance);
++
++ cfs_module_put();
++
++ EXIT;
++} /* client_put_super */
++
++#ifdef HAVE_REGISTER_CACHE
++#include <linux/cache_def.h>
++#ifdef HAVE_CACHE_RETURN_INT
++static int
++#else
++static void
++#endif
++ll_shrink_cache(int priority, unsigned int gfp_mask)
++{
++ struct ll_sb_info *sbi;
++ int count = 0;
++
++ list_for_each_entry(sbi, &ll_super_blocks, ll_list)
++ count += llap_shrink_cache(sbi, priority);
++
++#ifdef HAVE_CACHE_RETURN_INT
++ return count;
++#endif
++}
++
++struct cache_definition ll_cache_definition = {
++ .name = "llap_cache",
++ .shrink = ll_shrink_cache
++};
++#endif /* HAVE_REGISTER_CACHE */
++
++struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
++{
++ struct inode *inode = NULL;
++ /* NOTE: we depend on atomic igrab() -bzzz */
++ lock_res_and_lock(lock);
++ if (lock->l_ast_data) {
++ struct ll_inode_info *lli = ll_i2info(lock->l_ast_data);
++ if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
++ inode = igrab(lock->l_ast_data);
++ } else {
++ inode = lock->l_ast_data;
++ ldlm_lock_debug(NULL, inode->i_state & I_FREEING ?
++ D_INFO : D_WARNING,
++ lock, __FILE__, __func__, __LINE__,
++ "l_ast_data %p is bogus: magic %08x",
++ lock->l_ast_data, lli->lli_inode_magic);
++ inode = NULL;
++ }
++ }
++ unlock_res_and_lock(lock);
++ return inode;
++}
++
++static int null_if_equal(struct ldlm_lock *lock, void *data)
++{
++ if (data == lock->l_ast_data) {
++ lock->l_ast_data = NULL;
++
++ if (lock->l_req_mode != lock->l_granted_mode)
++ LDLM_ERROR(lock,"clearing inode with ungranted lock");
++ }
++
++ return LDLM_ITER_CONTINUE;
++}
++
++void ll_clear_inode(struct inode *inode)
++{
++ struct ll_fid fid;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
++ inode->i_generation, inode);
++
++ if (S_ISDIR(inode->i_mode)) {
++ /* these should have been cleared in ll_file_release */
++ LASSERT(lli->lli_sai == NULL);
++ LASSERT(lli->lli_opendir_key == NULL);
++ LASSERT(lli->lli_opendir_pid == 0);
++ }
++
++ ll_inode2fid(&fid, inode);
++ clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
++ mdc_change_cbdata(sbi->ll_mdc_exp, &fid, null_if_equal, inode);
++
++ LASSERT(!lli->lli_open_fd_write_count);
++ LASSERT(!lli->lli_open_fd_read_count);
++ LASSERT(!lli->lli_open_fd_exec_count);
++
++ if (lli->lli_mds_write_och)
++ ll_mdc_real_close(inode, FMODE_WRITE);
++ if (lli->lli_mds_exec_och) {
++ if (!FMODE_EXEC)
++ CERROR("No FMODE exec, bug exec och is present for "
++ "inode %ld\n", inode->i_ino);
++ ll_mdc_real_close(inode, FMODE_EXEC);
++ }
++ if (lli->lli_mds_read_och)
++ ll_mdc_real_close(inode, FMODE_READ);
++
++
++ if (lli->lli_smd) {
++ obd_change_cbdata(sbi->ll_osc_exp, lli->lli_smd,
++ null_if_equal, inode);
++
++ obd_free_memmd(sbi->ll_osc_exp, &lli->lli_smd);
++ lli->lli_smd = NULL;
++ }
++
++ if (lli->lli_symlink_name) {
++ OBD_FREE(lli->lli_symlink_name,
++ strlen(lli->lli_symlink_name) + 1);
++ lli->lli_symlink_name = NULL;
++ }
++
++#ifdef CONFIG_FS_POSIX_ACL
++ if (lli->lli_posix_acl) {
++ LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
++ posix_acl_release(lli->lli_posix_acl);
++ lli->lli_posix_acl = NULL;
++ }
++#endif
++
++ lli->lli_inode_magic = LLI_INODE_DEAD;
++
++#ifdef HAVE_EXPORT___IGET
++ spin_lock(&sbi->ll_deathrow_lock);
++ list_del_init(&lli->lli_dead_list);
++ spin_unlock(&sbi->ll_deathrow_lock);
++#endif
++
++ EXIT;
++}
++static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ int rc;
++ ldlm_policy_data_t policy = { .l_extent = {new_size,
++ OBD_OBJECT_EOF } };
++ struct lustre_handle lockh = { 0 };
++ int local_lock = 0; /* 0 - no local lock;
++ * 1 - lock taken by lock_extent;
++ * 2 - by obd_match*/
++ int ast_flags;
++ int err;
++ ENTRY;
++
++ UNLOCK_INODE_MUTEX(inode);
++ UP_WRITE_I_ALLOC_SEM(inode);
++
++ if (sbi->ll_lockless_truncate_enable &&
++ (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) {
++ ast_flags = LDLM_FL_BLOCK_GRANTED;
++ rc = obd_match(sbi->ll_osc_exp, lsm, LDLM_EXTENT,
++ &policy, LCK_PW, &ast_flags, inode, &lockh);
++ if (rc > 0) {
++ local_lock = 2;
++ rc = 0;
++ } else if (rc == 0) {
++ rc = ll_file_punch(inode, new_size, 1);
++ }
++ } else {
++ /* XXX when we fix the AST intents to pass the discard-range
++ * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
++ * XXX here. */
++ ast_flags = (new_size == 0) ? LDLM_AST_DISCARD_DATA : 0;
++ rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy,
++ &lockh, ast_flags);
++ if (likely(rc == 0))
++ local_lock = 1;
++ }
++
++ LOCK_INODE_MUTEX(inode);
++ DOWN_WRITE_I_ALLOC_SEM(inode);
++ if (likely(rc == 0)) {
++ /* Only ll_inode_size_lock is taken at this level.
++ * lov_stripe_lock() is grabbed by ll_truncate() only over
++ * call to obd_adjust_kms(). If vmtruncate returns 0, then
++ * ll_truncate dropped ll_inode_size_lock() */
++ ll_inode_size_lock(inode, 0);
++ if (!local_lock)
++ set_bit(LLI_F_SRVLOCK, &lli->lli_flags);
++ rc = vmtruncate(inode, new_size);
++ clear_bit(LLI_F_SRVLOCK, &lli->lli_flags);
++ if (rc != 0) {
++ LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
++ ll_inode_size_unlock(inode, 0);
++ }
++ }
++ if (local_lock) {
++ if (local_lock == 2)
++ err = obd_cancel(sbi->ll_osc_exp, lsm, LCK_PW, &lockh);
++ else
++ err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
++ if (unlikely(err != 0)){
++ CERROR("extent unlock failed: err=%d,"
++ " unlock method =%d\n", err, local_lock);
++ if (rc == 0)
++ rc = err;
++ }
++ }
++ RETURN(rc);
++}
++
++/* If this inode has objects allocated to it (lsm != NULL), then the OST
++ * object(s) determine the file size and mtime. Otherwise, the MDS will
++ * keep these values until such a time that objects are allocated for it.
++ * We do the MDS operations first, as it is checking permissions for us.
++ * We don't do the MDS RPC if there is nothing that we want to store there;
++ * otherwise there is no harm in updating mtime/atime on the MDS if we are
++ * going to do an RPC anyway.
++ *
++ * If we are doing a truncate, we will send the mtime and ctime updates
++ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
++ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
++ * at the same time.
++ */
++int ll_setattr_raw(struct inode *inode, struct iattr *attr)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ptlrpc_request *request = NULL;
++ struct mdc_op_data op_data;
++ struct lustre_md md;
++ int ia_valid = attr->ia_valid;
++ int rc = 0;
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu valid %x\n", inode->i_ino,
++ attr->ia_valid);
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETATTR, 1);
++
++ if (ia_valid & ATTR_SIZE) {
++ if (attr->ia_size > ll_file_maxbytes(inode)) {
++ CDEBUG(D_INODE, "file too large %llu > "LPU64"\n",
++ attr->ia_size, ll_file_maxbytes(inode));
++ RETURN(-EFBIG);
++ }
++
++ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
++ }
++
++ /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */
++ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
++ if (current->fsuid != inode->i_uid &&
++ !cfs_capable(CFS_CAP_FOWNER))
++ RETURN(-EPERM);
++ }
++
++ /* We mark all of the fields "set" so MDS/OST does not re-set them */
++ if (attr->ia_valid & ATTR_CTIME) {
++ attr->ia_ctime = CURRENT_TIME;
++ attr->ia_valid |= ATTR_CTIME_SET;
++ }
++ if (!(ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) {
++ attr->ia_atime = CURRENT_TIME;
++ attr->ia_valid |= ATTR_ATIME_SET;
++ }
++ if (!(ia_valid & ATTR_MTIME_SET) && (attr->ia_valid & ATTR_MTIME)) {
++ attr->ia_mtime = CURRENT_TIME;
++ attr->ia_valid |= ATTR_MTIME_SET;
++ }
++ if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) {
++ /* To avoid a stale mtime on the MDS, obtain it from the OST and
++ send it to the MDS. */
++ rc = ll_glimpse_size(inode, 0);
++ if (rc)
++ RETURN(rc);
++
++ attr->ia_valid |= ATTR_MTIME_SET | ATTR_MTIME;
++ attr->ia_mtime = inode->i_mtime;
++ }
++
++ if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
++ CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
++ LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
++ CURRENT_SECONDS);
++
++ /* NB: ATTR_SIZE will only be set after this point if the size
++ * resides on the MDS, ie, this file has no objects. */
++ if (lsm)
++ attr->ia_valid &= ~ATTR_SIZE;
++
++ /* We always do an MDS RPC, even if we're only changing the size;
++ * only the MDS knows whether truncate() should fail with -ETXTBUSY */
++ ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0, NULL);
++
++ rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
++ attr, NULL, 0, NULL, 0, &request);
++
++ if (rc) {
++ ptlrpc_req_finished(request);
++ if (rc == -ENOENT) {
++ inode->i_nlink = 0;
++ /* Unlinked special device node? Or just a race?
++ * Pretend we have done everything. */
++ if (!S_ISREG(inode->i_mode) &&
++ !S_ISDIR(inode->i_mode))
++ rc = inode_setattr(inode, attr);
++ } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY)
++ CERROR("mdc_setattr fails: rc = %d\n", rc);
++ RETURN(rc);
++ }
++
++ rc = mdc_req2lustre_md(request, REPLY_REC_OFF, sbi->ll_osc_exp, &md);
++ if (rc) {
++ ptlrpc_req_finished(request);
++ RETURN(rc);
++ }
++
++ /* We call inode_setattr to adjust timestamps.
++ * If there is at least some data in the file, we cleared ATTR_SIZE above to
++ * avoid invoking vmtruncate, otherwise it is important to call
++ * vmtruncate in inode_setattr to update inode->i_size (bug 6196) */
++ rc = inode_setattr(inode, attr);
++
++ ll_update_inode(inode, &md);
++ ptlrpc_req_finished(request);
++
++ if (!lsm || !S_ISREG(inode->i_mode)) {
++ CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
++ RETURN(rc);
++ }
++
++ /* We really need to get our PW lock before we change inode->i_size.
++ * If we don't we can race with other i_size updaters on our node, like
++ * ll_file_read. We can also race with i_size propagation to other
++ * nodes through dirtying and writeback of final cached pages. This
++ * last one is especially bad for racing o_append users on other
++ * nodes. */
++ if (ia_valid & ATTR_SIZE) {
++ rc = ll_setattr_do_truncate(inode, attr->ia_size);
++ } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
++ obd_flag flags;
++ struct obd_info oinfo = { { { 0 } } };
++ struct obdo *oa;
++ OBDO_ALLOC(oa);
++
++ CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
++ inode->i_ino, LTIME_S(attr->ia_mtime));
++
++ if (oa) {
++ oa->o_id = lsm->lsm_object_id;
++ oa->o_valid = OBD_MD_FLID;
++
++ flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
++ OBD_MD_FLMTIME | OBD_MD_FLCTIME |
++ OBD_MD_FLFID | OBD_MD_FLGENER;
++
++ obdo_from_inode(oa, inode, flags);
++
++ oinfo.oi_oa = oa;
++ oinfo.oi_md = lsm;
++
++ rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
++ if (rc)
++ CERROR("obd_setattr_async fails: rc=%d\n", rc);
++ OBDO_FREE(oa);
++ } else {
++ rc = -ENOMEM;
++ }
++ }
++ RETURN(rc);
++}
++
++int ll_setattr(struct dentry *de, struct iattr *attr)
++{
++ int mode;
++
++ if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
++ (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
++ attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
++ if ((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
++ (ATTR_SIZE|ATTR_MODE)) {
++ mode = de->d_inode->i_mode;
++ if (((mode & S_ISUID) && (!(attr->ia_mode & S_ISUID))) ||
++ ((mode & S_ISGID) && (mode & S_IXGRP) &&
++ (!(attr->ia_mode & S_ISGID))))
++ attr->ia_valid |= ATTR_FORCE;
++ }
++
++ return ll_setattr_raw(de->d_inode, attr);
++}
++
++int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
++ __u64 max_age, __u32 flags)
++{
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ struct obd_statfs obd_osfs;
++ int rc;
++ ENTRY;
++
++ rc = obd_statfs(class_exp2obd(sbi->ll_mdc_exp), osfs, max_age, flags);
++ if (rc) {
++ CERROR("mdc_statfs fails: rc = %d\n", rc);
++ RETURN(rc);
++ }
++
++ osfs->os_type = sb->s_magic;
++
++ CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
++ osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
++
++ rc = obd_statfs_rqset(class_exp2obd(sbi->ll_osc_exp),
++ &obd_osfs, max_age, flags);
++ if (rc) {
++ CERROR("obd_statfs fails: rc = %d\n", rc);
++ RETURN(rc);
++ }
++
++ CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
++ obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
++ obd_osfs.os_files);
++
++ osfs->os_bsize = obd_osfs.os_bsize;
++ osfs->os_blocks = obd_osfs.os_blocks;
++ osfs->os_bfree = obd_osfs.os_bfree;
++ osfs->os_bavail = obd_osfs.os_bavail;
++
++ /* If we don't have as many objects free on the OST as inodes
++ * on the MDS, we reduce the total number of inodes to
++ * compensate, so that the "inodes in use" number is correct.
++ */
++ if (obd_osfs.os_ffree < osfs->os_ffree) {
++ osfs->os_files = (osfs->os_files - osfs->os_ffree) +
++ obd_osfs.os_ffree;
++ osfs->os_ffree = obd_osfs.os_ffree;
++ }
++
++ RETURN(rc);
++}
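
The inode-count compensation at the end of ll_statfs_internal() keeps "files in use" (files - ffree) stable when the OSTs have fewer free objects than the MDS has free inodes. A standalone demo with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long os_files = 1000, os_ffree = 900; /* MDS view */
        unsigned long long ost_ffree = 50;                  /* OST view */

        if (ost_ffree < os_ffree) {
                os_files = (os_files - os_ffree) + ost_ffree; /* 150 */
                os_ffree = ost_ffree;                         /* 50  */
        }
        printf("files=%llu ffree=%llu in-use=%llu\n",
               os_files, os_ffree, os_files - os_ffree);      /* 100 */
        return 0;
}
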
++#ifndef HAVE_STATFS_DENTRY_PARAM
++int ll_statfs(struct super_block *sb, struct kstatfs *sfs)
++{
++#else
++int ll_statfs(struct dentry *de, struct kstatfs *sfs)
++{
++ struct super_block *sb = de->d_sb;
++#endif
++ struct obd_statfs osfs;
++ int rc;
++
++ CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64());
++ ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
++
++ /* For now we will always get up-to-date statfs values, but in the
++ * future we may allow some amount of caching on the client (e.g.
++ * from QOS or lprocfs updates). */
++ rc = ll_statfs_internal(sb, &osfs, cfs_time_current_64() - 1, 0);
++ if (rc)
++ return rc;
++
++ statfs_unpack(sfs, &osfs);
++
++ /* We need to downshift for all 32-bit kernels, because we can't
++ * tell if the kernel is being called via sys_statfs64() or not.
++ * Stop before overflowing f_bsize - in which case it is better
++ * to just risk EOVERFLOW if caller is using old sys_statfs(). */
++ if (sizeof(long) < 8) {
++ while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
++ sfs->f_bsize <<= 1;
++
++ osfs.os_blocks >>= 1;
++ osfs.os_bfree >>= 1;
++ osfs.os_bavail >>= 1;
++ }
++ }
++
++ sfs->f_blocks = osfs.os_blocks;
++ sfs->f_bfree = osfs.os_bfree;
++ sfs->f_bavail = osfs.os_bavail;
++
++ return 0;
++}
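
The 32-bit downshift loop in ll_statfs() trades block size for block count: doubling f_bsize while halving the counts keeps f_bsize * f_blocks constant until the counts fit in an unsigned long. Standalone demo (a fixed 32-bit limit emulates ~0UL on a 32-bit kernel; the sizes are made up):

#include <stdio.h>

int main(void)
{
        unsigned long long blocks = 0x300000000ULL; /* 12 G blocks, > 32 bits */
        unsigned long bsize = 4096;
        unsigned long max = 0xffffffffUL;           /* 32-bit ~0UL */

        while (blocks > max && bsize < 0x40000000) {
                bsize <<= 1;     /* larger block size ...        */
                blocks >>= 1;    /* ... fewer blocks, same bytes */
        }
        printf("bsize=%lu blocks=%llu\n", bsize, blocks);
        return 0;
}
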
++
++void ll_inode_size_lock(struct inode *inode, int lock_lsm)
++{
++ struct ll_inode_info *lli;
++ struct lov_stripe_md *lsm;
++
++ lli = ll_i2info(inode);
++ LASSERT(lli->lli_size_sem_owner != current);
++ down(&lli->lli_size_sem);
++ LASSERT(lli->lli_size_sem_owner == NULL);
++ lli->lli_size_sem_owner = current;
++ lsm = lli->lli_smd;
++ LASSERTF(lsm != NULL || lock_lsm == 0, "lsm %p, lock_lsm %d\n",
++ lsm, lock_lsm);
++ if (lock_lsm)
++ lov_stripe_lock(lsm);
++}
++
++void ll_inode_size_unlock(struct inode *inode, int unlock_lsm)
++{
++ struct ll_inode_info *lli;
++ struct lov_stripe_md *lsm;
++
++ lli = ll_i2info(inode);
++ lsm = lli->lli_smd;
++ LASSERTF(lsm != NULL || unlock_lsm == 0, "lsm %p, lock_lsm %d\n",
++ lsm, unlock_lsm);
++ if (unlock_lsm)
++ lov_stripe_unlock(lsm);
++ LASSERT(lli->lli_size_sem_owner == current);
++ lli->lli_size_sem_owner = NULL;
++ up(&lli->lli_size_sem);
++}
++
++static void ll_replace_lsm(struct inode *inode, struct lov_stripe_md *lsm)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++
++ dump_lsm(D_INODE, lsm);
++ dump_lsm(D_INODE, lli->lli_smd);
++ LASSERTF(lsm->lsm_magic == LOV_MAGIC_JOIN,
++ "lsm must be joined lsm %p\n", lsm);
++ obd_free_memmd(ll_i2obdexp(inode), &lli->lli_smd);
++ CDEBUG(D_INODE, "replace lsm %p to lli_smd %p for inode %lu%u(%p)\n",
++ lsm, lli->lli_smd, inode->i_ino, inode->i_generation, inode);
++ lli->lli_smd = lsm;
++ lli->lli_maxbytes = lsm->lsm_maxbytes;
++ if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
++ lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
++}
++
++void ll_update_inode(struct inode *inode, struct lustre_md *md)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct mds_body *body = md->body;
++ struct lov_stripe_md *lsm = md->lsm;
++
++ LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
++ if (lsm != NULL) {
++ if (lli->lli_smd == NULL) {
++ if (lsm->lsm_magic != LOV_MAGIC &&
++ lsm->lsm_magic != LOV_MAGIC_JOIN) {
++ dump_lsm(D_ERROR, lsm);
++ LBUG();
++ }
++ CDEBUG(D_INODE, "adding lsm %p to inode %lu/%u(%p)\n",
++ lsm, inode->i_ino, inode->i_generation, inode);
++ /* ll_inode_size_lock() requires it is only called
++ * with lli_smd != NULL or lock_lsm == 0 or we can
++ * race between lock/unlock. bug 9547 */
++ lli->lli_smd = lsm;
++ lli->lli_maxbytes = lsm->lsm_maxbytes;
++ if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
++ lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
++ } else {
++ if (lli->lli_smd->lsm_magic == lsm->lsm_magic &&
++ lli->lli_smd->lsm_stripe_count ==
++ lsm->lsm_stripe_count) {
++ if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
++ CERROR("lsm mismatch for inode %ld\n",
++ inode->i_ino);
++ CERROR("lli_smd:\n");
++ dump_lsm(D_ERROR, lli->lli_smd);
++ CERROR("lsm:\n");
++ dump_lsm(D_ERROR, lsm);
++ LBUG();
++ }
++ } else
++ ll_replace_lsm(inode, lsm);
++ }
++ if (lli->lli_smd != lsm)
++ obd_free_memmd(ll_i2obdexp(inode), &lsm);
++ }
++
++#ifdef CONFIG_FS_POSIX_ACL
++ LASSERT(!md->posix_acl || (body->valid & OBD_MD_FLACL));
++ if (body->valid & OBD_MD_FLACL) {
++ spin_lock(&lli->lli_lock);
++ if (lli->lli_posix_acl)
++ posix_acl_release(lli->lli_posix_acl);
++ lli->lli_posix_acl = md->posix_acl;
++ spin_unlock(&lli->lli_lock);
++ }
++#endif
++
++ if (body->valid & OBD_MD_FLID)
++ inode->i_ino = body->ino;
++ if (body->valid & OBD_MD_FLATIME &&
++ body->atime > LTIME_S(inode->i_atime))
++ LTIME_S(inode->i_atime) = body->atime;
++
++ /* mtime is always updated with ctime, but it can be set in the past.
++ Since write and utime(2) may happen within one second, and utime's
++ mtime takes priority over write's, take mtime from the MDS when the
++ ctimes are equal. */
++ if (body->valid & OBD_MD_FLCTIME &&
++ body->ctime >= LTIME_S(inode->i_ctime)) {
++ LTIME_S(inode->i_ctime) = body->ctime;
++ if (body->valid & OBD_MD_FLMTIME) {
++ CDEBUG(D_INODE, "setting ino %lu mtime "
++ "from %lu to "LPU64"\n", inode->i_ino,
++ LTIME_S(inode->i_mtime), body->mtime);
++ LTIME_S(inode->i_mtime) = body->mtime;
++ }
++ }
++ if (body->valid & OBD_MD_FLMODE)
++ inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
++ if (body->valid & OBD_MD_FLTYPE)
++ inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
++ if (S_ISREG(inode->i_mode)) {
++ inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS+1, LL_MAX_BLKSIZE_BITS);
++ } else {
++ inode->i_blkbits = inode->i_sb->s_blocksize_bits;
++ }
++#ifdef HAVE_INODE_BLKSIZE
++ inode->i_blksize = 1<<inode->i_blkbits;
++#endif
++ if (body->valid & OBD_MD_FLUID)
++ inode->i_uid = body->uid;
++ if (body->valid & OBD_MD_FLGID)
++ inode->i_gid = body->gid;
++ if (body->valid & OBD_MD_FLFLAGS)
++ inode->i_flags = ll_ext_to_inode_flags(body->flags);
++
++ if (body->valid & OBD_MD_FLNLINK)
++ inode->i_nlink = body->nlink;
++ if (body->valid & OBD_MD_FLGENER)
++ inode->i_generation = body->generation;
++ if (body->valid & OBD_MD_FLRDEV)
++ inode->i_rdev = old_decode_dev(body->rdev);
++ if (body->valid & OBD_MD_FLSIZE) {
++#if 0 /* Can't block ll_test_inode->ll_update_inode, b=14326*/
++ ll_inode_size_lock(inode, 0);
++ i_size_write(inode, body->size);
++ ll_inode_size_unlock(inode, 0);
++#else
++ inode->i_size = body->size;
++#endif
++ }
++ if (body->valid & OBD_MD_FLBLOCKS)
++ inode->i_blocks = body->blocks;
++
++ if (body->valid & OBD_MD_FLSIZE)
++ set_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
++}
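
The mode merging in ll_update_inode() above splits i_mode into file-type bits (S_IFMT) and permission bits: OBD_MD_FLMODE replaces only the permissions, OBD_MD_FLTYPE only the type. Shown standalone with example values:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        unsigned int i_mode = S_IFREG | 0644;  /* regular file, rw-r--r-- */
        unsigned int body_mode = 0755;         /* new permissions from MDS */

        /* OBD_MD_FLMODE case: keep the type, take the permissions */
        i_mode = (i_mode & S_IFMT) | (body_mode & ~S_IFMT);
        printf("mode = %o\n", i_mode);         /* 100755 */
        return 0;
}
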
++
++static struct backing_dev_info ll_backing_dev_info = {
++ .ra_pages = 0, /* No readahead */
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
++ .capabilities = 0, /* Does contribute to dirty memory */
++#else
++ .memory_backed = 0, /* Does contribute to dirty memory */
++#endif
++};
++
++void ll_read_inode2(struct inode *inode, void *opaque)
++{
++ struct lustre_md *md = opaque;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ ENTRY;
++
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
++ inode->i_generation, inode);
++
++ ll_lli_init(lli);
++
++ LASSERT(!lli->lli_smd);
++
++ /* Core attributes from the MDS first. This is a new inode, and
++ * the VFS doesn't zero times in the core inode so we have to do
++ * it ourselves. They will be overwritten by either MDS or OST
++ * attributes - we just need to make sure they aren't newer. */
++ LTIME_S(inode->i_mtime) = 0;
++ LTIME_S(inode->i_atime) = 0;
++ LTIME_S(inode->i_ctime) = 0;
++ inode->i_rdev = 0;
++ ll_update_inode(inode, md);
++
++ /* OIDEBUG(inode); */
++
++ if (S_ISREG(inode->i_mode)) {
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ inode->i_op = &ll_file_inode_operations;
++ inode->i_fop = sbi->ll_fop;
++ inode->i_mapping->a_ops = &ll_aops;
++ EXIT;
++ } else if (S_ISDIR(inode->i_mode)) {
++ inode->i_op = &ll_dir_inode_operations;
++ inode->i_fop = &ll_dir_operations;
++ inode->i_mapping->a_ops = &ll_dir_aops;
++ EXIT;
++ } else if (S_ISLNK(inode->i_mode)) {
++ inode->i_op = &ll_fast_symlink_inode_operations;
++ EXIT;
++ } else {
++ inode->i_op = &ll_special_inode_operations;
++ init_special_inode(inode, inode->i_mode,
++ kdev_t_to_nr(inode->i_rdev));
++ /* initializing backing dev info. */
++ inode->i_mapping->backing_dev_info = &ll_backing_dev_info;
++ EXIT;
++ }
++}
++
++int ll_iocontrol(struct inode *inode, struct file *file,
++ unsigned int cmd, unsigned long arg)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ struct ptlrpc_request *req = NULL;
++ int rc, flags = 0;
++ ENTRY;
++
++ switch(cmd) {
++ case EXT3_IOC_GETFLAGS: {
++ struct ll_fid fid;
++ struct mds_body *body;
++
++ ll_inode2fid(&fid, inode);
++ rc = mdc_getattr(sbi->ll_mdc_exp, &fid, OBD_MD_FLFLAGS,0,&req);
++ if (rc) {
++ CERROR("failure %d inode %lu\n", rc, inode->i_ino);
++ RETURN(-abs(rc));
++ }
++
++ body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
++ sizeof(*body));
++
++ /* We want to return EXT3_*_FL flags to the caller via this
++ * ioctl. An older MDS may be sending S_* flags, fix it up. */
++ flags = ll_inode_to_ext_flags(body->flags,
++ body->flags &MDS_BFLAG_EXT_FLAGS);
++ ptlrpc_req_finished (req);
++
++ RETURN(put_user(flags, (int *)arg));
++ }
++ case EXT3_IOC_SETFLAGS: {
++ struct mdc_op_data op_data;
++ struct ll_iattr_struct attr;
++ struct obd_info oinfo = { { { 0 } } };
++ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
++
++ if (get_user(flags, (int *)arg))
++ RETURN(-EFAULT);
++
++ oinfo.oi_md = lsm;
++ OBDO_ALLOC(oinfo.oi_oa);
++ if (!oinfo.oi_oa)
++ RETURN(-ENOMEM);
++
++ ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0, NULL);
++
++ memset(&attr, 0, sizeof(attr));
++ attr.ia_attr_flags = flags;
++ ((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG;
++
++ rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
++ (struct iattr *)&attr, NULL, 0, NULL, 0, &req);
++ ptlrpc_req_finished(req);
++ if (rc || lsm == NULL) {
++ OBDO_FREE(oinfo.oi_oa);
++ RETURN(rc);
++ }
++
++ oinfo.oi_oa->o_id = lsm->lsm_object_id;
++ oinfo.oi_oa->o_flags = flags;
++ oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
++
++ obdo_from_inode(oinfo.oi_oa, inode,
++ OBD_MD_FLFID | OBD_MD_FLGENER);
++ rc = obd_setattr_rqset(sbi->ll_osc_exp, &oinfo, NULL);
++ OBDO_FREE(oinfo.oi_oa);
++ if (rc) {
++ if (rc != -EPERM && rc != -EACCES)
++ CERROR("mdc_setattr_async fails: rc = %d\n", rc);
++ RETURN(rc);
++ }
++
++ inode->i_flags = ll_ext_to_inode_flags(flags |
++ MDS_BFLAG_EXT_FLAGS);
++ RETURN(0);
++ }
++ default:
++ RETURN(-ENOSYS);
++ }
++
++ RETURN(0);
++}
++
++/* umount -f client means force down, don't save state */
++#ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
++void ll_umount_begin(struct vfsmount *vfsmnt, int flags)
++{
++ struct super_block *sb = vfsmnt->mnt_sb;
++#else
++void ll_umount_begin(struct super_block *sb)
++{
++#endif
++ struct lustre_sb_info *lsi = s2lsi(sb);
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ struct obd_device *obd;
++ struct obd_ioctl_data ioc_data = { 0 };
++ ENTRY;
++
++#ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
++ if (!(flags & MNT_FORCE)) {
++ EXIT;
++ return;
++ }
++#endif
++
++ /* Tell the MGC we got umount -f */
++ lsi->lsi_flags |= LSI_UMOUNT_FORCE;
++
++ CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
++ sb->s_count, atomic_read(&sb->s_active));
++
++ obd = class_exp2obd(sbi->ll_mdc_exp);
++ if (obd == NULL) {
++ CERROR("Invalid MDC connection handle "LPX64"\n",
++ sbi->ll_mdc_exp->exp_handle.h_cookie);
++ EXIT;
++ return;
++ }
++ obd->obd_force = 1;
++ obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_mdc_exp, sizeof ioc_data,
++ &ioc_data, NULL);
++
++ obd = class_exp2obd(sbi->ll_osc_exp);
++ if (obd == NULL) {
++ CERROR("Invalid LOV connection handle "LPX64"\n",
++ sbi->ll_osc_exp->exp_handle.h_cookie);
++ EXIT;
++ return;
++ }
++
++ obd->obd_force = 1;
++ obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_osc_exp, sizeof ioc_data,
++ &ioc_data, NULL);
++
++ /* Really, we'd like to wait until there are no requests outstanding,
++ * and then continue. For now, we just invalidate the requests,
++ * schedule() and sleep one second if needed, and hope.
++ */
++ schedule();
++#ifdef HAVE_UMOUNTBEGIN_VFSMOUNT
++ if (atomic_read(&vfsmnt->mnt_count) > 2) {
++ cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
++ cfs_time_seconds(1));
++ if (atomic_read(&vfsmnt->mnt_count) > 2)
++ LCONSOLE_WARN("Mount still busy with %d refs! You "
++ "may try to umount it a bit later\n",
++ atomic_read(&vfsmnt->mnt_count));
++ }
++#endif
++
++ EXIT;
++}
++
++int ll_remount_fs(struct super_block *sb, int *flags, char *data)
++{
++ struct ll_sb_info *sbi = ll_s2sbi(sb);
++ int err;
++ __u32 read_only;
++
++ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
++ read_only = *flags & MS_RDONLY;
++ err = obd_set_info_async(sbi->ll_mdc_exp, sizeof(KEY_READONLY),
++ KEY_READONLY, sizeof(read_only),
++ &read_only, NULL);
++
++ /* MDS might have expected a different ro key value, b=17493 */
++ if (err == -EINVAL) {
++ CDEBUG(D_CONFIG, "Retrying remount with 1.6.6 ro key\n");
++ err = obd_set_info_async(sbi->ll_mdc_exp,
++ sizeof(KEY_READONLY_166COMPAT),
++ KEY_READONLY_166COMPAT,
++ sizeof(read_only),
++ &read_only, NULL);
++ }
++
++ if (err) {
++ CERROR("Failed to change the read-only flag during "
++ "remount: %d\n", err);
++ return err;
++ }
++
++ if (read_only)
++ sb->s_flags |= MS_RDONLY;
++ else
++ sb->s_flags &= ~MS_RDONLY;
++ }
++ return 0;
++}
++
++int ll_prep_inode(struct obd_export *exp, struct inode **inode,
++ struct ptlrpc_request *req, int offset,struct super_block *sb)
++{
++ struct lustre_md md;
++ struct ll_sb_info *sbi = NULL;
++ int rc = 0;
++ ENTRY;
++
++ LASSERT(*inode || sb);
++ sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
++ prune_deathrow(sbi, 1);
++
++ rc = mdc_req2lustre_md(req, offset, exp, &md);
++ if (rc)
++ RETURN(rc);
++
++ if (*inode) {
++ ll_update_inode(*inode, &md);
++ } else {
++ LASSERT(sb);
++ *inode = ll_iget(sb, md.body->ino, &md);
++ if (*inode == NULL || is_bad_inode(*inode)) {
++ mdc_free_lustre_md(exp, &md);
++ rc = -ENOMEM;
++ CERROR("new_inode -fatal: rc %d\n", rc);
++ GOTO(out, rc);
++ }
++ }
++
++ rc = obd_checkmd(exp, ll_i2mdcexp(*inode),
++ ll_i2info(*inode)->lli_smd);
++out:
++ RETURN(rc);
++}
++
++char *llap_origins[] = {
++ [LLAP_ORIGIN_UNKNOWN] = "--",
++ [LLAP_ORIGIN_READPAGE] = "rp",
++ [LLAP_ORIGIN_READAHEAD] = "ra",
++ [LLAP_ORIGIN_COMMIT_WRITE] = "cw",
++ [LLAP_ORIGIN_WRITEPAGE] = "wp",
++ [LLAP_ORIGIN_LOCKLESS_IO] = "ls"
++};
++
++struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
++ struct list_head *list)
++{
++ struct ll_async_page *llap;
++ struct list_head *pos;
++
++ list_for_each(pos, list) {
++ if (pos == &sbi->ll_pglist)
++ return NULL;
++ llap = list_entry(pos, struct ll_async_page, llap_pglist_item);
++ if (llap->llap_page == NULL)
++ continue;
++ return llap;
++ }
++ LBUG();
++ return NULL;
++}
++
++int ll_obd_statfs(struct inode *inode, void *arg)
++{
++ struct ll_sb_info *sbi = NULL;
++ struct obd_device *client_obd = NULL, *lov_obd = NULL;
++ struct lov_obd *lov = NULL;
++ struct obd_statfs stat_buf = {0};
++ char *buf = NULL;
++ struct obd_ioctl_data *data = NULL;
++ __u32 type, index;
++ int len = 0, rc;
++
++ if (!inode || !(sbi = ll_i2sbi(inode)))
++ GOTO(out_statfs, rc = -EINVAL);
++
++ rc = obd_ioctl_getdata(&buf, &len, arg);
++ if (rc)
++ GOTO(out_statfs, rc);
++
++ data = (void*)buf;
++ if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
++ !data->ioc_pbuf1 || !data->ioc_pbuf2)
++ GOTO(out_statfs, rc = -EINVAL);
++
++ memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
++ memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
++
++ if (type == LL_STATFS_MDC) {
++ if (index > 0)
++ GOTO(out_statfs, rc = -ENODEV);
++ client_obd = class_exp2obd(sbi->ll_mdc_exp);
++ } else if (type == LL_STATFS_LOV) {
++ lov_obd = class_exp2obd(sbi->ll_osc_exp);
++ lov = &lov_obd->u.lov;
++
++ if (index >= lov->desc.ld_tgt_count)
++ GOTO(out_statfs, rc = -ENODEV);
++
++ if (!lov->lov_tgts[index])
++ /* Try again with the next index */
++ GOTO(out_statfs, rc = -EAGAIN);
++
++ client_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
++ if (!lov->lov_tgts[index]->ltd_active)
++ GOTO(out_uuid, rc = -ENODATA);
++ }
++
++ if (!client_obd)
++ GOTO(out_statfs, rc = -EINVAL);
++
++ rc = obd_statfs(client_obd, &stat_buf, cfs_time_current_64() - HZ, 1);
++ if (rc)
++ GOTO(out_statfs, rc);
++
++ if (copy_to_user(data->ioc_pbuf1, &stat_buf, data->ioc_plen1))
++ GOTO(out_statfs, rc = -EFAULT);
++
++out_uuid:
++ if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(client_obd),
++ data->ioc_plen2))
++ rc = -EFAULT;
++
++out_statfs:
++ if (buf)
++ obd_ioctl_freedata(buf, len);
++ return rc;
++}
++
++int ll_process_config(struct lustre_cfg *lcfg)
++{
++ char *ptr;
++ void *sb;
++ struct lprocfs_static_vars lvars;
++ unsigned long x;
++ int rc = 0;
++
++ lprocfs_llite_init_vars(&lvars);
++
++ /* The instance name contains the sb: lustre-client-aacfe000 */
++ ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
++ if (!ptr || !*(++ptr))
++ return -EINVAL;
++ if (sscanf(ptr, "%lx", &x) != 1)
++ return -EINVAL;
++ sb = (void *)x;
++ /* This better be a real Lustre superblock! */
++ LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
++
++ /* Note we have not called client_common_fill_super yet, so
++ proc fns must be able to handle that! */
++ rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
++ lcfg, sb);
++ return(rc);
++}
++
++int ll_show_options(struct seq_file *seq, struct vfsmount *vfs)
++{
++ struct ll_sb_info *sbi;
++
++ LASSERT((seq != NULL) && (vfs != NULL));
++ sbi = ll_s2sbi(vfs->mnt_sb);
++
++ if (sbi->ll_flags & LL_SBI_NOLCK)
++ seq_puts(seq, ",nolock");
++
++ if (sbi->ll_flags & LL_SBI_FLOCK)
++ seq_puts(seq, ",flock");
++
++ if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
++ seq_puts(seq, ",localflock");
++
++ if (sbi->ll_flags & LL_SBI_USER_XATTR)
++ seq_puts(seq, ",user_xattr");
++
++ if (sbi->ll_flags & LL_SBI_ACL)
++ seq_puts(seq, ",acl");
++
++ RETURN(0);
++}
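
For readers following along: the EXT3_IOC_GETFLAGS case in ll_iocontrol() above is reachable from ordinary userspace, because that ioctl number is shared with the generic FS_IOC_GETFLAGS. A minimal userspace sketch (the mount-point path is an assumption for illustration, not anything from this patch):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FS_IOC_GETFLAGS */

int main(void)
{
        int flags = 0;
        int fd = open("/mnt/lustre/somefile", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* on a Lustre client this is served by ll_iocontrol(), which
         * fetches OBD_MD_FLFLAGS from the MDS and converts any old-style
         * S_* flags to EXT3_*_FL before copying them back */
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
                printf("inode flags: %#x\n", flags);
        else
                perror("FS_IOC_GETFLAGS");
        close(fd);
        return 0;
}

Built with any C compiler, this prints the EXT3_*_FL bits that the GETFLAGS path assembles from the MDS reply.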
+diff -urNad lustre~/lustre/llite/llite_mmap.c lustre/lustre/llite/llite_mmap.c
+--- lustre~/lustre/llite/llite_mmap.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/llite_mmap.c 2009-03-12 11:02:51.000000000 +0100
+@@ -81,8 +81,7 @@
+ int lt_get_mmap_locks(struct ll_lock_tree *tree,
+ unsigned long addr, size_t count);
+
+-struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+- int *type);
++static struct vm_operations_struct ll_file_vm_ops;
+
+ struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
+ __u64 end, ldlm_mode_t mode)
+@@ -285,9 +284,19 @@
+ return LCK_PR;
+ }
+
++static void policy_from_vma_pgoff(ldlm_policy_data_t *policy,
++ struct vm_area_struct *vma,
++ __u64 pgoff, size_t count)
++{
++ policy->l_extent.start = pgoff << CFS_PAGE_SHIFT;
++ policy->l_extent.end = (policy->l_extent.start + count - 1) |
++ ~CFS_PAGE_MASK;
++}
++
+ static void policy_from_vma(ldlm_policy_data_t *policy,
+ struct vm_area_struct *vma, unsigned long addr,
+ size_t count)
++
+ {
+ policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+ ((__u64)vma->vm_pgoff << CFS_PAGE_SHIFT);
+@@ -308,7 +317,7 @@
+ spin_lock(&mm->page_table_lock);
+ for(vma = find_vma(mm, addr);
+ vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+- if (vma->vm_ops && vma->vm_ops->nopage == ll_nopage &&
++ if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
+ vma->vm_flags & VM_SHARED) {
+ ret = vma;
+ break;
+@@ -360,44 +369,30 @@
+ }
+ RETURN(0);
+ }
+-/**
+- * Page fault handler.
+- *
+- * \param vma - is virtiual area struct related to page fault
+- * \param address - address when hit fault
+- * \param type - of fault
+- *
+- * \return allocated and filled page for address
+- * \retval NOPAGE_SIGBUS if page not exist on this address
+- * \retval NOPAGE_OOM not have memory for allocate new page
+- */
+-struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+- int *type)
++
++static int ll_get_extent_lock(struct vm_area_struct *vma, unsigned long pgoff,
++ int *save_flags, struct lustre_handle *lockh)
+ {
+ struct file *filp = vma->vm_file;
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
+ struct inode *inode = filp->f_dentry->d_inode;
+- struct lustre_handle lockh = { 0 };
+ ldlm_policy_data_t policy;
+ ldlm_mode_t mode;
+- struct page *page = NULL;
+ struct ll_inode_info *lli = ll_i2info(inode);
+- struct lov_stripe_md *lsm;
+ struct ost_lvb lvb;
+ __u64 kms, old_mtime;
+- unsigned long pgoff, size, rand_read, seq_read;
+- int rc = 0;
++ unsigned long size;
+ ENTRY;
+
+ if (lli->lli_smd == NULL) {
+ CERROR("No lsm on fault?\n");
+- RETURN(NOPAGE_SIGBUS);
++ RETURN(0);
+ }
+
+ ll_clear_file_contended(inode);
+
+ /* start and end the lock on the first and last bytes in the page */
+- policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);
++ policy_from_vma_pgoff(&policy, vma, pgoff, CFS_PAGE_SIZE);
+
+ CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
+ vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);
+@@ -405,26 +400,28 @@
+ mode = mode_from_vma(vma);
+ old_mtime = LTIME_S(inode->i_mtime);
+
+- lsm = lli->lli_smd;
+- rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
+- &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU);
+- if (rc != 0)
+- RETURN(NOPAGE_SIGBUS);
++ if (ll_extent_lock(fd, inode, lli->lli_smd, mode, &policy,
++ lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU) != 0)
++ RETURN(0);
+
+ if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
+ CWARN("binary changed. inode %lu\n", inode->i_ino);
+
+- lov_stripe_lock(lsm);
++ lov_stripe_lock(lli->lli_smd);
+ inode_init_lvb(inode, &lvb);
+- obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
++ if (obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 1)) {
++ lov_stripe_unlock(lli->lli_smd);
++ RETURN(0);
++ }
+ kms = lvb.lvb_size;
+
+- pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
+ size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
++ CDEBUG(D_INFO, "Kms %lu - %lu\n", size, pgoff);
+
+ if (pgoff >= size) {
+- lov_stripe_unlock(lsm);
++ lov_stripe_unlock(lli->lli_smd);
+ ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
++ lov_stripe_lock(lli->lli_smd);
+ } else {
+ /* XXX change inode size without ll_inode_size_lock() held!
+ * there is a race condition with truncate path. (see
+@@ -446,29 +443,69 @@
+ CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
+ inode->i_ino, i_size_read(inode));
+ }
+- lov_stripe_unlock(lsm);
+ }
+
+ /* If mapping is writeable, adjust kms to cover this page,
+ * but do not extend kms beyond actual file size.
+ * policy.l_extent.end is set to the end of the page by policy_from_vma
+ * bug 10919 */
+- lov_stripe_lock(lsm);
+ if (mode == LCK_PW)
+- obd_adjust_kms(ll_i2obdexp(inode), lsm,
++ obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd,
+ min_t(loff_t, policy.l_extent.end + 1,
+ i_size_read(inode)), 0);
+- lov_stripe_unlock(lsm);
++ lov_stripe_unlock(lli->lli_smd);
+
+ /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+ * the kernel will not read other pages not covered by ldlm in
+ * filemap_nopage. we do our readahead in ll_readpage.
+ */
+- rand_read = vma->vm_flags & VM_RAND_READ;
+- seq_read = vma->vm_flags & VM_SEQ_READ;
++ *save_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
+ vma->vm_flags &= ~ VM_SEQ_READ;
+ vma->vm_flags |= VM_RAND_READ;
+
++ return 1;
++}
++
++static void ll_put_extent_lock(struct vm_area_struct *vma, int save_flags,
++ struct lustre_handle *lockh)
++{
++ struct file *filp = vma->vm_file;
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
++ struct inode *inode = filp->f_dentry->d_inode;
++ ldlm_mode_t mode;
++
++ mode = mode_from_vma(vma);
++ vma->vm_flags &= ~(VM_RAND_READ | VM_SEQ_READ);
++ vma->vm_flags |= save_flags;
++
++ ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, lockh);
++}
++
++#ifndef HAVE_VM_OP_FAULT
++/**
++ * Page fault handler (old ->nopage API).
++ *
++ * \param vma - virtual memory area the fault occurred in
++ * \param address - virtual address that triggered the fault
++ * \param type - type of the fault
++ *
++ * \return allocated and filled page for the address
++ * \retval NOPAGE_SIGBUS if no page exists at this address
++ * \retval NOPAGE_OOM if there is no memory to allocate a new page
++ */
++struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
++ int *type)
++{
++ struct lustre_handle lockh = { 0 };
++ int save_flags = 0;
++ unsigned long pgoff;
++ struct page *page;
++ ENTRY;
++
++ pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
++ if (!ll_get_extent_lock(vma, pgoff, &save_flags, &lockh))
++ RETURN(NOPAGE_SIGBUS);
++
+ page = filemap_nopage(vma, address, type);
+ if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM)
+ LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
+@@ -477,13 +514,48 @@
+ CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", address,
+ (long)type);
+
+- vma->vm_flags &= ~VM_RAND_READ;
+- vma->vm_flags |= (rand_read | seq_read);
++ ll_put_extent_lock(vma, save_flags, &lockh);
+
+- ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+ RETURN(page);
+ }
+
++#else
++/* New fault() API */
++/**
++ * Page fault handler (new ->fault API).
++ *
++ * \param vma - virtual memory area the fault occurred in
++ * \param vmf - fault descriptor; carries the file pgoff in and, on
++ * success, the faulted page out
++ *
++ * \return VM_FAULT_* status as returned by filemap_fault()
++ * \retval VM_FAULT_SIGBUS if the extent lock could not be taken
++ * \retval VM_FAULT_OOM if no page could be allocated
++ */
++int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ struct lustre_handle lockh = { 0 };
++ int save_flags = 0;
++ int rc;
++ ENTRY;
++
++ if (!ll_get_extent_lock(vma, vmf->pgoff, &save_flags, &lockh))
++ RETURN(VM_FAULT_SIGBUS);
++
++ rc = filemap_fault(vma, vmf);
++ if (vmf->page)
++ LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
++ vmf->virtual_address);
++ else
++ CDEBUG(D_PAGE, "got addr %p - SIGBUS\n",
++ vmf->virtual_address);
++
++ ll_put_extent_lock(vma, save_flags, &lockh);
++
++ RETURN(rc);
++}
++#endif
++
+ /* To avoid cancel the locks covering mmapped region for lock cache pressure,
+ * we track the mapped vma count by lli_mmap_cnt.
+ * ll_vm_open(): when first vma is linked, split locks from lru.
+@@ -548,6 +620,7 @@
+ }
+ }
+
++#ifndef HAVE_VM_OP_FAULT
+ #ifndef HAVE_FILEMAP_POPULATE
+ static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+ #endif
+@@ -562,6 +635,7 @@
+ rc = filemap_populate(area, address, len, prot, pgoff, 1);
+ RETURN(rc);
+ }
++#endif
+
+ /* return the user space pointer that maps to a file offset via a vma */
+ static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
+@@ -588,10 +662,14 @@
+ }
+
+ static struct vm_operations_struct ll_file_vm_ops = {
+- .nopage = ll_nopage,
+ .open = ll_vm_open,
+ .close = ll_vm_close,
++#ifdef HAVE_VM_OP_FAULT
++ .fault = ll_fault,
++#else
++ .nopage = ll_nopage,
+ .populate = ll_populate,
++#endif
+ };
+
+ int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
+@@ -602,7 +680,7 @@
+ ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), LPROC_LL_MAP, 1);
+ rc = generic_file_mmap(file, vma);
+ if (rc == 0) {
+-#ifndef HAVE_FILEMAP_POPULATE
++#if !defined(HAVE_FILEMAP_POPULATE) && !defined(HAVE_VM_OP_FAULT)
+ if (!filemap_populate)
+ filemap_populate = vma->vm_ops->populate;
+ #endif
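
The mmap rework above splits the old ll_nopage() body into ll_get_extent_lock()/ll_put_extent_lock(), shared by the legacy ->nopage and the new ->fault entry points; the only per-API difference left is how the file page offset is obtained. A standalone sketch of that arithmetic (CFS_PAGE_SHIFT assumed to be 12, i.e. 4 KiB pages):

#include <stdio.h>

#define CFS_PAGE_SHIFT 12       /* assumed: 4 KiB pages */

/* same derivation ll_nopage() performs before calling
 * ll_get_extent_lock(): map a faulting virtual address back to the
 * file-relative page index */
static unsigned long addr_to_pgoff(unsigned long address,
                                   unsigned long vm_start,
                                   unsigned long vm_pgoff)
{
        return ((address - vm_start) >> CFS_PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
        /* a mapping starting at 0x7f0000000000 that covers the file
         * from page 16 onward; a fault 3 pages into the mapping
         * therefore lands on file page 19 */
        printf("file pgoff = %lu\n",
               addr_to_pgoff(0x7f0000003000UL, 0x7f0000000000UL, 16));
        return 0;
}

With the fault API the kernel hands this value in directly as vmf->pgoff, which is why ll_fault() needs no address arithmetic at all.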
+diff -urNad lustre~/lustre/llite/llite_nfs.c lustre/lustre/llite/llite_nfs.c
+--- lustre~/lustre/llite/llite_nfs.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/llite_nfs.c 2009-03-12 11:02:51.000000000 +0100
+@@ -68,36 +68,30 @@
+ }
+
+ static struct inode * search_inode_for_lustre(struct super_block *sb,
+- unsigned long ino,
+- unsigned long generation,
+- int mode)
++ struct ll_fid *iid)
+ {
+ struct ptlrpc_request *req = NULL;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+- struct ll_fid fid;
+ unsigned long valid = 0;
+ int eadatalen = 0, rc;
+ struct inode *inode = NULL;
+- struct ll_fid iid = { .id = ino, .generation = generation };
+ ENTRY;
+
+- inode = ILOOKUP(sb, ino, ll_nfs_test_inode, &iid);
++ inode = ILOOKUP(sb, iid->id, ll_nfs_test_inode, iid);
+
+ if (inode)
+ RETURN(inode);
+- if (S_ISREG(mode)) {
+- rc = ll_get_max_mdsize(sbi, &eadatalen);
+- if (rc)
+- RETURN(ERR_PTR(rc));
+- valid |= OBD_MD_FLEASIZE;
+- }
+- fid.id = (__u64)ino;
+- fid.generation = generation;
+- fid.f_type = mode;
+
+- rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, eadatalen, &req);
++ rc = ll_get_max_mdsize(sbi, &eadatalen);
++ if (rc)
++ RETURN(ERR_PTR(rc));
++
++ valid |= OBD_MD_FLEASIZE;
++
++ /* mds_fid2dentry ignores f_type */
++ rc = mdc_getattr(sbi->ll_mdc_exp, iid, valid, eadatalen, &req);
+ if (rc) {
+- CERROR("failure %d inode %lu\n", rc, ino);
++ CERROR("failure %d inode "LPU64"\n", rc, iid->id);
+ RETURN(ERR_PTR(rc));
+ }
+
+@@ -111,27 +105,27 @@
+ RETURN(inode);
+ }
+
+-static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
+- __u32 generation, umode_t mode)
++static struct dentry *ll_iget_for_nfs(struct super_block *sb,
++ struct ll_fid *iid)
+ {
+ struct inode *inode;
+ struct dentry *result;
+ ENTRY;
+
+- if (ino == 0)
++ if (iid->id == 0)
+ RETURN(ERR_PTR(-ESTALE));
+
+- inode = search_inode_for_lustre(sb, ino, generation, mode);
+- if (IS_ERR(inode)) {
++ inode = search_inode_for_lustre(sb, iid);
++ if (IS_ERR(inode))
+ RETURN(ERR_PTR(PTR_ERR(inode)));
+- }
++
+ if (is_bad_inode(inode) ||
+- (generation && inode->i_generation != generation)){
++ (iid->generation && inode->i_generation != iid->generation)) {
+ /* we didn't find the right inode.. */
+ CERROR("Inode %lu, Bad count: %lu %d or version %u %u\n",
+ inode->i_ino, (unsigned long)inode->i_nlink,
+ atomic_read(&inode->i_count), inode->i_generation,
+- generation);
++ iid->generation);
+ iput(inode);
+ RETURN(ERR_PTR(-ESTALE));
+ }
+@@ -146,57 +140,102 @@
+ RETURN(result);
+ }
+
+-struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
+- int fhtype, int parent)
++#define LUSTRE_NFS_FID 0x94
++
++struct lustre_nfs_fid {
++ struct ll_fid child;
++ struct ll_fid parent;
++ umode_t mode;
++};
++
++/* The return value is the file handle type:
++ * LUSTRE_NFS_FID -- the handle contains both the child and the
++ * parent file handle (a struct lustre_nfs_fid);
++ * 255 -- error: the caller's buffer is too small.
++ */
++static int ll_encode_fh(struct dentry *de, __u32 *fh, int *plen,
++ int connectable)
+ {
+- switch (fhtype) {
+- case 2:
+- if (len < 5)
+- break;
+- if (parent)
+- return ll_iget_for_nfs(sb, data[3], 0, data[4]);
+- case 1:
+- if (len < 3)
+- break;
+- if (parent)
+- break;
+- return ll_iget_for_nfs(sb, data[0], data[1], data[2]);
+- default: break;
+- }
+- return ERR_PTR(-EINVAL);
++ struct inode *inode = de->d_inode;
++ struct inode *parent = de->d_parent->d_inode;
++ struct lustre_nfs_fid *nfs_fid = (void *)fh;
++ ENTRY;
++
++ CDEBUG(D_INFO, "encoding for (%lu) maxlen=%d minlen=%lu\n",
++ inode->i_ino, *plen,
++ sizeof(struct lustre_nfs_fid));
++
++ if (*plen < sizeof(struct lustre_nfs_fid))
++ RETURN(255);
++
++ ll_inode2fid(&nfs_fid->child, inode);
++ ll_inode2fid(&nfs_fid->parent, parent);
++
++ nfs_fid->mode = (S_IFMT & inode->i_mode);
++ *plen = sizeof(struct lustre_nfs_fid);
++
++ RETURN(LUSTRE_NFS_FID);
+ }
+
+-int ll_dentry_to_fh(struct dentry *dentry, __u32 *datap, int *lenp,
+- int need_parent)
++#ifdef HAVE_FH_TO_DENTRY
++static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
++ int fh_len, int fh_type)
+ {
+- if (*lenp < 3)
+- return 255;
+- *datap++ = dentry->d_inode->i_ino;
+- *datap++ = dentry->d_inode->i_generation;
+- *datap++ = (__u32)(S_IFMT & dentry->d_inode->i_mode);
++ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+- if (*lenp == 3 || S_ISDIR(dentry->d_inode->i_mode)) {
+- *lenp = 3;
+- return 1;
+- }
+- if (dentry->d_parent) {
+- *datap++ = dentry->d_parent->d_inode->i_ino;
+- *datap++ = (__u32)(S_IFMT & dentry->d_parent->d_inode->i_mode);
++ if (fh_type != LUSTRE_NFS_FID)
++ RETURN(ERR_PTR(-EINVAL));
+
+- *lenp = 5;
+- return 2;
+- }
+- *lenp = 3;
+- return 1;
++ RETURN(ll_iget_for_nfs(sb, &nfs_fid->child));
++}
++static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
++ int fh_len, int fh_type)
++{
++ struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
++
++ if (fh_type != LUSTRE_NFS_FID)
++ RETURN(ERR_PTR(-EINVAL));
++ RETURN(ll_iget_for_nfs(sb, &nfs_fid->parent));
+ }
+
+-#if THREAD_SIZE >= 8192
++#else
++/*
++ * fh_len is counted in __u32 units; the handle itself is
++ * composed of a fid pair and a mode (struct lustre_nfs_fid).
++ */
++static struct dentry *ll_decode_fh(struct super_block *sb, __u32 *fh, int fh_len,
++ int fh_type,
++ int (*acceptable)(void *, struct dentry *),
++ void *context)
++{
++ struct lustre_nfs_fid *nfs_fid = (void *)fh;
++ struct dentry *entry;
++ ENTRY;
++
++ CDEBUG(D_INFO, "decoding for "LPU64" fh_len=%d fh_type=%x\n",
++ nfs_fid->child.id, fh_len, fh_type);
++
++ if (fh_type != LUSTRE_NFS_FID)
++ RETURN(ERR_PTR(-ESTALE));
++
++ entry = sb->s_export_op->find_exported_dentry(sb, &nfs_fid->child,
++ &nfs_fid->parent,
++ acceptable, context);
++ RETURN(entry);
++}
++
++
+ struct dentry *ll_get_dentry(struct super_block *sb, void *data)
+ {
+- __u32 *inump = (__u32*)data;
+- return ll_iget_for_nfs(sb, inump[0], inump[1], S_IFREG);
++ struct lustre_nfs_fid *fid = data;
++ ENTRY;
++
++ RETURN(ll_iget_for_nfs(sb, &fid->child));
++
+ }
+
++#endif
++
+ struct dentry *ll_get_parent(struct dentry *dchild)
+ {
+ struct ptlrpc_request *req = NULL;
+@@ -208,11 +247,11 @@
+ char dotdot[] = "..";
+ int rc = 0;
+ ENTRY;
+-
++
+ LASSERT(dir && S_ISDIR(dir->i_mode));
+-
+- sbi = ll_s2sbi(dir->i_sb);
+-
++
++ sbi = ll_s2sbi(dir->i_sb);
++
+ fid.id = (__u64)dir->i_ino;
+ fid.generation = dir->i_generation;
+ fid.f_type = S_IFDIR;
+@@ -223,11 +262,12 @@
+ CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+ return ERR_PTR(rc);
+ }
+- body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof (*body));
+-
++ body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof (*body));
++
+ LASSERT((body->valid & OBD_MD_FLGENER) && (body->valid & OBD_MD_FLID));
+-
+- result = ll_iget_for_nfs(dir->i_sb, body->ino, body->generation, S_IFDIR);
++ fid.id = body->ino;
++ fid.generation = body->generation;
++ result = ll_iget_for_nfs(dir->i_sb, &fid);
+
+ if (IS_ERR(result))
+ rc = PTR_ERR(result);
+@@ -236,10 +276,18 @@
+ if (rc)
+ return ERR_PTR(rc);
+ RETURN(result);
+-}
++}
+
++
++#if THREAD_SIZE >= 8192
+ struct export_operations lustre_export_operations = {
+- .get_parent = ll_get_parent,
+- .get_dentry = ll_get_dentry,
++ .encode_fh = ll_encode_fh,
++#ifdef HAVE_FH_TO_DENTRY
++ .fh_to_dentry = ll_fh_to_dentry,
++ .fh_to_parent = ll_fh_to_parent,
++#else
++ .get_dentry = ll_get_dentry,
++ .decode_fh = ll_decode_fh,
++#endif
+ };
+ #endif
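
The NFS export rework above replaces the loose ino/generation/mode triple with a fixed struct lustre_nfs_fid handle. A standalone sketch of the encode side's layout and size check (types are re-declared locally for illustration; the real ll_fid comes from the Lustre headers, and the length convention of __u32 units follows the kernel's encode_fh interface):

#include <stdio.h>
#include <sys/stat.h>

typedef unsigned long long __u64;
typedef unsigned int __u32;

struct ll_fid { __u64 id; __u32 generation; __u32 f_type; };

struct lustre_nfs_fid {
        struct ll_fid child;
        struct ll_fid parent;
        unsigned int  mode;
};

#define LUSTRE_NFS_FID 0x94

/* mirrors ll_encode_fh(): refuse with 255 if the buffer is too small,
 * otherwise store child fid, parent fid and the S_IFMT bits and report
 * the Lustre handle type; plen counts __u32 slots */
static int encode_fh_sketch(__u32 *fh, int *plen, struct ll_fid child,
                            struct ll_fid parent, unsigned int mode)
{
        struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fh;

        if (*plen * sizeof(__u32) < sizeof(*nfs_fid))
                return 255;
        nfs_fid->child = child;
        nfs_fid->parent = parent;
        nfs_fid->mode = mode & S_IFMT;
        *plen = sizeof(*nfs_fid) / sizeof(__u32);
        return LUSTRE_NFS_FID;
}

int main(void)
{
        __u32 fh[16];
        int len = 16;
        struct ll_fid child = { 42, 7, 0 }, parent = { 41, 3, 0 };
        int type = encode_fh_sketch(fh, &len, child, parent, S_IFREG);

        printf("handle type %#x, %d __u32 slots used\n", type, len);
        return 0;
}

Packing the parent fid into every handle is what lets ll_fh_to_parent() (and the older find_exported_dentry() path) work without a second MDS round trip.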
+diff -urNad lustre~/lustre/llite/lloop.c lustre/lustre/llite/lloop.c
+--- lustre~/lustre/llite/lloop.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/lloop.c 2009-03-12 11:04:30.000000000 +0100
+@@ -152,7 +152,7 @@
+ struct semaphore lo_bh_mutex;
+ atomic_t lo_pending;
+
+- request_queue_t *lo_queue;
++ struct request_queue *lo_queue;
+
+ /* data to handle bio for lustre. */
+ struct lo_request_data {
+@@ -283,7 +283,7 @@
+ return bio;
+ }
+
+-static int loop_make_request(request_queue_t *q, struct bio *old_bio)
++static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+ {
+ struct lloop_device *lo = q->queuedata;
+ int rw = bio_rw(old_bio);
+@@ -312,7 +312,7 @@
+ if (atomic_dec_and_test(&lo->lo_pending))
+ up(&lo->lo_bh_mutex);
+ out:
+- bio_io_error(old_bio, old_bio->bi_size);
++ cfs_bio_io_error(old_bio, old_bio->bi_size);
+ return 0;
+ inactive:
+ spin_unlock_irq(&lo->lo_lock);
+@@ -322,7 +322,7 @@
+ /*
+ * kick off io on the underlying address space
+ */
+-static void loop_unplug(request_queue_t *q)
++static void loop_unplug(struct request_queue *q)
+ {
+ struct lloop_device *lo = q->queuedata;
+
+@@ -736,7 +736,7 @@
+
+ out_mem4:
+ while (i--)
+- blk_put_queue(loop_dev[i].lo_queue);
++ blk_cleanup_queue(loop_dev[i].lo_queue);
+ i = max_loop;
+ out_mem3:
+ while (i--)
+@@ -758,7 +758,7 @@
+ ll_iocontrol_unregister(ll_iocontrol_magic);
+ for (i = 0; i < max_loop; i++) {
+ del_gendisk(disks[i]);
+- blk_put_queue(loop_dev[i].lo_queue);
++ blk_cleanup_queue(loop_dev[i].lo_queue);
+ put_disk(disks[i]);
+ }
+ if (ll_unregister_blkdev(lloop_major, "lloop"))
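
For context on the lloop hunks: requests never block in loop_make_request(); they are queued on a singly linked FIFO and drained by the loop thread. A standalone sketch of that FIFO discipline, without the spinlock/irq protection the real driver takes (loop_add_bio()/loop_get_bio() appear in full in the listing below):

#include <stdio.h>

struct bio_like { int id; struct bio_like *bi_next; };
struct lo_like  { struct bio_like *lo_bio, *lo_biotail; };

/* append at the tail, as loop_add_bio() does */
static void add_bio(struct lo_like *lo, struct bio_like *bio)
{
        bio->bi_next = NULL;
        if (lo->lo_biotail) {
                lo->lo_biotail->bi_next = bio;
                lo->lo_biotail = bio;
        } else {
                lo->lo_bio = lo->lo_biotail = bio;
        }
}

/* pop from the head, as loop_get_bio() does */
static struct bio_like *get_bio(struct lo_like *lo)
{
        struct bio_like *bio = lo->lo_bio;

        if (bio) {
                if (bio == lo->lo_biotail)
                        lo->lo_biotail = NULL;
                lo->lo_bio = bio->bi_next;
                bio->bi_next = NULL;
        }
        return bio;
}

int main(void)
{
        struct lo_like lo = { NULL, NULL };
        struct bio_like a = { 1, NULL }, b = { 2, NULL };
        struct bio_like *bio;

        add_bio(&lo, &a);
        add_bio(&lo, &b);
        while ((bio = get_bio(&lo)))
                printf("bio %d\n", bio->id);    /* prints 1 then 2 */
        return 0;
}

The tail pointer keeps both operations O(1), so bios are serviced strictly in arrival order.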
+diff -urNad lustre~/lustre/llite/lloop.c.orig lustre/lustre/llite/lloop.c.orig
+--- lustre~/lustre/llite/lloop.c.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/llite/lloop.c.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,777 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ */
++
++/*
++ * linux/drivers/block/loop.c
++ *
++ * Written by Theodore Ts'o, 3/29/93
++ *
++ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
++ * permitted under the GNU General Public License.
++ *
++ * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
++ * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
++ *
++ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
++ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
++ *
++ * Fixed do_loop_request() re-entrancy - Vincent.Renardias at waw.com Mar 20, 1997
++ *
++ * Added devfs support - Richard Gooch <rgooch at atnf.csiro.au> 16-Jan-1998
++ *
++ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
++ *
++ * Loadable modules and other fixes by AK, 1998
++ *
++ * Make real block number available to downstream transfer functions, enables
++ * CBC (and relatives) mode encryption requiring unique IVs per data block.
++ * Reed H. Petty, rhp at draper.net
++ *
++ * Maximum number of loop devices now dynamic via max_loop module parameter.
++ * Russell Kroll <rkroll at exploits.org> 19990701
++ *
++ * Maximum number of loop devices when compiled-in now selectable by passing
++ * max_loop=<1-255> to the kernel on boot.
++ * Erik I. Bols?, <eriki at himolde.no>, Oct 31, 1999
++ *
++ * Completely rewrite request handling to be make_request_fn style and
++ * non blocking, pushing work to a helper thread. Lots of fixes from
++ * Al Viro too.
++ * Jens Axboe <axboe at suse.de>, Nov 2000
++ *
++ * Support up to 256 loop devices
++ * Heinz Mauelshagen <mge at sistina.com>, Feb 2002
++ *
++ * Support for falling back on the write file operation when the address space
++ * operations prepare_write and/or commit_write are not available on the
++ * backing filesystem.
++ * Anton Altaparmakov, 16 Feb 2005
++ *
++ * Still To Fix:
++ * - Advisory locking is ignored here.
++ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
++ *
++ */
++
++#ifndef AUTOCONF_INCLUDED
++#include <linux/config.h>
++#endif
++#include <linux/module.h>
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/stat.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/wait.h>
++#include <linux/blkdev.h>
++#include <linux/blkpg.h>
++#include <linux/init.h>
++#include <linux/smp_lock.h>
++#include <linux/swap.h>
++#include <linux/slab.h>
++#include <linux/suspend.h>
++#include <linux/writeback.h>
++#include <linux/buffer_head.h> /* for invalidate_bdev() */
++#include <linux/completion.h>
++#include <linux/highmem.h>
++#include <linux/gfp.h>
++#include <linux/swap.h>
++#include <linux/pagevec.h>
++
++#include <asm/uaccess.h>
++
++#include <lustre_lib.h>
++#include <lustre_lite.h>
++#include "llite_internal.h"
++
++#define LLOOP_MAX_SEGMENTS PTLRPC_MAX_BRW_PAGES
++
++/* Possible states of device */
++enum {
++ LLOOP_UNBOUND,
++ LLOOP_BOUND,
++ LLOOP_RUNDOWN,
++};
++
++struct lloop_device {
++ int lo_number;
++ int lo_refcnt;
++ loff_t lo_offset;
++ loff_t lo_sizelimit;
++ int lo_flags;
++ int (*ioctl)(struct lloop_device *, int cmd,
++ unsigned long arg);
++
++ struct file * lo_backing_file;
++ struct block_device *lo_device;
++ unsigned lo_blocksize;
++
++ int old_gfp_mask;
++
++ spinlock_t lo_lock;
++ struct bio *lo_bio;
++ struct bio *lo_biotail;
++ int lo_state;
++ struct semaphore lo_sem;
++ struct semaphore lo_ctl_mutex;
++ struct semaphore lo_bh_mutex;
++ atomic_t lo_pending;
++
++ request_queue_t *lo_queue;
++
++ /* data to handle bio for lustre. */
++ struct lo_request_data {
++ struct brw_page lrd_pages[LLOOP_MAX_SEGMENTS];
++ struct obdo lrd_oa;
++ } lo_requests[1];
++
++};
++
++/*
++ * Loop flags
++ */
++enum {
++ LO_FLAGS_READ_ONLY = 1,
++};
++
++static int lloop_major;
++static int max_loop = 8;
++static struct lloop_device *loop_dev;
++static struct gendisk **disks;
++static struct semaphore lloop_mutex;
++static void *ll_iocontrol_magic = NULL;
++
++static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
++{
++ loff_t size, offset, loopsize;
++
++ /* Compute loopsize in bytes */
++ size = i_size_read(file->f_mapping->host);
++ offset = lo->lo_offset;
++ loopsize = size - offset;
++ if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
++ loopsize = lo->lo_sizelimit;
++
++ /*
++ * Unfortunately, if we want to do I/O on the device,
++ * the number of 512-byte sectors has to fit into a sector_t.
++ */
++ return loopsize >> 9;
++}
++
++static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio)
++{
++ struct inode *inode = lo->lo_backing_file->f_dentry->d_inode;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct obd_info oinfo = {{{0}}};
++ struct brw_page *pg = lo->lo_requests[0].lrd_pages;
++ struct obdo *oa = &lo->lo_requests[0].lrd_oa;
++ pgoff_t offset;
++ int ret, cmd, i;
++ struct bio_vec *bvec;
++
++ BUG_ON(bio->bi_hw_segments > LLOOP_MAX_SEGMENTS);
++
++ offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
++ bio_for_each_segment(bvec, bio, i) {
++ BUG_ON(bvec->bv_offset != 0);
++ BUG_ON(bvec->bv_len != CFS_PAGE_SIZE);
++
++ pg->pg = bvec->bv_page;
++ pg->off = offset;
++ pg->count = bvec->bv_len;
++ pg->flag = OBD_BRW_SRVLOCK;
++
++ pg++;
++ offset += bvec->bv_len;
++ }
++
++ oa->o_mode = inode->i_mode;
++ oa->o_id = lsm->lsm_object_id;
++ oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
++ obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
++
++ cmd = OBD_BRW_READ;
++ if (bio_rw(bio) == WRITE)
++ cmd = OBD_BRW_WRITE;
++
++ if (cmd == OBD_BRW_WRITE)
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, bio->bi_size);
++ else
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, bio->bi_size);
++ oinfo.oi_oa = oa;
++ oinfo.oi_md = lsm;
++ ret = obd_brw(cmd, ll_i2obdexp(inode), &oinfo,
++ (obd_count)(i - bio->bi_idx),
++ lo->lo_requests[0].lrd_pages, NULL);
++ if (ret == 0)
++ obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
++ return ret;
++}
++
++
++/*
++ * Add bio to back of pending list
++ */
++static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&lo->lo_lock, flags);
++ if (lo->lo_biotail) {
++ lo->lo_biotail->bi_next = bio;
++ lo->lo_biotail = bio;
++ } else
++ lo->lo_bio = lo->lo_biotail = bio;
++ spin_unlock_irqrestore(&lo->lo_lock, flags);
++
++ up(&lo->lo_bh_mutex);
++}
++
++/*
++ * Grab first pending buffer
++ */
++static struct bio *loop_get_bio(struct lloop_device *lo)
++{
++ struct bio *bio;
++
++ spin_lock_irq(&lo->lo_lock);
++ if ((bio = lo->lo_bio)) {
++ if (bio == lo->lo_biotail)
++ lo->lo_biotail = NULL;
++ lo->lo_bio = bio->bi_next;
++ bio->bi_next = NULL;
++ }
++ spin_unlock_irq(&lo->lo_lock);
++
++ return bio;
++}
++
++static int loop_make_request(request_queue_t *q, struct bio *old_bio)
++{
++ struct lloop_device *lo = q->queuedata;
++ int rw = bio_rw(old_bio);
++
++ if (!lo)
++ goto out;
++
++ spin_lock_irq(&lo->lo_lock);
++ if (lo->lo_state != LLOOP_BOUND)
++ goto inactive;
++ atomic_inc(&lo->lo_pending);
++ spin_unlock_irq(&lo->lo_lock);
++
++ if (rw == WRITE) {
++ if (lo->lo_flags & LO_FLAGS_READ_ONLY)
++ goto err;
++ } else if (rw == READA) {
++ rw = READ;
++ } else if (rw != READ) {
++ CERROR("lloop: unknown command (%x)\n", rw);
++ goto err;
++ }
++ loop_add_bio(lo, old_bio);
++ return 0;
++err:
++ if (atomic_dec_and_test(&lo->lo_pending))
++ up(&lo->lo_bh_mutex);
++out:
++ bio_io_error(old_bio, old_bio->bi_size);
++ return 0;
++inactive:
++ spin_unlock_irq(&lo->lo_lock);
++ goto out;
++}
++
++/*
++ * kick off io on the underlying address space
++ */
++static void loop_unplug(request_queue_t *q)
++{
++ struct lloop_device *lo = q->queuedata;
++
++ clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
++ blk_run_address_space(lo->lo_backing_file->f_mapping);
++}
++
++static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
++{
++ int ret;
++ ret = do_bio_filebacked(lo, bio);
++ bio_endio(bio, bio->bi_size, ret);
++}
++
++/*
++ * worker thread that handles reads/writes to file backed loop devices,
++ * to avoid blocking in our make_request_fn. it also does loop decrypting
++ * on reads for block backed loop, as that is too heavy to do from
++ * b_end_io context where irqs may be disabled.
++ */
++static int loop_thread(void *data)
++{
++ struct lloop_device *lo = data;
++ struct bio *bio;
++
++ daemonize("lloop%d", lo->lo_number);
++
++ set_user_nice(current, -20);
++
++ lo->lo_state = LLOOP_BOUND;
++ atomic_inc(&lo->lo_pending);
++
++ /*
++ * up sem, we are running
++ */
++ up(&lo->lo_sem);
++
++ for (;;) {
++ down_interruptible(&lo->lo_bh_mutex);
++ /*
++ * could be upped because of tear-down, not because of
++ * pending work
++ */
++ if (!atomic_read(&lo->lo_pending))
++ break;
++
++ bio = loop_get_bio(lo);
++ if (!bio) {
++ CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
++ continue;
++ }
++ loop_handle_bio(lo, bio);
++
++ /*
++ * upped both for pending work and tear-down, lo_pending
++ * will hit zero then
++ */
++ if (atomic_dec_and_test(&lo->lo_pending))
++ break;
++ }
++
++ up(&lo->lo_sem);
++ return 0;
++}
++
++static int loop_set_fd(struct lloop_device *lo, struct file *unused,
++ struct block_device *bdev, struct file *file)
++{
++ struct inode *inode;
++ struct address_space *mapping;
++ int lo_flags = 0;
++ int error;
++ loff_t size;
++
++ if (!try_module_get(THIS_MODULE))
++ return -ENODEV;
++
++ error = -EBUSY;
++ if (lo->lo_state != LLOOP_UNBOUND)
++ goto out;
++
++ mapping = file->f_mapping;
++ inode = mapping->host;
++
++ error = -EINVAL;
++ if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
++ goto out;
++
++ if (!(file->f_mode & FMODE_WRITE))
++ lo_flags |= LO_FLAGS_READ_ONLY;
++
++ size = get_loop_size(lo, file);
++
++ if ((loff_t)(sector_t)size != size) {
++ error = -EFBIG;
++ goto out;
++ }
++
++ /* remove all pages from the page cache so no dirty pages remain. */
++ truncate_inode_pages(mapping, 0);
++
++ set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
++
++ lo->lo_blocksize = CFS_PAGE_SIZE;
++ lo->lo_device = bdev;
++ lo->lo_flags = lo_flags;
++ lo->lo_backing_file = file;
++ lo->ioctl = NULL;
++ lo->lo_sizelimit = 0;
++ lo->old_gfp_mask = mapping_gfp_mask(mapping);
++ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
++
++ lo->lo_bio = lo->lo_biotail = NULL;
++
++ /*
++ * set queue make_request_fn, and add limits based on lower level
++ * device
++ */
++ blk_queue_make_request(lo->lo_queue, loop_make_request);
++ lo->lo_queue->queuedata = lo;
++ lo->lo_queue->unplug_fn = loop_unplug;
++
++ /* queue parameters */
++ blk_queue_hardsect_size(lo->lo_queue, CFS_PAGE_SIZE);
++ blk_queue_max_sectors(lo->lo_queue, LLOOP_MAX_SEGMENTS);
++ blk_queue_max_phys_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
++
++ set_capacity(disks[lo->lo_number], size);
++ bd_set_size(bdev, size << 9);
++
++ set_blocksize(bdev, lo->lo_blocksize);
++
++ kernel_thread(loop_thread, lo, CLONE_KERNEL);
++ down(&lo->lo_sem);
++ return 0;
++
++ out:
++ /* This is safe: open() is still holding a reference. */
++ module_put(THIS_MODULE);
++ return error;
++}
++
++static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
++ int count)
++{
++ struct file *filp = lo->lo_backing_file;
++ int gfp = lo->old_gfp_mask;
++
++ if (lo->lo_state != LLOOP_BOUND)
++ return -ENXIO;
++
++ if (lo->lo_refcnt > count) /* we needed one fd for the ioctl */
++ return -EBUSY;
++
++ if (filp == NULL)
++ return -EINVAL;
++
++ spin_lock_irq(&lo->lo_lock);
++ lo->lo_state = LLOOP_RUNDOWN;
++ if (atomic_dec_and_test(&lo->lo_pending))
++ up(&lo->lo_bh_mutex);
++ spin_unlock_irq(&lo->lo_lock);
++
++ down(&lo->lo_sem);
++ lo->lo_backing_file = NULL;
++ lo->ioctl = NULL;
++ lo->lo_device = NULL;
++ lo->lo_offset = 0;
++ lo->lo_sizelimit = 0;
++ lo->lo_flags = 0;
++ ll_invalidate_bdev(bdev, 0);
++ set_capacity(disks[lo->lo_number], 0);
++ bd_set_size(bdev, 0);
++ mapping_set_gfp_mask(filp->f_mapping, gfp);
++ lo->lo_state = LLOOP_UNBOUND;
++ fput(filp);
++ /* This is safe: open() is still holding a reference. */
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++static int lo_open(struct inode *inode, struct file *file)
++{
++ struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
++
++ down(&lo->lo_ctl_mutex);
++ lo->lo_refcnt++;
++ up(&lo->lo_ctl_mutex);
++
++ return 0;
++}
++
++static int lo_release(struct inode *inode, struct file *file)
++{
++ struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
++
++ down(&lo->lo_ctl_mutex);
++ --lo->lo_refcnt;
++ up(&lo->lo_ctl_mutex);
++
++ return 0;
++}
++
++/* lloop device node's ioctl function. */
++static int lo_ioctl(struct inode *inode, struct file *unused,
++ unsigned int cmd, unsigned long arg)
++{
++ struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
++ struct block_device *bdev = inode->i_bdev;
++ int err = 0;
++
++ down(&lloop_mutex);
++ switch (cmd) {
++ case LL_IOC_LLOOP_DETACH: {
++ err = loop_clr_fd(lo, bdev, 2);
++ if (err == 0)
++ blkdev_put(bdev); /* grabbed in LLOOP_ATTACH */
++ break;
++ }
++
++ case LL_IOC_LLOOP_INFO: {
++ __u64 ino = 0;
++
++ if (lo->lo_state == LLOOP_BOUND)
++ ino = lo->lo_backing_file->f_dentry->d_inode->i_ino;
++
++ if (put_user(ino, (__u64 *)arg))
++ err = -EFAULT;
++ break;
++ }
++
++ default:
++ err = -EINVAL;
++ break;
++ }
++ up(&lloop_mutex);
++
++ return err;
++}
++
++static struct block_device_operations lo_fops = {
++ .owner = THIS_MODULE,
++ .open = lo_open,
++ .release = lo_release,
++ .ioctl = lo_ioctl,
++};
++
++/* dynamic iocontrol callback.
++ * This callback is registered in lloop_init and will be called by
++ * ll_iocontrol_call.
++ * This is a llite regular-file ioctl handler. It is responsible for
++ * attaching a file to, and detaching a file from, an lloop device number.
++ */
++static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
++ unsigned int cmd, unsigned long arg,
++ void *magic, int *rcp)
++{
++ struct lloop_device *lo = NULL;
++ struct block_device *bdev = NULL;
++ int err = 0;
++ dev_t dev;
++
++ if (magic != ll_iocontrol_magic)
++ return LLIOC_CONT;
++
++ if (disks == NULL)
++ GOTO(out1, err = -ENODEV);
++
++ down(&lloop_mutex);
++ switch (cmd) {
++ case LL_IOC_LLOOP_ATTACH: {
++ struct lloop_device *lo_free = NULL;
++ int i;
++
++ for (i = 0; i < max_loop; i++, lo = NULL) {
++ lo = &loop_dev[i];
++ if (lo->lo_state == LLOOP_UNBOUND) {
++ if (!lo_free)
++ lo_free = lo;
++ continue;
++ }
++ if (lo->lo_backing_file->f_dentry->d_inode ==
++ file->f_dentry->d_inode)
++ break;
++ }
++ if (lo || !lo_free)
++ GOTO(out, err = -EBUSY);
++
++ lo = lo_free;
++ dev = MKDEV(lloop_major, lo->lo_number);
++
++ /* bail out if the user-supplied result pointer is not writable */
++ if (put_user((long)old_encode_dev(dev), (long*)arg))
++ GOTO(out, err = -EFAULT);
++
++ bdev = open_by_devnum(dev, file->f_mode);
++ if (IS_ERR(bdev))
++ GOTO(out, err = PTR_ERR(bdev));
++
++ get_file(file);
++ err = loop_set_fd(lo, NULL, bdev, file);
++ if (err) {
++ fput(file);
++ blkdev_put(bdev);
++ }
++
++ break;
++ }
++
++ case LL_IOC_LLOOP_DETACH_BYDEV: {
++ int minor;
++
++ dev = old_decode_dev(arg);
++ if (MAJOR(dev) != lloop_major)
++ GOTO(out, err = -EINVAL);
++
++ minor = MINOR(dev);
++ if (minor > max_loop - 1)
++ GOTO(out, err = -EINVAL);
++
++ lo = &loop_dev[minor];
++ if (lo->lo_state != LLOOP_BOUND)
++ GOTO(out, err = -EINVAL);
++
++ bdev = lo->lo_device;
++ err = loop_clr_fd(lo, bdev, 1);
++ if (err == 0)
++ blkdev_put(bdev); /* grabbed in LLOOP_ATTACH */
++
++ break;
++ }
++
++ default:
++ err = -EINVAL;
++ break;
++ }
++
++out:
++ up(&lloop_mutex);
++out1:
++ if (rcp)
++ *rcp = err;
++ return LLIOC_STOP;
++}
++
++static int __init lloop_init(void)
++{
++ int i;
++ unsigned int cmdlist[] = {
++ LL_IOC_LLOOP_ATTACH,
++ LL_IOC_LLOOP_DETACH_BYDEV,
++ };
++
++ if (max_loop < 1 || max_loop > 256) {
++ CWARN("lloop: invalid max_loop (must be between"
++ " 1 and 256), using default (8)\n");
++ max_loop = 8;
++ }
++
++ lloop_major = register_blkdev(0, "lloop");
++ if (lloop_major < 0)
++ return -EIO;
++
++ ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
++ if (ll_iocontrol_magic == NULL)
++ goto out_mem1;
++
++ loop_dev = kmalloc(max_loop * sizeof(struct lloop_device), GFP_KERNEL);
++ if (!loop_dev)
++ goto out_mem1;
++ memset(loop_dev, 0, max_loop * sizeof(struct lloop_device));
++
++ disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL);
++ if (!disks)
++ goto out_mem2;
++
++ for (i = 0; i < max_loop; i++) {
++ disks[i] = alloc_disk(1);
++ if (!disks[i])
++ goto out_mem3;
++ }
++
++ init_MUTEX(&lloop_mutex);
++
++ for (i = 0; i < max_loop; i++) {
++ struct lloop_device *lo = &loop_dev[i];
++ struct gendisk *disk = disks[i];
++
++ memset(lo, 0, sizeof(*lo));
++ lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
++ if (!lo->lo_queue)
++ goto out_mem4;
++
++ init_MUTEX(&lo->lo_ctl_mutex);
++ init_MUTEX_LOCKED(&lo->lo_sem);
++ init_MUTEX_LOCKED(&lo->lo_bh_mutex);
++ lo->lo_number = i;
++ spin_lock_init(&lo->lo_lock);
++ disk->major = lloop_major;
++ disk->first_minor = i;
++ disk->fops = &lo_fops;
++ sprintf(disk->disk_name, "lloop%d", i);
++ disk->private_data = lo;
++ disk->queue = lo->lo_queue;
++ }
++
++ /* We cannot fail after we call this, so another loop! */
++ for (i = 0; i < max_loop; i++)
++ add_disk(disks[i]);
++ return 0;
++
++out_mem4:
++ while (i--)
++ blk_put_queue(loop_dev[i].lo_queue);
++ i = max_loop;
++out_mem3:
++ while (i--)
++ put_disk(disks[i]);
++ kfree(disks);
++out_mem2:
++ kfree(loop_dev);
++out_mem1:
++ unregister_blkdev(lloop_major, "lloop");
++ ll_iocontrol_unregister(ll_iocontrol_magic);
++ CERROR("lloop: ran out of memory\n");
++ return -ENOMEM;
++}
++
++static void lloop_exit(void)
++{
++ int i;
++
++ ll_iocontrol_unregister(ll_iocontrol_magic);
++ for (i = 0; i < max_loop; i++) {
++ del_gendisk(disks[i]);
++ blk_put_queue(loop_dev[i].lo_queue);
++ put_disk(disks[i]);
++ }
++ if (ll_unregister_blkdev(lloop_major, "lloop"))
++ CWARN("lloop: cannot unregister blkdev\n");
++
++ kfree(disks);
++ kfree(loop_dev);
++}
++
++module_init(lloop_init);
++module_exit(lloop_exit);
++
++CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum number of lloop devices");
++MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
++MODULE_DESCRIPTION("Lustre virtual block device");
++MODULE_LICENSE("GPL");
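
One detail worth calling out from the listing above: get_loop_size() exports the backing file, minus lo_offset and clamped by lo_sizelimit, as a count of 512-byte sectors, because that is the unit set_capacity() expects. A standalone sketch of the arithmetic with made-up values:

#include <stdio.h>

typedef long long loff_t_sketch;        /* stand-in for the kernel loff_t */

static loff_t_sketch get_loop_size_sketch(loff_t_sketch file_size,
                                          loff_t_sketch offset,
                                          loff_t_sketch sizelimit)
{
        loff_t_sketch loopsize = file_size - offset;

        if (sizelimit > 0 && sizelimit < loopsize)
                loopsize = sizelimit;
        /* the block layer wants the capacity in 512-byte sectors */
        return loopsize >> 9;
}

int main(void)
{
        /* a 1 MiB backing file with the first 4 KiB skipped: 2040 sectors */
        printf("%lld sectors\n",
               get_loop_size_sketch(1 << 20, 4096, 0));
        return 0;
}

The driver then rejects any result that does not survive the round trip through sector_t, which is the -EFBIG check in loop_set_fd().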
+diff -urNad lustre~/lustre/llite/rw.c lustre/lustre/llite/rw.c
+--- lustre~/lustre/llite/rw.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/rw.c 2009-03-12 11:02:51.000000000 +0100
+@@ -61,6 +61,8 @@
+
+ #define DEBUG_SUBSYSTEM S_LLITE
+
++#include <linux/page-flags.h>
++
+ #include <lustre_lite.h>
+ #include "llite_internal.h"
+ #include <linux/lustre_compat25.h>
+@@ -186,7 +188,7 @@
+ GOTO(out_unlock, 0);
+ }
+
+- LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
++ LASSERT(SEM_COUNT(&lli->lli_size_sem) <= 0);
+
+ if (!srvlock) {
+ struct ost_lvb lvb;
+@@ -2122,7 +2124,7 @@
+ rc = generic_write_checks(file, ppos, &count, 0);
+ if (rc)
+ GOTO(out, rc);
+- rc = ll_remove_suid(file->f_dentry, file->f_vfsmnt);
++ rc = ll_remove_suid(file, file->f_vfsmnt);
+ if (rc)
+ GOTO(out, rc);
+ }
+diff -urNad lustre~/lustre/llite/rw.c.orig lustre/lustre/llite/rw.c.orig
+--- lustre~/lustre/llite/rw.c.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/llite/rw.c.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,2215 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ *
++ * lustre/llite/rw.c
++ *
++ * Lustre Lite I/O page cache routines shared by different kernel revs
++ */
++
++#ifndef AUTOCONF_INCLUDED
++#include <linux/config.h>
++#endif
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <linux/stat.h>
++#include <linux/errno.h>
++#include <linux/smp_lock.h>
++#include <linux/unistd.h>
++#include <linux/version.h>
++#include <asm/system.h>
++#include <asm/uaccess.h>
++
++#include <linux/fs.h>
++#include <linux/stat.h>
++#include <asm/uaccess.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++#include <linux/smp_lock.h>
++
++#define DEBUG_SUBSYSTEM S_LLITE
++
++#include <lustre_lite.h>
++#include "llite_internal.h"
++#include <linux/lustre_compat25.h>
++
++#ifndef list_for_each_prev_safe
++#define list_for_each_prev_safe(pos, n, head) \
++ for (pos = (head)->prev, n = pos->prev; pos != (head); \
++ pos = n, n = pos->prev )
++#endif
++
++cfs_mem_cache_t *ll_async_page_slab = NULL;
++size_t ll_async_page_slab_size = 0;
++
++/* SYNCHRONOUS I/O to object storage for an inode */
++static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
++ struct page *page, int flags)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct obd_info oinfo = { { { 0 } } };
++ struct brw_page pg;
++ int rc;
++ ENTRY;
++
++ pg.pg = page;
++ pg.off = ((obd_off)page->index) << CFS_PAGE_SHIFT;
++
++ if ((cmd & OBD_BRW_WRITE) && (pg.off+CFS_PAGE_SIZE>i_size_read(inode)))
++ pg.count = i_size_read(inode) % CFS_PAGE_SIZE;
++ else
++ pg.count = CFS_PAGE_SIZE;
++
++ LL_CDEBUG_PAGE(D_PAGE, page, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
++ cmd & OBD_BRW_WRITE ? "write" : "read", pg.count,
++ inode->i_ino, pg.off, pg.off);
++ if (pg.count == 0) {
++ CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
++ LPU64"\n", inode->i_ino, inode, i_size_read(inode),
++ page->mapping->host, i_size_read(page->mapping->host),
++ page->index, pg.off);
++ }
++
++ pg.flag = flags;
++
++ if (cmd & OBD_BRW_WRITE)
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE,
++ pg.count);
++ else
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ,
++ pg.count);
++ oinfo.oi_oa = oa;
++ oinfo.oi_md = lsm;
++ rc = obd_brw(cmd, ll_i2obdexp(inode), &oinfo, 1, &pg, NULL);
++ if (rc == 0)
++ obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
++ else if (rc != -EIO)
++ CERROR("error from obd_brw: rc = %d\n", rc);
++ RETURN(rc);
++}
++
++int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct obd_info oinfo = { { { 0 } } };
++ struct obdo oa;
++ int rc;
++
++ ENTRY;
++ CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n",
++ lli->lli_smd->lsm_object_id, new_size, new_size);
++
++ oinfo.oi_md = lli->lli_smd;
++ oinfo.oi_policy.l_extent.start = new_size;
++ oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
++ oinfo.oi_oa = &oa;
++ oa.o_id = lli->lli_smd->lsm_object_id;
++ oa.o_valid = OBD_MD_FLID;
++ if (srvlock) {
++ /* set OBD_MD_FLFLAGS in o_valid, only if we
++ * set OBD_FL_TRUNCLOCK, otherwise ost_punch
++ * and filter_setattr get confused, see the comment
++ * in ost_punch */
++ oa.o_flags = OBD_FL_TRUNCLOCK;
++ oa.o_valid |= OBD_MD_FLFLAGS;
++ }
++ obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |OBD_MD_FLFID|
++ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
++ OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER |
++ OBD_MD_FLBLOCKS);
++ rc = obd_punch_rqset(ll_i2obdexp(inode), &oinfo, NULL);
++ if (rc) {
++ CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
++ RETURN(rc);
++ }
++ obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
++ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
++ RETURN(0);
++}
++/* this isn't where truncate starts. roughly:
++ * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs
++ * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
++ * avoid races.
++ *
++ * must be called under ->lli_size_sem */
++void ll_truncate(struct inode *inode)
++{
++ struct ll_inode_info *lli = ll_i2info(inode);
++ int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags);
++ loff_t new_size;
++ ENTRY;
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
++ inode->i_generation, inode, i_size_read(inode), i_size_read(inode));
++
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
++ if (lli->lli_size_sem_owner != current) {
++ EXIT;
++ return;
++ }
++
++ if (!lli->lli_smd) {
++ CDEBUG(D_INODE, "truncate on inode %lu with no objects\n",
++ inode->i_ino);
++ GOTO(out_unlock, 0);
++ }
++
++ LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
++
++ if (!srvlock) {
++ struct ost_lvb lvb;
++ int rc;
++
++ /* XXX I'm pretty sure this is a hack to paper over a more fundamental
++ * race condition. */
++ lov_stripe_lock(lli->lli_smd);
++ inode_init_lvb(inode, &lvb);
++ rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
++ inode->i_blocks = lvb.lvb_blocks;
++ if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
++ CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
++ lli->lli_smd->lsm_object_id, i_size_read(inode),
++ i_size_read(inode));
++ lov_stripe_unlock(lli->lli_smd);
++ GOTO(out_unlock, 0);
++ }
++
++ obd_adjust_kms(ll_i2obdexp(inode), lli->lli_smd,
++ i_size_read(inode), 1);
++ lov_stripe_unlock(lli->lli_smd);
++ }
++
++ if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_LLITE_CHECKSUM) &&
++ (i_size_read(inode) & ~CFS_PAGE_MASK))) {
++ /* If the truncate leaves a partial page, update its checksum */
++ struct page *page = find_get_page(inode->i_mapping,
++ i_size_read(inode) >>
++ CFS_PAGE_SHIFT);
++ if (page != NULL) {
++ struct ll_async_page *llap = llap_cast_private(page);
++ if (llap != NULL) {
++ char *kaddr = kmap_atomic(page, KM_USER0);
++ llap->llap_checksum =
++ init_checksum(OSC_DEFAULT_CKSUM);
++ llap->llap_checksum =
++ compute_checksum(llap->llap_checksum,
++ kaddr, CFS_PAGE_SIZE,
++ OSC_DEFAULT_CKSUM);
++ kunmap_atomic(kaddr, KM_USER0);
++ }
++ page_cache_release(page);
++ }
++ }
++
++ new_size = i_size_read(inode);
++ ll_inode_size_unlock(inode, 0);
++ if (!srvlock)
++ ll_file_punch(inode, new_size, 0);
++ else
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LOCKLESS_TRUNC, 1);
++
++ EXIT;
++ return;
++
++ out_unlock:
++ ll_inode_size_unlock(inode, 0);
++} /* ll_truncate */
++
++int ll_prepare_write(struct file *file, struct page *page, unsigned from,
++ unsigned to)
++{
++ struct inode *inode = page->mapping->host;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ obd_off offset = ((obd_off)page->index) << CFS_PAGE_SHIFT;
++ struct obd_info oinfo = { { { 0 } } };
++ struct brw_page pga;
++ struct obdo oa;
++ struct ost_lvb lvb;
++ int rc = 0;
++ ENTRY;
++
++ LASSERT(PageLocked(page));
++ (void)llap_cast_private(page); /* assertion */
++
++ /* Check to see if we should return -EIO right away */
++ pga.pg = page;
++ pga.off = offset;
++ pga.count = CFS_PAGE_SIZE;
++ pga.flag = 0;
++
++ oa.o_mode = inode->i_mode;
++ oa.o_id = lsm->lsm_object_id;
++ oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
++ obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
++
++ oinfo.oi_oa = &oa;
++ oinfo.oi_md = lsm;
++ rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oinfo, 1, &pga, NULL);
++ if (rc)
++ RETURN(rc);
++
++ if (PageUptodate(page)) {
++ LL_CDEBUG_PAGE(D_PAGE, page, "uptodate\n");
++ RETURN(0);
++ }
++
++ /* We're completely overwriting an existing page, so _don't_ set it up
++ * to date until commit_write */
++ if (from == 0 && to == CFS_PAGE_SIZE) {
++ LL_CDEBUG_PAGE(D_PAGE, page, "full page write\n");
++ POISON_PAGE(page, 0x11);
++ RETURN(0);
++ }
++
++ /* If we are writing to a new page, no need to read old data. The extent
++ * locking will have updated the KMS, and for our purposes here we can
++ * treat it like i_size. */
++ lov_stripe_lock(lsm);
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
++ lov_stripe_unlock(lsm);
++ if (lvb.lvb_size <= offset) {
++ char *kaddr = kmap_atomic(page, KM_USER0);
++ LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n",
++ lvb.lvb_size, offset);
++ memset(kaddr, 0, CFS_PAGE_SIZE);
++ kunmap_atomic(kaddr, KM_USER0);
++ GOTO(prepare_done, rc = 0);
++ }
++
++ /* XXX could be an async ocp read.. read-ahead? */
++ rc = ll_brw(OBD_BRW_READ, inode, &oa, page, 0);
++ if (rc == 0) {
++ /* bug 1598: don't clobber blksize */
++ oa.o_valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLKSZ);
++ obdo_refresh_inode(inode, &oa, oa.o_valid);
++ }
++
++ EXIT;
++ prepare_done:
++ if (rc == 0)
++ SetPageUptodate(page);
++
++ return rc;
++}
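
ll_prepare_write above has three outcomes before it falls back to a
read-modify-write of the old contents. A compact standalone sketch of
the decision (the enum and 4 KiB page size are assumptions, not Lustre
definitions):

    #include <stdio.h>

    #define PAGE_SIZE 4096U

    enum pw_action { PW_FULL_OVERWRITE, PW_ZERO_FILL, PW_READ_OLD };

    /* full-page writes skip the read; pages at or beyond KMS are
     * zero-filled; everything else needs the old data read in */
    static enum pw_action prepare_action(unsigned from, unsigned to,
                                         unsigned long long kms,
                                         unsigned long long offset)
    {
            if (from == 0 && to == PAGE_SIZE)
                    return PW_FULL_OVERWRITE;
            if (kms <= offset)
                    return PW_ZERO_FILL;
            return PW_READ_OLD;
    }

    int main(void)
    {
            /* 100-byte write into the page at offset 8192 of a
             * 10000-byte file: partial and below KMS, so read first */
            printf("%d\n", prepare_action(0, 100, 10000, 8192));
            return 0;
    }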
++
++/**
++ * Make a page ready for an ASYNC write.
++ * \param data pointer to the llap cookie
++ * \param cmd  one of the OBD_BRW_* macros
++ *
++ * \retval 0       the page was successfully prepared for sending
++ * \retval -EAGAIN the page does not need to be sent
++ */
++static int ll_ap_make_ready(void *data, int cmd)
++{
++ struct ll_async_page *llap;
++ struct page *page;
++ ENTRY;
++
++ llap = LLAP_FROM_COOKIE(data);
++ page = llap->llap_page;
++
++ /* we're trying to write, but the page is locked.. come back later */
++ if (TryLockPage(page))
++ RETURN(-EAGAIN);
++
++ LASSERTF(!(cmd & OBD_BRW_READ) || !PageWriteback(page),
++ "cmd %x page %p ino %lu index %lu fl %lx\n", cmd, page,
++ page->mapping->host->i_ino, page->index, page->flags);
++
++ /* if we left PageDirty we might get another writepage call
++ * in the future. list walkers are bright enough
++ * to check page dirty so we can leave it on whatever list
++ * it's on. XXX also, we're called with the cli lock so if
++ * we got the page cache list we'd create a lock inversion
++ * with the removepage path which gets the page lock then the
++ * cli lock */
++ if(!clear_page_dirty_for_io(page)) {
++ unlock_page(page);
++ RETURN(-EAGAIN);
++ }
++
++ /* This actually clears the dirty bit in the radix tree.*/
++ set_page_writeback(page);
++
++ LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n");
++ page_cache_get(page);
++
++ RETURN(0);
++}
++
++/* We have two reasons for giving llite the opportunity to change the
++ * write length of a given queued page as it builds the RPC containing
++ * the page:
++ *
++ * 1) Further extending writes may have landed in the page cache
++ * since a partial write first queued this page requiring us
++ * to write more from the page cache. (No further races are possible, since
++ * by the time this is called, the page is locked.)
++ * 2) We might have raced with truncate and want to avoid performing
++ * write RPCs that are just going to be thrown away by the
++ * truncate's punch on the storage targets.
++ *
++ * The kms serves these purposes as it is set at both truncate and extending
++ * writes.
++ */
++static int ll_ap_refresh_count(void *data, int cmd)
++{
++ struct ll_inode_info *lli;
++ struct ll_async_page *llap;
++ struct lov_stripe_md *lsm;
++ struct page *page;
++ struct inode *inode;
++ struct ost_lvb lvb;
++ __u64 kms;
++ ENTRY;
++
++ /* readpage queues with _COUNT_STABLE, shouldn't get here. */
++ LASSERT(cmd != OBD_BRW_READ);
++
++ llap = LLAP_FROM_COOKIE(data);
++ page = llap->llap_page;
++ inode = page->mapping->host;
++ lli = ll_i2info(inode);
++ lsm = lli->lli_smd;
++
++ lov_stripe_lock(lsm);
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
++ kms = lvb.lvb_size;
++ lov_stripe_unlock(lsm);
++
++ /* catch race with truncate */
++ if (((__u64)page->index << CFS_PAGE_SHIFT) >= kms)
++ return 0;
++
++ /* catch sub-page write at end of file */
++ if (((__u64)page->index << CFS_PAGE_SHIFT) + CFS_PAGE_SIZE > kms)
++ return kms % CFS_PAGE_SIZE;
++
++ return CFS_PAGE_SIZE;
++}
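
The clamp at the end of ll_ap_refresh_count is easy to check in
isolation. A runnable userspace mirror of it (4 KiB pages assumed; the
function name is ours):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  4096UL

    /* clamp a queued page's write count against kms, as above */
    static unsigned refresh_count(unsigned long index, unsigned long long kms)
    {
            unsigned long long start = (unsigned long long)index << PAGE_SHIFT;

            if (start >= kms)
                    return 0;               /* raced with truncate: skip */
            if (start + PAGE_SIZE > kms)
                    return kms % PAGE_SIZE; /* sub-page tail at EOF */
            return PAGE_SIZE;               /* interior page: full page */
    }

    int main(void)
    {
            /* page [8192,12288) against kms 10000 -> send 1808 bytes */
            printf("%u\n", refresh_count(2, 10000));
            return 0;
    }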
++
++void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa)
++{
++ struct lov_stripe_md *lsm;
++ obd_flag valid_flags;
++
++ lsm = ll_i2info(inode)->lli_smd;
++
++ oa->o_id = lsm->lsm_object_id;
++ oa->o_valid = OBD_MD_FLID;
++ valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
++ if (cmd & OBD_BRW_WRITE) {
++ oa->o_valid |= OBD_MD_FLEPOCH;
++ oa->o_easize = ll_i2info(inode)->lli_io_epoch;
++
++ valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
++ OBD_MD_FLUID | OBD_MD_FLGID |
++ OBD_MD_FLFID | OBD_MD_FLGENER;
++ }
++
++ obdo_from_inode(oa, inode, valid_flags);
++}
++
++static void ll_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
++{
++ struct ll_async_page *llap;
++ ENTRY;
++
++ llap = LLAP_FROM_COOKIE(data);
++ ll_inode_fill_obdo(llap->llap_page->mapping->host, cmd, oa);
++
++ EXIT;
++}
++
++static void ll_ap_update_obdo(void *data, int cmd, struct obdo *oa,
++ obd_valid valid)
++{
++ struct ll_async_page *llap;
++ ENTRY;
++
++ llap = LLAP_FROM_COOKIE(data);
++ obdo_from_inode(oa, llap->llap_page->mapping->host, valid);
++
++ EXIT;
++}
++
++static struct obd_async_page_ops ll_async_page_ops = {
++ .ap_make_ready = ll_ap_make_ready,
++ .ap_refresh_count = ll_ap_refresh_count,
++ .ap_fill_obdo = ll_ap_fill_obdo,
++ .ap_update_obdo = ll_ap_update_obdo,
++ .ap_completion = ll_ap_completion,
++};
++
++struct ll_async_page *llap_cast_private(struct page *page)
++{
++ struct ll_async_page *llap = (struct ll_async_page *)page_private(page);
++
++ LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC,
++ "page %p private %lu gave magic %d which != %d\n",
++ page, page_private(page), llap->llap_magic, LLAP_MAGIC);
++
++ return llap;
++}
++
++/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
++ *
++ * There is an llap attached onto every page in lustre, linked off @sbi.
++ * We add a dummy llap to the list so we don't lose our place during list
++ * walking. If llaps in the list are being moved they will only move to
++ * the end of the LRU, and we aren't terribly interested in those pages
++ * here (we start at the beginning of the list where the least-used llaps
++ * are). */
++int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
++{
++ struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a };
++ unsigned long total, want, count = 0;
++
++ total = sbi->ll_async_page_count;
++
++ /* There can be a large number of llaps (600k or more in a large
++ * memory machine) so the VM 1/6 shrink ratio is likely too much.
++ * Since we are freeing pages also, we don't necessarily want to
++ * shrink so much. Limit to 40MB of pages + llaps per call. */
++ if (shrink_fraction == 0)
++ want = sbi->ll_async_page_count - sbi->ll_async_page_max + 32;
++ else
++ want = (total + shrink_fraction - 1) / shrink_fraction;
++
++ if (want > 40 << (20 - CFS_PAGE_SHIFT))
++ want = 40 << (20 - CFS_PAGE_SHIFT);
++
++ CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n",
++ want, total, shrink_fraction);
++
++ spin_lock(&sbi->ll_lock);
++ list_add(&dummy_llap.llap_pglist_item, &sbi->ll_pglist);
++
++ while (total-- > 0 && count < want) {
++ struct page *page;
++ int keep;
++
++ if (unlikely(need_resched())) {
++ spin_unlock(&sbi->ll_lock);
++ cond_resched();
++ spin_lock(&sbi->ll_lock);
++ }
++
++ llap = llite_pglist_next_llap(sbi,&dummy_llap.llap_pglist_item);
++ list_del_init(&dummy_llap.llap_pglist_item);
++ if (llap == NULL)
++ break;
++
++ page = llap->llap_page;
++ LASSERT(page != NULL);
++
++ list_add(&dummy_llap.llap_pglist_item, &llap->llap_pglist_item);
++
++ /* Page needs/undergoing IO */
++ if (TryLockPage(page)) {
++ LL_CDEBUG_PAGE(D_PAGE, page, "can't lock\n");
++ continue;
++ }
++
++ keep = (llap->llap_write_queued || PageDirty(page) ||
++ PageWriteback(page) || (!PageUptodate(page) &&
++ llap->llap_origin != LLAP_ORIGIN_READAHEAD));
++
++ LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n",
++ keep ? "keep" : "drop",
++ llap->llap_write_queued ? "wq " : "",
++ PageDirty(page) ? "pd " : "",
++ PageUptodate(page) ? "" : "!pu ",
++ PageWriteback(page) ? "wb" : "",
++ llap->llap_defer_uptodate ? "" : "!du",
++ llap_origins[llap->llap_origin]);
++
++ /* If page is dirty or undergoing IO don't discard it */
++ if (keep) {
++ unlock_page(page);
++ continue;
++ }
++
++ page_cache_get(page);
++ spin_unlock(&sbi->ll_lock);
++
++ if (page->mapping != NULL) {
++ ll_teardown_mmaps(page->mapping,
++ (__u64)page->index << CFS_PAGE_SHIFT,
++ ((__u64)page->index << CFS_PAGE_SHIFT)|
++ ~CFS_PAGE_MASK);
++ if (!PageDirty(page) && !page_mapped(page)) {
++ ll_ra_accounting(llap, page->mapping);
++ ll_truncate_complete_page(page);
++ ++count;
++ } else {
++ LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page"
++ " because it is "
++ "%s\n",
++ PageDirty(page)?
++ "dirty":"mapped");
++ }
++ }
++ unlock_page(page);
++ page_cache_release(page);
++
++ spin_lock(&sbi->ll_lock);
++ }
++ list_del(&dummy_llap.llap_pglist_item);
++ spin_unlock(&sbi->ll_lock);
++
++ CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n",
++ count, want, total);
++
++ return count;
++}
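
The dummy_llap above is the classic "marker node" trick for walking a
list that may change while the lock is dropped: the sentinel keeps our
position, so concurrent insertion or removal cannot strand the walk. A
self-contained userspace illustration with a Linux-style circular list
(no locking; all names invented):

    #include <stdio.h>

    struct list_head { struct list_head *prev, *next; };

    static void list_init(struct list_head *h) { h->prev = h->next = h; }

    static void list_add(struct list_head *n, struct list_head *head)
    {
            n->next = head->next; n->prev = head;
            head->next->prev = n; head->next = n;
    }

    static void list_del(struct list_head *n)
    {
            n->prev->next = n->next; n->next->prev = n->prev;
            n->prev = n->next = n;
    }

    struct item { struct list_head link; int id; }; /* link must be first */

    int main(void)
    {
            struct list_head lru, marker;
            struct item items[4];
            int i;

            list_init(&lru);
            for (i = 3; i >= 0; i--) {
                    items[i].id = i;
                    list_add(&items[i].link, &lru); /* head insert: 0,1,2,3 */
            }

            list_add(&marker, &lru);                /* start the walk */
            while (marker.next != &lru) {
                    struct item *it = (struct item *)marker.next;

                    list_del(&marker);
                    list_add(&marker, &it->link);   /* re-park after `it` */
                    /* a real walker could drop its lock here and resume */
                    printf("visit %d\n", it->id);
            }
            list_del(&marker);
            return 0;
    }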
++
++static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
++ unsigned origin,
++ struct lustre_handle *lockh)
++{
++ struct ll_async_page *llap;
++ struct obd_export *exp;
++ struct inode *inode = page->mapping->host;
++ struct ll_sb_info *sbi;
++ int rc;
++ ENTRY;
++
++ if (!inode) {
++ static int triggered;
++
++ if (!triggered) {
++ LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon "
++ "page received\n");
++ libcfs_debug_dumpstack(NULL);
++ triggered = 1;
++ }
++ RETURN(ERR_PTR(-EINVAL));
++ }
++ sbi = ll_i2sbi(inode);
++ LASSERT(ll_async_page_slab);
++ LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin);
++
++ llap = llap_cast_private(page);
++ if (llap != NULL) {
++ /* move to end of LRU list, except when page is just about to
++ * die */
++ if (origin != LLAP_ORIGIN_REMOVEPAGE) {
++ spin_lock(&sbi->ll_lock);
++ sbi->ll_pglist_gen++;
++ list_del_init(&llap->llap_pglist_item);
++ list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
++ spin_unlock(&sbi->ll_lock);
++ }
++ GOTO(out, llap);
++ }
++
++ exp = ll_i2obdexp(page->mapping->host);
++ if (exp == NULL)
++ RETURN(ERR_PTR(-EINVAL));
++
++ /* limit the number of lustre-cached pages */
++ if (sbi->ll_async_page_count >= sbi->ll_async_page_max)
++ llap_shrink_cache(sbi, 0);
++
++ OBD_SLAB_ALLOC(llap, ll_async_page_slab, CFS_ALLOC_STD,
++ ll_async_page_slab_size);
++ if (llap == NULL)
++ RETURN(ERR_PTR(-ENOMEM));
++ llap->llap_magic = LLAP_MAGIC;
++ llap->llap_cookie = (void *)llap + size_round(sizeof(*llap));
++
++ /* XXX: for bug 11270 - check for lockless origin here! */
++ if (origin == LLAP_ORIGIN_LOCKLESS_IO)
++ llap->llap_nocache = 1;
++
++ rc = obd_prep_async_page(exp, ll_i2info(inode)->lli_smd, NULL, page,
++ (obd_off)page->index << CFS_PAGE_SHIFT,
++ &ll_async_page_ops, llap, &llap->llap_cookie,
++ llap->llap_nocache, lockh);
++ if (rc) {
++ OBD_SLAB_FREE(llap, ll_async_page_slab,
++ ll_async_page_slab_size);
++ RETURN(ERR_PTR(rc));
++ }
++
++ CDEBUG(D_CACHE, "llap %p page %p cookie %p obj off "LPU64"\n", llap,
++ page, llap->llap_cookie, (obd_off)page->index << CFS_PAGE_SHIFT);
++ /* also zeroing the PRIVBITS low order bitflags */
++ __set_page_ll_data(page, llap);
++ llap->llap_page = page;
++
++ spin_lock(&sbi->ll_lock);
++ sbi->ll_pglist_gen++;
++ sbi->ll_async_page_count++;
++ list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
++ spin_unlock(&sbi->ll_lock);
++
++ out:
++ if (unlikely(sbi->ll_flags & LL_SBI_LLITE_CHECKSUM)) {
++ __u32 csum;
++ char *kaddr = kmap_atomic(page, KM_USER0);
++ csum = init_checksum(OSC_DEFAULT_CKSUM);
++ csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE,
++ OSC_DEFAULT_CKSUM);
++ kunmap_atomic(kaddr, KM_USER0);
++ if (origin == LLAP_ORIGIN_READAHEAD ||
++ origin == LLAP_ORIGIN_READPAGE ||
++ origin == LLAP_ORIGIN_LOCKLESS_IO) {
++ llap->llap_checksum = 0;
++ } else if (origin == LLAP_ORIGIN_COMMIT_WRITE ||
++ llap->llap_checksum == 0) {
++ llap->llap_checksum = csum;
++ CDEBUG(D_PAGE, "page %p cksum %x\n", page, csum);
++ } else if (llap->llap_checksum == csum) {
++ /* origin == LLAP_ORIGIN_WRITEPAGE */
++ CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
++ page, csum);
++ } else {
++ /* origin == LLAP_ORIGIN_WRITEPAGE */
++ LL_CDEBUG_PAGE(D_ERROR, page, "old cksum %x != new "
++ "%x!\n", llap->llap_checksum, csum);
++ }
++ }
++
++ llap->llap_origin = origin;
++ RETURN(llap);
++}
++
++static inline struct ll_async_page *llap_from_page(struct page *page,
++ unsigned origin)
++{
++ return llap_from_page_with_lockh(page, origin, NULL);
++}
++
++static int queue_or_sync_write(struct obd_export *exp, struct inode *inode,
++ struct ll_async_page *llap,
++ unsigned to, obd_flag async_flags)
++{
++ unsigned long size_index = i_size_read(inode) >> CFS_PAGE_SHIFT;
++ struct obd_io_group *oig;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ int rc, noquot = llap->llap_ignore_quota ? OBD_BRW_NOQUOTA : 0;
++ ENTRY;
++
++ /* _make_ready only sees llap once we've unlocked the page */
++ llap->llap_write_queued = 1;
++ rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
++ llap->llap_cookie, OBD_BRW_WRITE | noquot,
++ 0, 0, 0, async_flags);
++ if (rc == 0) {
++ LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
++ llap_write_pending(inode, llap);
++ GOTO(out, 0);
++ }
++
++ llap->llap_write_queued = 0;
++
++ rc = oig_init(&oig);
++ if (rc)
++ GOTO(out, rc);
++
++ /* make full-page requests if we are not at EOF (bug 4410) */
++ if (to != CFS_PAGE_SIZE && llap->llap_page->index < size_index) {
++ LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
++ "sync write before EOF: size_index %lu, to %d\n",
++ size_index, to);
++ to = CFS_PAGE_SIZE;
++ } else if (to != CFS_PAGE_SIZE && llap->llap_page->index == size_index){
++ int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
++ LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
++ "sync write at EOF: size_index %lu, to %d/%d\n",
++ size_index, to, size_to);
++ if (to < size_to)
++ to = size_to;
++ }
++
++ /* compare the checksum once before the page leaves llite */
++ if (unlikely((sbi->ll_flags & LL_SBI_LLITE_CHECKSUM) &&
++ llap->llap_checksum != 0)) {
++ __u32 csum;
++ struct page *page = llap->llap_page;
++ char *kaddr = kmap_atomic(page, KM_USER0);
++ csum = init_checksum(OSC_DEFAULT_CKSUM);
++ csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE,
++ OSC_DEFAULT_CKSUM);
++ kunmap_atomic(kaddr, KM_USER0);
++ if (llap->llap_checksum == csum) {
++ CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
++ page, csum);
++ } else {
++ CERROR("page %p old cksum %x != new cksum %x!\n",
++ page, llap->llap_checksum, csum);
++ }
++ }
++
++ rc = obd_queue_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig,
++ llap->llap_cookie, OBD_BRW_WRITE | noquot,
++ 0, to, 0, ASYNC_READY | ASYNC_URGENT |
++ ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
++ if (rc)
++ GOTO(free_oig, rc);
++
++ rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
++ if (rc)
++ GOTO(free_oig, rc);
++
++ rc = oig_wait(oig);
++
++ if (!rc && async_flags & ASYNC_READY) {
++ unlock_page(llap->llap_page);
++ if (PageWriteback(llap->llap_page))
++ end_page_writeback(llap->llap_page);
++ }
++
++ LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n", rc);
++
++free_oig:
++ oig_release(oig);
++out:
++ RETURN(rc);
++}
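
The "make full-page requests if we are not at EOF" adjustment above
avoids read-modify-write cycles on the OST. A standalone sketch of just
that arithmetic (4 KiB pages assumed; function name is ours):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  4096U

    /* widen a sub-page write to a full page when the page lies wholly
     * below i_size, or out to i_size on the EOF page */
    static unsigned sync_write_count(unsigned long index, unsigned to,
                                     unsigned long long isize)
    {
            unsigned long size_index = isize >> PAGE_SHIFT;
            unsigned size_to = isize & (PAGE_SIZE - 1);

            if (to != PAGE_SIZE && index < size_index)
                    return PAGE_SIZE;
            if (to != PAGE_SIZE && index == size_index && to < size_to)
                    return size_to;
            return to;
    }

    int main(void)
    {
            printf("%u\n", sync_write_count(0, 100, 10000)); /* 4096 */
            printf("%u\n", sync_write_count(2, 100, 10000)); /* 1808 */
            return 0;
    }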
++
++/* update our write count to account for i_size increases that may have
++ * happened since we've queued the page for io. */
++
++/* be careful not to return success without setting the page Uptodate or
++ * the next pass through prepare_write will read in stale data from disk. */
++int ll_commit_write(struct file *file, struct page *page, unsigned from,
++ unsigned to)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
++ struct inode *inode = page->mapping->host;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct lov_stripe_md *lsm = lli->lli_smd;
++ struct obd_export *exp;
++ struct ll_async_page *llap;
++ loff_t size;
++ struct lustre_handle *lockh = NULL;
++ int rc = 0;
++ ENTRY;
++
++ SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
++ LASSERT(inode == file->f_dentry->d_inode);
++ LASSERT(PageLocked(page));
++
++ CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n",
++ inode, page, from, to, page->index);
++
++ if (fd->fd_flags & LL_FILE_GROUP_LOCKED)
++ lockh = &fd->fd_cwlockh;
++
++ llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_COMMIT_WRITE, lockh);
++ if (IS_ERR(llap))
++ RETURN(PTR_ERR(llap));
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ RETURN(-EINVAL);
++
++ llap->llap_ignore_quota = cfs_capable(CFS_CAP_SYS_RESOURCE);
++
++ /* queue a write for some time in the future the first time we
++ * dirty the page */
++ if (!PageDirty(page)) {
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_MISSES, 1);
++
++ rc = queue_or_sync_write(exp, inode, llap, to, 0);
++ if (rc)
++ GOTO(out, rc);
++ } else {
++ ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_HITS, 1);
++ }
++
++ /* put the page in the page cache, from now on ll_removepage is
++ * responsible for cleaning up the llap.
++ * only set the page dirty when it's queued to be written out */
++ if (llap->llap_write_queued)
++ set_page_dirty(page);
++
++out:
++ size = (((obd_off)page->index) << CFS_PAGE_SHIFT) + to;
++ ll_inode_size_lock(inode, 0);
++ if (rc == 0) {
++ lov_stripe_lock(lsm);
++ obd_adjust_kms(exp, lsm, size, 0);
++ lov_stripe_unlock(lsm);
++ if (size > i_size_read(inode))
++ i_size_write(inode, size);
++ SetPageUptodate(page);
++ } else if (size > i_size_read(inode)) {
++ /* this page is beyond the pale of i_size, so it can't be
++ * truncated in ll_p_r_e during lock revoking. we must
++ * tear down our book-keeping here. */
++ ll_removepage(page);
++ }
++ ll_inode_size_unlock(inode, 0);
++ RETURN(rc);
++}
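
A minimal sketch of the size bookkeeping at the end of ll_commit_write:
on success the KMS is raised to the end of the bytes just written, while
i_size only moves if the file actually grew (values invented, 4 KiB
pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long index = 2;          /* page just written */
            unsigned to = 1900;               /* valid bytes in that page */
            unsigned long long isize = 10000; /* current i_size */
            unsigned long long size =
                    ((unsigned long long)index << PAGE_SHIFT) + to;

            if (size > isize)                 /* write extended the file */
                    isize = size;
            printf("kms=%llu i_size=%llu\n", size, isize);
            return 0;
    }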
++
++static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
++{
++ struct ll_ra_info *ra = &sbi->ll_ra_info;
++ unsigned long ret;
++ ENTRY;
++
++ spin_lock(&sbi->ll_lock);
++ ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
++ ra->ra_cur_pages += ret;
++ spin_unlock(&sbi->ll_lock);
++
++ RETURN(ret);
++}
++
++static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
++{
++ struct ll_ra_info *ra = &sbi->ll_ra_info;
++ spin_lock(&sbi->ll_lock);
++ LASSERTF(ra->ra_cur_pages >= len, "r_c_p %lu len %lu\n",
++ ra->ra_cur_pages, len);
++ ra->ra_cur_pages -= len;
++ spin_unlock(&sbi->ll_lock);
++}
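
These two helpers implement a simple global read-ahead budget. A
userspace sketch of the get side (the explicit underflow guard is an
extra safety in the sketch; the code above relies on ll_lock keeping
the counters consistent):

    #include <stdio.h>

    /* grab as much of `len` as the budget allows, never letting
     * ra_cur_pages exceed ra_max_pages */
    static unsigned long ra_count_get(unsigned long max, unsigned long *cur,
                                      unsigned long len)
    {
            unsigned long avail = max > *cur ? max - *cur : 0;
            unsigned long ret = avail < len ? avail : len;

            *cur += ret;
            return ret;
    }

    int main(void)
    {
            unsigned long cur = 1000;

            printf("%lu\n", ra_count_get(1024, &cur, 100)); /* only 24 left */
            printf("cur=%lu\n", cur);                       /* at the cap */
            return 0;
    }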
++
++/* called for each page in a completed rpc.*/
++int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
++{
++ struct ll_async_page *llap;
++ struct page *page;
++ int ret = 0;
++ ENTRY;
++
++ llap = LLAP_FROM_COOKIE(data);
++ page = llap->llap_page;
++ LASSERT(PageLocked(page));
++ LASSERT(CheckWriteback(page,cmd));
++
++ LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc);
++
++ if (cmd & OBD_BRW_READ && llap->llap_defer_uptodate)
++ ll_ra_count_put(ll_i2sbi(page->mapping->host), 1);
++
++ if (rc == 0) {
++ if (cmd & OBD_BRW_READ) {
++ if (!llap->llap_defer_uptodate)
++ SetPageUptodate(page);
++ } else {
++ llap->llap_write_queued = 0;
++ }
++ ClearPageError(page);
++ } else {
++ if (cmd & OBD_BRW_READ) {
++ llap->llap_defer_uptodate = 0;
++ }
++ SetPageError(page);
++ if (rc == -ENOSPC)
++ set_bit(AS_ENOSPC, &page->mapping->flags);
++ else
++ set_bit(AS_EIO, &page->mapping->flags);
++ }
++
++ /* be careful about clearing WB:
++ * if WB is cleared after the page lock is released, parallel IO can
++ * be started before ap_make_ready has finished, so we would end up
++ * with a page that has PG_Writeback set from ->writepage() and a
++ * completed READ that clears this flag */
++ if ((cmd & OBD_BRW_WRITE) && PageWriteback(page))
++ end_page_writeback(page);
++
++ unlock_page(page);
++
++ if (cmd & OBD_BRW_WRITE) {
++ llap_write_complete(page->mapping->host, llap);
++ ll_try_done_writing(page->mapping->host);
++ }
++
++ page_cache_release(page);
++
++ RETURN(ret);
++}
++
++static void __ll_put_llap(struct page *page)
++{
++ struct inode *inode = page->mapping->host;
++ struct obd_export *exp;
++ struct ll_async_page *llap;
++ struct ll_sb_info *sbi = ll_i2sbi(inode);
++ int rc;
++ ENTRY;
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL) {
++ CERROR("page %p ind %lu gave null export\n", page, page->index);
++ EXIT;
++ return;
++ }
++
++ llap = llap_from_page(page, LLAP_ORIGIN_REMOVEPAGE);
++ if (IS_ERR(llap)) {
++ CERROR("page %p ind %lu couldn't find llap: %ld\n", page,
++ page->index, PTR_ERR(llap));
++ EXIT;
++ return;
++ }
++
++ //llap_write_complete(inode, llap);
++ rc = obd_teardown_async_page(exp, ll_i2info(inode)->lli_smd, NULL,
++ llap->llap_cookie);
++ if (rc != 0)
++ CERROR("page %p ind %lu failed: %d\n", page, page->index, rc);
++
++ /* this unconditional free is only safe because the page lock
++ * is providing exclusivity to memory pressure/truncate/writeback..*/
++ __clear_page_ll_data(page);
++
++ spin_lock(&sbi->ll_lock);
++ if (!list_empty(&llap->llap_pglist_item))
++ list_del_init(&llap->llap_pglist_item);
++ sbi->ll_pglist_gen++;
++ sbi->ll_async_page_count--;
++ spin_unlock(&sbi->ll_lock);
++ OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size);
++
++ EXIT;
++}
++
++/* the kernel calls us here when a page is unhashed from the page cache.
++ * the page will be locked and the kernel is holding a spinlock, so
++ * we need to be careful. we're just tearing down our book-keeping
++ * here. */
++void ll_removepage(struct page *page)
++{
++ struct ll_async_page *llap = llap_cast_private(page);
++ ENTRY;
++
++ LASSERT(!in_interrupt());
++
++ /* sync pages or failed read pages can leave pages in the page
++ * cache that don't have our data associated with them anymore */
++ if (page_private(page) == 0) {
++ EXIT;
++ return;
++ }
++
++ LASSERT(!llap->llap_lockless_io_page);
++ LASSERT(!llap->llap_nocache);
++
++ LL_CDEBUG_PAGE(D_PAGE, page, "being evicted\n");
++ __ll_put_llap(page);
++
++ EXIT;
++}
++
++static int ll_issue_page_read(struct obd_export *exp,
++ struct ll_async_page *llap,
++ struct obd_io_group *oig, int defer)
++{
++ struct page *page = llap->llap_page;
++ int rc;
++
++ page_cache_get(page);
++ llap->llap_defer_uptodate = defer;
++ llap->llap_ra_used = 0;
++ rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
++ NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
++ CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY |
++ ASYNC_URGENT);
++ if (rc) {
++ LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
++ page_cache_release(page);
++ }
++ RETURN(rc);
++}
++
++static void ll_ra_stats_inc_unlocked(struct ll_ra_info *ra, enum ra_stat which)
++{
++ LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
++ ra->ra_stats[which]++;
++}
++
++static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
++{
++ struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
++ struct ll_ra_info *ra = &ll_i2sbi(mapping->host)->ll_ra_info;
++
++ spin_lock(&sbi->ll_lock);
++ ll_ra_stats_inc_unlocked(ra, which);
++ spin_unlock(&sbi->ll_lock);
++}
++
++void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
++{
++ if (!llap->llap_defer_uptodate || llap->llap_ra_used)
++ return;
++
++ ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
++}
++
++#define RAS_CDEBUG(ras) \
++ CDEBUG(D_READA, \
++ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
++ "csr %lu sf %lu sp %lu sl %lu \n", \
++ ras->ras_last_readpage, ras->ras_consecutive_requests, \
++ ras->ras_consecutive_pages, ras->ras_window_start, \
++ ras->ras_window_len, ras->ras_next_readahead, \
++ ras->ras_requests, ras->ras_request_index, \
++ ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
++ ras->ras_stride_pages, ras->ras_stride_length)
++
++static int index_in_window(unsigned long index, unsigned long point,
++ unsigned long before, unsigned long after)
++{
++ unsigned long start = point - before, end = point + after;
++
++ if (start > point)
++ start = 0;
++ if (end < point)
++ end = ~0;
++
++ return start <= index && index <= end;
++}
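
The clamping in index_in_window guards against unsigned wrap-around at
both ends of the window. A quick runnable check (same logic as above,
wrapped in a main of our own):

    #include <stdio.h>

    static int index_in_window(unsigned long index, unsigned long point,
                               unsigned long before, unsigned long after)
    {
            unsigned long start = point - before, end = point + after;

            if (start > point)      /* point - before wrapped below zero */
                    start = 0;
            if (end < point)        /* point + after wrapped past max */
                    end = ~0UL;
            return start <= index && index <= end;
    }

    int main(void)
    {
            /* point 5 with before 8 would wrap; start clamps to 0 */
            printf("%d\n", index_in_window(2, 5, 8, 8));  /* 1 */
            printf("%d\n", index_in_window(20, 5, 8, 8)); /* 0 */
            return 0;
    }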
++
++static struct ll_readahead_state *ll_ras_get(struct file *f)
++{
++ struct ll_file_data *fd;
++
++ fd = LUSTRE_FPRIVATE(f);
++ return &fd->fd_ras;
++}
++
++void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
++{
++ struct ll_readahead_state *ras;
++
++ ras = ll_ras_get(f);
++
++ spin_lock(&ras->ras_lock);
++ ras->ras_requests++;
++ ras->ras_request_index = 0;
++ ras->ras_consecutive_requests++;
++ rar->lrr_reader = current;
++
++ list_add(&rar->lrr_linkage, &ras->ras_read_beads);
++ spin_unlock(&ras->ras_lock);
++}
++
++void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
++{
++ struct ll_readahead_state *ras;
++
++ ras = ll_ras_get(f);
++
++ spin_lock(&ras->ras_lock);
++ list_del_init(&rar->lrr_linkage);
++ spin_unlock(&ras->ras_lock);
++}
++
++static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
++{
++ struct ll_ra_read *scan;
++
++ list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
++ if (scan->lrr_reader == current)
++ return scan;
++ }
++ return NULL;
++}
++
++struct ll_ra_read *ll_ra_read_get(struct file *f)
++{
++ struct ll_readahead_state *ras;
++ struct ll_ra_read *bead;
++
++ ras = ll_ras_get(f);
++
++ spin_lock(&ras->ras_lock);
++ bead = ll_ra_read_get_locked(ras);
++ spin_unlock(&ras->ras_lock);
++ return bead;
++}
++
++static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig,
++ int index, struct address_space *mapping)
++{
++ struct ll_async_page *llap;
++ struct page *page;
++ unsigned int gfp_mask = 0;
++ int rc = 0;
++
++ gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT;
++#ifdef __GFP_NOWARN
++ gfp_mask |= __GFP_NOWARN;
++#endif
++ page = grab_cache_page_nowait_gfp(mapping, index, gfp_mask);
++ if (page == NULL) {
++ ll_ra_stats_inc(mapping, RA_STAT_FAILED_GRAB_PAGE);
++ CDEBUG(D_READA, "g_c_p_n failed\n");
++ return 0;
++ }
++
++ /* Check if page was truncated or reclaimed */
++ if (page->mapping != mapping) {
++ ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
++ CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
++ GOTO(unlock_page, rc = 0);
++ }
++
++ /* we do this first so that we can see the page in the /proc
++ * accounting */
++ llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
++ if (IS_ERR(llap) || llap->llap_defer_uptodate) {
++ if (PTR_ERR(llap) == -ENOLCK) {
++ ll_ra_stats_inc(mapping, RA_STAT_FAILED_MATCH);
++ CDEBUG(D_READA | D_PAGE,
++ "Adding page to cache failed index "
++ "%d\n", index);
++ CDEBUG(D_READA, "nolock page\n");
++ GOTO(unlock_page, rc = -ENOLCK);
++ }
++ CDEBUG(D_READA, "read-ahead page\n");
++ GOTO(unlock_page, rc = 0);
++ }
++
++ /* skip completed pages */
++ if (Page_Uptodate(page))
++ GOTO(unlock_page, rc = 0);
++
++ /* bail out when we hit the end of the lock. */
++ rc = ll_issue_page_read(exp, llap, oig, 1);
++ if (rc == 0) {
++ LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n");
++ rc = 1;
++ } else {
++unlock_page:
++ unlock_page(page);
++ LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n");
++ }
++ page_cache_release(page);
++ return rc;
++}
++
++/* ra_io_arg is filled in at the beginning of ll_readahead, under
++ * ras_lock; the following ll_read_ahead_pages then reads RA
++ * pages according to this arg. All the items in this structure are
++ * counted in units of page index.
++ */
++struct ra_io_arg {
++ unsigned long ria_start; /* start offset of read-ahead*/
++ unsigned long ria_end; /* end offset of read-ahead*/
++ /* If a stride read pattern is detected, ria_stoff is where the
++ * stride read starts. Note: for normal read-ahead, the
++ * value here is meaningless and is never accessed */
++ pgoff_t ria_stoff;
++ /* ria_length and ria_pages are the stride length and the number of
++ * data pages per stride in stride I/O mode. They are also used to
++ * check whether the read-ahead is stride I/O */
++ unsigned long ria_length;
++ unsigned long ria_pages;
++};
++
++#define RIA_DEBUG(ria) \
++ CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
++ ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
++ ria->ria_pages)
++
++#define RAS_INCREASE_STEP (1024 * 1024 >> CFS_PAGE_SHIFT)
++
++static inline int stride_io_mode(struct ll_readahead_state *ras)
++{
++ return ras->ras_consecutive_stride_requests > 1;
++}
++
++/* The function calculates how many pages will be read in
++ * [off, off + length] by stride I/O mode, with
++ * stride_offset = st_off, stride_length = st_len,
++ * stride_pages = st_pgs
++ */
++static unsigned long
++stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
++ unsigned long off, unsigned length)
++{
++ unsigned long cont_len = st_off > off ? st_off - off : 0;
++ __u64 stride_len = length + off > st_off ?
++ length + off + 1 - st_off : 0;
++ unsigned long left, pg_count;
++
++ if (st_len == 0 || length == 0)
++ return length;
++
++ left = do_div(stride_len, st_len);
++ left = min(left, st_pgs);
++
++ pg_count = left + stride_len * st_pgs + cont_len;
++
++ LASSERT(pg_count >= left);
++
++ CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %u "
++ "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count);
++
++ return pg_count;
++}
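
A worked example of this arithmetic, runnable in userspace (do_div() is
replaced by plain division; the stride values are invented):

    #include <stdio.h>

    static unsigned long
    stride_pg_count(unsigned long st_off, unsigned long st_len,
                    unsigned long st_pgs, unsigned long off, unsigned length)
    {
            unsigned long cont_len = st_off > off ? st_off - off : 0;
            unsigned long long stride_len =
                    (unsigned long long)length + off > st_off ?
                    (unsigned long long)length + off + 1 - st_off : 0;
            unsigned long left;

            if (st_len == 0 || length == 0)
                    return length;

            left = stride_len % st_len; /* stand-in for do_div() */
            stride_len /= st_len;
            if (left > st_pgs)
                    left = st_pgs;

            return left + stride_len * st_pgs + cont_len;
    }

    int main(void)
    {
            /* stride of 64 pages, 16 of them data, starting at page 0 */
            printf("%lu\n", stride_pg_count(0, 64, 16, 0, 127)); /* 32 */
            printf("%lu\n", stride_pg_count(0, 64, 16, 0, 100)); /* 32 */
            return 0;
    }

Both calls cover two stripes' worth of data pages, hence 32 in each case.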
++
++static int ria_page_count(struct ra_io_arg *ria)
++{
++ __u64 length = ria->ria_end >= ria->ria_start ?
++ ria->ria_end - ria->ria_start + 1 : 0;
++
++ return stride_pg_count(ria->ria_stoff, ria->ria_length,
++ ria->ria_pages, ria->ria_start,
++ length);
++}
++
++/* Check whether the index is in the defined read-ahead window */
++static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
++{
++ /* If ria_length == ria_pages, it means non-stride I/O mode;
++ * idx should always be inside the read-ahead window in this case.
++ * For stride I/O mode, just check whether the idx is inside
++ * the ria_pages. */
++ return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
++ (idx - ria->ria_stoff) % ria->ria_length < ria->ria_pages;
++}
++
++static int ll_read_ahead_pages(struct obd_export *exp,
++ struct obd_io_group *oig,
++ struct ra_io_arg *ria,
++ unsigned long *reserved_pages,
++ struct address_space *mapping,
++ unsigned long *ra_end)
++{
++ int rc, count = 0, stride_ria;
++ unsigned long page_idx;
++
++ LASSERT(ria != NULL);
++ RIA_DEBUG(ria);
++
++ stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
++ for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
++ *reserved_pages > 0; page_idx++) {
++ if (ras_inside_ra_window(page_idx, ria)) {
++ /* If the page is inside the read-ahead window */
++ rc = ll_read_ahead_page(exp, oig, page_idx, mapping);
++ if (rc == 1) {
++ (*reserved_pages)--;
++ count++;
++ } else if (rc == -ENOLCK)
++ break;
++ } else if (stride_ria) {
++ /* If it is not in the read-ahead window but we are in
++ * stride read-ahead mode, check whether it should skip
++ * the stride gap */
++ pgoff_t offset;
++ /* FIXME: This assertion is only valid for forward
++ * read-ahead; it will be fixed when backward
++ * read-ahead is implemented */
++ LASSERTF(page_idx > ria->ria_stoff, "since %lu is in the"
++ " gap of the ra window, it should be bigger than the"
++ " stride offset %lu\n", page_idx, ria->ria_stoff);
++
++ offset = page_idx - ria->ria_stoff;
++ offset = offset % (ria->ria_length);
++ if (offset > ria->ria_pages) {
++ page_idx += ria->ria_length - offset;
++ CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
++ ria->ria_length - offset);
++ continue;
++ }
++ }
++ }
++ *ra_end = page_idx;
++ return count;
++}
++
++static int ll_readahead(struct ll_readahead_state *ras,
++ struct obd_export *exp, struct address_space *mapping,
++ struct obd_io_group *oig, int flags)
++{
++ unsigned long start = 0, end = 0, reserved;
++ unsigned long ra_end, len;
++ struct inode *inode;
++ struct lov_stripe_md *lsm;
++ struct ll_ra_read *bead;
++ struct ost_lvb lvb;
++ struct ra_io_arg ria = { 0 };
++ int ret = 0;
++ __u64 kms;
++ ENTRY;
++
++ inode = mapping->host;
++ lsm = ll_i2info(inode)->lli_smd;
++
++ lov_stripe_lock(lsm);
++ inode_init_lvb(inode, &lvb);
++ obd_merge_lvb(ll_i2obdexp(inode), lsm, &lvb, 1);
++ kms = lvb.lvb_size;
++ lov_stripe_unlock(lsm);
++ if (kms == 0) {
++ ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
++ RETURN(0);
++ }
++
++ spin_lock(&ras->ras_lock);
++ bead = ll_ra_read_get_locked(ras);
++ /* Enlarge the RA window to encompass the full read */
++ if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
++ bead->lrr_start + bead->lrr_count) {
++ ras->ras_window_len = bead->lrr_start + bead->lrr_count -
++ ras->ras_window_start;
++ }
++ /* Reserve a part of the read-ahead window that we'll be issuing */
++ if (ras->ras_window_len) {
++ start = ras->ras_next_readahead;
++ end = ras->ras_window_start + ras->ras_window_len - 1;
++ }
++ if (end != 0) {
++ /* Truncate RA window to end of file */
++ end = min(end, (unsigned long)((kms - 1) >> CFS_PAGE_SHIFT));
++ ras->ras_next_readahead = max(end, end + 1);
++ RAS_CDEBUG(ras);
++ }
++ ria.ria_start = start;
++ ria.ria_end = end;
++ /* If stride I/O mode is detected, get stride window*/
++ if (stride_io_mode(ras)) {
++ ria.ria_stoff = ras->ras_stride_offset;
++ ria.ria_length = ras->ras_stride_length;
++ ria.ria_pages = ras->ras_stride_pages;
++ }
++ spin_unlock(&ras->ras_lock);
++
++ if (end == 0) {
++ ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
++ RETURN(0);
++ }
++
++ len = ria_page_count(&ria);
++ if (len == 0)
++ RETURN(0);
++
++ reserved = ll_ra_count_get(ll_i2sbi(inode), len);
++ if (reserved < len)
++ ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
++
++ CDEBUG(D_READA, "reserved page %lu \n", reserved);
++
++ ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end);
++
++ LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
++ if (reserved != 0)
++ ll_ra_count_put(ll_i2sbi(inode), reserved);
++
++ if (ra_end == end + 1 && ra_end == (kms >> CFS_PAGE_SHIFT))
++ ll_ra_stats_inc(mapping, RA_STAT_EOF);
++
++ /* if we didn't get to the end of the region we reserved from
++ * the ras, we need to go back and update the ras so that the
++ * next read-ahead tries from where we left off. we only do so
++ * if the region we failed to issue read-ahead on is still ahead
++ * of the app and behind the next index to start read-ahead from */
++ CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
++ ra_end, end, ria.ria_end);
++
++ if (ra_end != (end + 1)) {
++ spin_lock(&ras->ras_lock);
++ if (ra_end < ras->ras_next_readahead &&
++ index_in_window(ra_end, ras->ras_window_start, 0,
++ ras->ras_window_len)) {
++ ras->ras_next_readahead = ra_end;
++ RAS_CDEBUG(ras);
++ }
++ spin_unlock(&ras->ras_lock);
++ }
++
++ RETURN(ret);
++}
++
++static void ras_set_start(struct ll_readahead_state *ras, unsigned long index)
++{
++ ras->ras_window_start = index & (~(RAS_INCREASE_STEP - 1));
++}
++
++/* called with the ras_lock held or from places where it doesn't matter */
++static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
++{
++ ras->ras_last_readpage = index;
++ ras->ras_consecutive_requests = 0;
++ ras->ras_consecutive_pages = 0;
++ ras->ras_window_len = 0;
++ ras_set_start(ras, index);
++ ras->ras_next_readahead = max(ras->ras_window_start, index);
++
++ RAS_CDEBUG(ras);
++}
++
++/* called with the ras_lock held or from places where it doesn't matter */
++static void ras_stride_reset(struct ll_readahead_state *ras)
++{
++ ras->ras_consecutive_stride_requests = 0;
++ ras->ras_stride_length = 0;
++ ras->ras_stride_pages = 0;
++ RAS_CDEBUG(ras);
++}
++
++void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
++{
++ spin_lock_init(&ras->ras_lock);
++ ras_reset(ras, 0);
++ ras->ras_requests = 0;
++ INIT_LIST_HEAD(&ras->ras_read_beads);
++}
++
++/*
++ * Check whether the read request is in the stride window.
++ * If it is in the stride window, return 1, otherwise return 0.
++ */
++static int index_in_stride_window(unsigned long index,
++ struct ll_readahead_state *ras,
++ struct inode *inode)
++{
++ unsigned long stride_gap = index - ras->ras_last_readpage - 1;
++
++ if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
++ return 0;
++
++ /* If it is a contiguous read */
++ if (stride_gap == 0)
++ return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
++
++ /* Otherwise check the stride itself */
++ return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
++ ras->ras_consecutive_pages == ras->ras_stride_pages;
++}
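
The detection rule above accepts a read as part of the stride pattern
either when it continues the current data chunk or when it jumps
exactly the stride gap after a full chunk. A runnable check of that rule
(simplified to plain parameters; names are ours):

    #include <stdio.h>

    static int in_stride(unsigned long index, unsigned long last_readpage,
                         unsigned long consec_pages,
                         unsigned long stride_len, unsigned long stride_pages)
    {
            unsigned long gap = index - last_readpage - 1;

            if (stride_len == 0 || stride_pages == 0)
                    return 0;
            if (gap == 0)   /* contiguous with the previous read */
                    return consec_pages + 1 <= stride_pages;
            return (stride_len - stride_pages) == gap &&
                   consec_pages == stride_pages;
    }

    int main(void)
    {
            /* pattern: 16 pages read, then a jump; stride 64/16 */
            printf("%d\n", in_stride(64, 15, 16, 64, 16)); /* 1: gap 48 fits */
            printf("%d\n", in_stride(70, 15, 16, 64, 16)); /* 0: wrong gap */
            return 0;
    }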
++
++static void ras_update_stride_detector(struct ll_readahead_state *ras,
++ unsigned long index)
++{
++ unsigned long stride_gap = index - ras->ras_last_readpage - 1;
++
++ if (!stride_io_mode(ras) && (stride_gap != 0 ||
++ ras->ras_consecutive_stride_requests == 0)) {
++ ras->ras_stride_pages = ras->ras_consecutive_pages;
++ ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
++ }
++ RAS_CDEBUG(ras);
++}
++
++static unsigned long
++stride_page_count(struct ll_readahead_state *ras, unsigned long len)
++{
++ return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
++ ras->ras_stride_pages, ras->ras_stride_offset,
++ len);
++}
++
++/* Stride Read-ahead window will be increased inc_len according to
++ * stride I/O pattern */
++static void ras_stride_increase_window(struct ll_readahead_state *ras,
++ struct ll_ra_info *ra,
++ unsigned long inc_len)
++{
++ unsigned long left, step, window_len;
++ unsigned long stride_len;
++
++ LASSERT(ras->ras_stride_length > 0);
++
++ stride_len = ras->ras_window_start + ras->ras_window_len -
++ ras->ras_stride_offset;
++
++ LASSERTF(stride_len >= 0, "window_start %lu, window_len %lu"
++ " stride_offset %lu\n", ras->ras_window_start,
++ ras->ras_window_len, ras->ras_stride_offset);
++
++ left = stride_len % ras->ras_stride_length;
++
++ window_len = ras->ras_window_len - left;
++
++ if (left < ras->ras_stride_pages)
++ left += inc_len;
++ else
++ left = ras->ras_stride_pages + inc_len;
++
++ LASSERT(ras->ras_stride_pages != 0);
++
++ step = left / ras->ras_stride_pages;
++ left %= ras->ras_stride_pages;
++
++ window_len += step * ras->ras_stride_length + left;
++
++ if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
++ ras->ras_window_len = window_len;
++
++ RAS_CDEBUG(ras);
++}
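
A standalone version of the window-growth arithmetic above may help:
the data portion of the window is extended by inc_len pages, then
converted back into a window length spanning whole strides plus the
leftover data pages. (The real code then caps the result against
ra_max_pages via stride_page_count; the values below are invented.)

    #include <stdio.h>

    static unsigned long
    stride_grow(unsigned long win_start, unsigned long win_len,
                unsigned long st_off, unsigned long st_len,
                unsigned long st_pgs, unsigned long inc_len)
    {
            unsigned long stride_len = win_start + win_len - st_off;
            unsigned long left = stride_len % st_len;
            unsigned long window_len = win_len - left;
            unsigned long step;

            if (left < st_pgs)
                    left += inc_len;
            else
                    left = st_pgs + inc_len;

            step = left / st_pgs;
            left %= st_pgs;

            return window_len + step * st_len + left;
    }

    int main(void)
    {
            /* stride 64 pages, 16 of them data; window currently one chunk;
             * growing by 256 data pages spans 17 whole strides -> 1088 */
            printf("%lu\n", stride_grow(0, 16, 0, 64, 16, 256));
            return 0;
    }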
++
++/* Set stride I/O read-ahead window start offset */
++static void ras_set_stride_offset(struct ll_readahead_state *ras)
++{
++ unsigned long window_len = ras->ras_next_readahead -
++ ras->ras_window_start;
++ unsigned long left;
++
++ LASSERT(ras->ras_stride_length != 0);
++
++ left = window_len % ras->ras_stride_length;
++
++ ras->ras_stride_offset = ras->ras_next_readahead - left;
++
++ RAS_CDEBUG(ras);
++}
++
++static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
++ struct ll_readahead_state *ras, unsigned long index,
++ unsigned hit)
++{
++ struct ll_ra_info *ra = &sbi->ll_ra_info;
++ int zero = 0, stride_detect = 0, ra_miss = 0;
++ ENTRY;
++
++ spin_lock(&sbi->ll_lock);
++ spin_lock(&ras->ras_lock);
++
++ ll_ra_stats_inc_unlocked(ra, hit ? RA_STAT_HIT : RA_STAT_MISS);
++
++ /* reset the read-ahead window in two cases. First when the app seeks
++ * or reads to some other part of the file. Secondly if we get a
++ * read-ahead miss that we think we've previously issued. This can
++ * be a symptom of there being so many read-ahead pages that the VM is
++ * reclaiming it before we get to it. */
++ if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
++ zero = 1;
++ ll_ra_stats_inc_unlocked(ra, RA_STAT_DISTANT_READPAGE);
++ } else if (!hit && ras->ras_window_len &&
++ index < ras->ras_next_readahead &&
++ index_in_window(index, ras->ras_window_start, 0,
++ ras->ras_window_len)) {
++ ra_miss = 1;
++ ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
++ }
++
++ /* On the second access to a file smaller than the tunable
++ * ra_max_read_ahead_whole_pages, trigger RA on all pages in the
++ * file up to ra_max_pages. This is simply a best effort and
++ * only occurs once per open file. Normal RA behavior is restored
++ * for subsequent IO. The mmap case does not increment
++ * ras_requests and thus can never trigger this behavior. */
++ if (ras->ras_requests == 2 && !ras->ras_request_index) {
++ __u64 kms_pages;
++
++ kms_pages = (i_size_read(inode) + CFS_PAGE_SIZE - 1) >>
++ CFS_PAGE_SHIFT;
++
++ CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
++ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
++
++ if (kms_pages &&
++ kms_pages <= ra->ra_max_read_ahead_whole_pages) {
++ ras->ras_window_start = 0;
++ ras->ras_last_readpage = 0;
++ ras->ras_next_readahead = 0;
++ ras->ras_window_len = min(ra->ra_max_pages,
++ ra->ra_max_read_ahead_whole_pages);
++ GOTO(out_unlock, 0);
++ }
++ }
++ if (zero) {
++ /* check whether it is in stride I/O mode*/
++ if (!index_in_stride_window(index, ras, inode)) {
++ ras_reset(ras, index);
++ ras->ras_consecutive_pages++;
++ ras_stride_reset(ras);
++ GOTO(out_unlock, 0);
++ } else {
++ ras->ras_consecutive_requests = 0;
++ if (++ras->ras_consecutive_stride_requests > 1)
++ stride_detect = 1;
++ RAS_CDEBUG(ras);
++ }
++ } else {
++ if (ra_miss) {
++ if (index_in_stride_window(index, ras, inode) &&
++ stride_io_mode(ras)) {
++ /* If stride-RA hit a cache miss, the stride detector
++ * is not reset, to avoid the overhead of
++ * re-detecting the read-ahead mode */
++ if (index != ras->ras_last_readpage + 1)
++ ras->ras_consecutive_pages = 0;
++ RAS_CDEBUG(ras);
++ } else {
++ /* Reset both the stride window and the normal RA window */
++ ras_reset(ras, index);
++ ras->ras_consecutive_pages++;
++ ras_stride_reset(ras);
++ GOTO(out_unlock, 0);
++ }
++ } else if (stride_io_mode(ras)) {
++ /* If this is a contiguous read but we are currently in
++ * stride I/O mode, check whether the stride step is still
++ * valid; if not, reset the stride ra window */
++ if (!index_in_stride_window(index, ras, inode)) {
++ /* Shrink the stride read-ahead window to zero */
++ ras_stride_reset(ras);
++ ras->ras_window_len = 0;
++ ras->ras_next_readahead = index;
++ }
++ }
++ }
++ ras->ras_consecutive_pages++;
++ ras_update_stride_detector(ras, index);
++ ras->ras_last_readpage = index;
++ ras_set_start(ras, index);
++ ras->ras_next_readahead = max(ras->ras_window_start,
++ ras->ras_next_readahead);
++ RAS_CDEBUG(ras);
++
++ /* Trigger RA in the mmap case where ras_consecutive_requests
++ * is not incremented and thus can't be used to trigger RA */
++ if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
++ ras->ras_window_len = RAS_INCREASE_STEP;
++ GOTO(out_unlock, 0);
++ }
++
++ /* Initially reset the stride window offset to next_readahead */
++ if (ras->ras_consecutive_stride_requests == 2 && stride_detect)
++ ras_set_stride_offset(ras);
++
++ /* The initial ras_window_len is set to the request size. To avoid
++ * uselessly reading and discarding pages for random IO the window is
++ * only increased once per consecutive request received. */
++ if ((ras->ras_consecutive_requests > 1 &&
++ !ras->ras_request_index) || stride_detect) {
++ if (stride_io_mode(ras))
++ ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP);
++ else
++ ras->ras_window_len = min(ras->ras_window_len +
++ RAS_INCREASE_STEP,
++ ra->ra_max_pages);
++ }
++ EXIT;
++out_unlock:
++ RAS_CDEBUG(ras);
++ ras->ras_request_index++;
++ spin_unlock(&ras->ras_lock);
++ spin_unlock(&sbi->ll_lock);
++ return;
++}
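
The small-file heuristic near the top of ras_update is worth a worked
example: on the second read request against a file whose size in pages
fits under the whole-file threshold, the window opens over the entire
file at once. A sketch with invented tunables (4 KiB pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  4096ULL

    static unsigned long whole_file_window(unsigned long requests,
                                           unsigned long request_index,
                                           unsigned long long isize,
                                           unsigned long max_pages,
                                           unsigned long whole_pages)
    {
            unsigned long long kms_pages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;

            if (requests == 2 && request_index == 0 &&
                kms_pages && kms_pages <= whole_pages)
                    return max_pages < whole_pages ? max_pages : whole_pages;
            return 0;   /* fall through to normal windowing */
    }

    int main(void)
    {
            /* 1 MiB file = 256 pages, under a 512-page threshold */
            printf("%lu\n", whole_file_window(2, 0, 1 << 20, 1024, 512));
            return 0;
    }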
++
++int ll_writepage(struct page *page)
++{
++ struct inode *inode = page->mapping->host;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct obd_export *exp;
++ struct ll_async_page *llap;
++ int rc = 0;
++ ENTRY;
++
++ LASSERT(PageLocked(page));
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ GOTO(out, rc = -EINVAL);
++
++ llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
++ if (IS_ERR(llap))
++ GOTO(out, rc = PTR_ERR(llap));
++
++ LASSERT(!llap->llap_nocache);
++ LASSERT(!PageWriteback(page));
++ set_page_writeback(page);
++
++ page_cache_get(page);
++ if (llap->llap_write_queued) {
++ LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
++ rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
++ llap->llap_cookie,
++ ASYNC_READY | ASYNC_URGENT);
++ } else {
++ rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE,
++ ASYNC_READY | ASYNC_URGENT);
++ }
++ if (rc) {
++ /* re-dirty page on error so it retries write */
++ if (PageWriteback(page))
++ end_page_writeback(page);
++
++ /* resend the page only if its IO was never started */
++ if (!PageError(page))
++ ll_redirty_page(page);
++
++ page_cache_release(page);
++ }
++out:
++ if (rc) {
++ if (!lli->lli_async_rc)
++ lli->lli_async_rc = rc;
++ unlock_page(page);
++ }
++ RETURN(rc);
++}
++
++/*
++ * for now we do our readpage the same on both 2.4 and 2.6. The kernel's
++ * read-ahead assumes it is valid to issue readpage all the way up to
++ * i_size, but our dlm locks make that not the case. We disable the
++ * kernel's read-ahead and do our own by walking ahead in the page cache,
++ * checking for dlm lock coverage. the main difference between 2.4 and
++ * 2.6 is how read-ahead gets batched and issued, but we're using our own,
++ * so they look the same.
++ */
++int ll_readpage(struct file *filp, struct page *page)
++{
++ struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
++ struct inode *inode = page->mapping->host;
++ struct obd_export *exp;
++ struct ll_async_page *llap;
++ struct obd_io_group *oig = NULL;
++ struct lustre_handle *lockh = NULL;
++ int rc;
++ ENTRY;
++
++ LASSERT(PageLocked(page));
++ LASSERT(!PageUptodate(page));
++ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset=%Lu=%#Lx\n",
++ inode->i_ino, inode->i_generation, inode,
++ (((loff_t)page->index) << CFS_PAGE_SHIFT),
++ (((loff_t)page->index) << CFS_PAGE_SHIFT));
++ LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
++
++ if (!ll_i2info(inode)->lli_smd) {
++ /* File with no objects - one big hole */
++ /* We use this only because remove_from_page_cache() is not
++ * exported; the page is made up to date again below. */
++ ll_truncate_complete_page(page);
++ clear_page(kmap(page));
++ kunmap(page);
++ SetPageUptodate(page);
++ unlock_page(page);
++ RETURN(0);
++ }
++
++ rc = oig_init(&oig);
++ if (rc < 0)
++ GOTO(out, rc);
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ GOTO(out, rc = -EINVAL);
++
++ if (fd->fd_flags & LL_FILE_GROUP_LOCKED)
++ lockh = &fd->fd_cwlockh;
++
++ llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_READPAGE, lockh);
++ if (IS_ERR(llap)) {
++ if (PTR_ERR(llap) == -ENOLCK) {
++ CWARN("ino %lu page %lu (%llu) not covered by "
++ "a lock (mmap?). check debug logs.\n",
++ inode->i_ino, page->index,
++ (long long)page->index << PAGE_CACHE_SHIFT);
++ }
++ GOTO(out, rc = PTR_ERR(llap));
++ }
++
++ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
++ ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
++ llap->llap_defer_uptodate);
++
++ if (llap->llap_defer_uptodate) {
++ /* This is the callpath if we got the page from a readahead */
++ llap->llap_ra_used = 1;
++ rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
++ fd->fd_flags);
++ if (rc > 0)
++ obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd,
++ NULL, oig);
++ LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n");
++ SetPageUptodate(page);
++ unlock_page(page);
++ GOTO(out_oig, rc = 0);
++ }
++
++ rc = ll_issue_page_read(exp, llap, oig, 0);
++ if (rc)
++ GOTO(out, rc);
++
++ LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
++ /* We have just requested the actual page we want, see if we can tack
++ * on some readahead to that page's RPC before it is sent. */
++ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
++ ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
++ fd->fd_flags);
++
++ rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
++
++out:
++ if (rc)
++ unlock_page(page);
++out_oig:
++ if (oig != NULL)
++ oig_release(oig);
++ RETURN(rc);
++}
++
++static void ll_file_put_pages(struct page **pages, int numpages)
++{
++ int i;
++ struct page **pp;
++ ENTRY;
++
++ for (i = 0, pp = pages; i < numpages; i++, pp++) {
++ if (*pp) {
++ LL_CDEBUG_PAGE(D_PAGE, (*pp), "free\n");
++ __ll_put_llap(*pp);
++ if (page_private(*pp))
++ CERROR("the llap wasn't freed\n");
++ (*pp)->mapping = NULL;
++ if (page_count(*pp) != 1)
++ CERROR("page %p, flags %#lx, count %i, private %p\n",
++ (*pp), (unsigned long)(*pp)->flags, page_count(*pp),
++ (void*)page_private(*pp));
++ __free_pages(*pp, 0);
++ }
++ }
++ OBD_FREE(pages, numpages * sizeof(struct page*));
++ EXIT;
++}
++
++static struct page **ll_file_prepare_pages(int numpages, struct inode *inode,
++ unsigned long first)
++{
++ struct page **pages;
++ int i;
++ int rc = 0;
++ ENTRY;
++
++ OBD_ALLOC(pages, sizeof(struct page *) * numpages);
++ if (pages == NULL)
++ RETURN(ERR_PTR(-ENOMEM));
++ for (i = 0; i < numpages; i++) {
++ struct page *page;
++ struct ll_async_page *llap;
++
++ page = alloc_pages(GFP_HIGHUSER, 0);
++ if (page == NULL)
++ GOTO(err, rc = -ENOMEM);
++ pages[i] = page;
++ /* llap_from_page needs page index and mapping to be set */
++ page->index = first++;
++ page->mapping = inode->i_mapping;
++ llap = llap_from_page(page, LLAP_ORIGIN_LOCKLESS_IO);
++ if (IS_ERR(llap))
++ GOTO(err, rc = PTR_ERR(llap));
++ llap->llap_lockless_io_page = 1;
++ }
++ RETURN(pages);
++err:
++ ll_file_put_pages(pages, numpages);
++ RETURN(ERR_PTR(rc));
++}
++
++static ssize_t ll_file_copy_pages(struct page **pages, int numpages,
++ const struct iovec *iov, unsigned long nsegs,
++ ssize_t iov_offset, loff_t pos, size_t count,
++ int rw)
++{
++ ssize_t amount = 0;
++ int i;
++ int updatechecksum = ll_i2sbi(pages[0]->mapping->host)->ll_flags &
++ LL_SBI_LLITE_CHECKSUM;
++ ENTRY;
++
++ for (i = 0; i < numpages; i++) {
++ unsigned offset, bytes, left = 0;
++ char *vaddr;
++
++ vaddr = kmap(pages[i]);
++ offset = pos & (CFS_PAGE_SIZE - 1);
++ bytes = min_t(unsigned, CFS_PAGE_SIZE - offset, count);
++ LL_CDEBUG_PAGE(D_PAGE, pages[i], "op = %s, addr = %p, "
++ "bytes = %u\n",
++ (rw == WRITE) ? "CFU" : "CTU",
++ vaddr + offset, bytes);
++ while (bytes > 0 && !left && nsegs) {
++ unsigned copy = min_t(ssize_t, bytes,
++ iov->iov_len - iov_offset);
++ if (rw == WRITE) {
++ left = copy_from_user(vaddr + offset,
++ iov->iov_base +iov_offset,
++ copy);
++ if (updatechecksum) {
++ struct ll_async_page *llap;
++
++ llap = llap_cast_private(pages[i]);
++ llap->llap_checksum =
++ init_checksum(OSC_DEFAULT_CKSUM);
++ llap->llap_checksum =
++ compute_checksum(llap->llap_checksum,
++ vaddr,CFS_PAGE_SIZE,
++ OSC_DEFAULT_CKSUM);
++ }
++ } else {
++ left = copy_to_user(iov->iov_base + iov_offset,
++ vaddr + offset, copy);
++ }
++
++ amount += copy;
++ count -= copy;
++ pos += copy;
++ iov_offset += copy;
++ bytes -= copy;
++ if (iov_offset == iov->iov_len) {
++ iov_offset = 0;
++ iov++;
++ nsegs--;
++ }
++ }
++ kunmap(pages[i]);
++ if (left) {
++ amount -= left;
++ break;
++ }
++ }
++ if (amount == 0)
++ RETURN(-EFAULT);
++ RETURN(amount);
++}
++
++static int ll_file_oig_pages(struct inode * inode, struct page **pages,
++ int numpages, loff_t pos, size_t count, int rw)
++{
++ struct obd_io_group *oig;
++ struct ll_inode_info *lli = ll_i2info(inode);
++ struct obd_export *exp;
++ loff_t org_pos = pos;
++ obd_flag brw_flags;
++ int rc;
++ int i;
++ ENTRY;
++
++ exp = ll_i2obdexp(inode);
++ if (exp == NULL)
++ RETURN(-EINVAL);
++ rc = oig_init(&oig);
++ if (rc)
++ RETURN(rc);
++ brw_flags = OBD_BRW_SRVLOCK;
++ if (cfs_capable(CFS_CAP_SYS_RESOURCE))
++ brw_flags |= OBD_BRW_NOQUOTA;
++
++ for (i = 0; i < numpages; i++) {
++ struct ll_async_page *llap;
++ unsigned from, bytes;
++
++ from = pos & (CFS_PAGE_SIZE - 1);
++ bytes = min_t(unsigned, CFS_PAGE_SIZE - from,
++ count - pos + org_pos);
++ llap = llap_cast_private(pages[i]);
++ LASSERT(llap);
++
++ lock_page(pages[i]);
++
++ LL_CDEBUG_PAGE(D_PAGE, pages[i], "offset "LPU64","
++ " from %u, bytes = %u\n",
++ pos, from, bytes);
++ LASSERTF(pos >> CFS_PAGE_SHIFT == pages[i]->index,
++ "wrong page index %lu (%lu)\n",
++ pages[i]->index,
++ (unsigned long)(pos >> CFS_PAGE_SHIFT));
++ rc = obd_queue_group_io(exp, lli->lli_smd, NULL, oig,
++ llap->llap_cookie,
++ (rw == WRITE) ?
++ OBD_BRW_WRITE:OBD_BRW_READ,
++ from, bytes, brw_flags,
++ ASYNC_READY | ASYNC_URGENT |
++ ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
++ if (rc) {
++ i++;
++ GOTO(out, rc);
++ }
++ pos += bytes;
++ }
++ rc = obd_trigger_group_io(exp, lli->lli_smd, NULL, oig);
++ if (rc)
++ GOTO(out, rc);
++ rc = oig_wait(oig);
++out:
++ while(--i >= 0)
++ unlock_page(pages[i]);
++ oig_release(oig);
++ RETURN(rc);
++}
++
++/* Advance through the passed iov: adjust the iov pointer as necessary
++ * and return the residual offset into the entry we end up pointing at;
++ * also reduce nr_segs as needed */
++static ssize_t ll_iov_advance(const struct iovec **iov, unsigned long *nr_segs,
++ ssize_t offset)
++{
++ while (*nr_segs > 0) {
++ if ((*iov)->iov_len > offset)
++ return offset;
++ offset -= (*iov)->iov_len;
++ (*iov)++;
++ (*nr_segs)--;
++ }
++ return 0;
++}
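
ll_file_lockless_io below re-derives its position in the iovec array
after every chunk with this helper; the contract is that the return
value is the offset into the entry the walk stops in, which the copy
loop then adds to iov_base. A runnable userspace check of that contract
(names are ours):

    #include <stdio.h>
    #include <sys/uio.h>

    static ssize_t iov_advance(const struct iovec **iov,
                               unsigned long *nr_segs, ssize_t offset)
    {
            while (*nr_segs > 0) {
                    if ((ssize_t)(*iov)->iov_len > offset)
                            return offset;  /* residual offset in entry */
                    offset -= (*iov)->iov_len;
                    (*iov)++;
                    (*nr_segs)--;
            }
            return 0;
    }

    int main(void)
    {
            char a[100], b[50];
            struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            const struct iovec *iv = vec;
            unsigned long nsegs = 2;
            ssize_t off = iov_advance(&iv, &nsegs, 120);

            /* 120 bytes: all of the first entry plus 20 into the second */
            printf("entry=%d off=%zd nsegs=%lu\n", (int)(iv - vec), off, nsegs);
            return 0;
    }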
++
++ssize_t ll_file_lockless_io(struct file *file, const struct iovec *iov,
++ unsigned long nr_segs,
++ loff_t *ppos, int rw, ssize_t count)
++{
++ loff_t pos;
++ struct inode *inode = file->f_dentry->d_inode;
++ ssize_t rc = 0;
++ int max_pages;
++ size_t amount = 0;
++ unsigned long first, last;
++ const struct iovec *iv = &iov[0];
++ unsigned long nsegs = nr_segs;
++ unsigned long offset = 0;
++ ENTRY;
++
++ if (rw == READ) {
++ loff_t isize;
++
++ ll_inode_size_lock(inode, 0);
++ isize = i_size_read(inode);
++ ll_inode_size_unlock(inode, 0);
++ if (*ppos >= isize)
++ GOTO(out, rc = 0);
++ if (*ppos + count >= isize)
++ count -= *ppos + count - isize;
++ if (count == 0)
++ GOTO(out, rc);
++ } else {
++ rc = generic_write_checks(file, ppos, &count, 0);
++ if (rc)
++ GOTO(out, rc);
++ rc = ll_remove_suid(file->f_dentry, file->f_vfsmnt);
++ if (rc)
++ GOTO(out, rc);
++ }
++
++ pos = *ppos;
++ first = pos >> CFS_PAGE_SHIFT;
++ last = (pos + count - 1) >> CFS_PAGE_SHIFT;
++ max_pages = PTLRPC_MAX_BRW_PAGES *
++ ll_i2info(inode)->lli_smd->lsm_stripe_count;
++ CDEBUG(D_INFO, "%u, stripe_count = %u\n",
++ PTLRPC_MAX_BRW_PAGES /* max_pages_per_rpc */,
++ ll_i2info(inode)->lli_smd->lsm_stripe_count);
++
++ while (first <= last && rc >= 0) {
++ int pages_for_io;
++ struct page **pages;
++ size_t bytes = count - amount;
++
++ pages_for_io = min_t(int, last - first + 1, max_pages);
++ pages = ll_file_prepare_pages(pages_for_io, inode, first);
++ if (IS_ERR(pages)) {
++ rc = PTR_ERR(pages);
++ break;
++ }
++ if (rw == WRITE) {
++ rc = ll_file_copy_pages(pages, pages_for_io, iv, nsegs,
++ offset, pos + amount, bytes,
++ rw);
++ if (rc < 0)
++ GOTO(put_pages, rc);
++ offset = ll_iov_advance(&iv, &nsegs, offset + rc);
++ bytes = rc;
++ }
++ rc = ll_file_oig_pages(inode, pages, pages_for_io,
++ pos + amount, bytes, rw);
++ if (rc)
++ GOTO(put_pages, rc);
++ if (rw == READ) {
++ rc = ll_file_copy_pages(pages, pages_for_io, iv, nsegs,
++ offset, pos + amount, bytes, rw);
++ if (rc < 0)
++ GOTO(put_pages, rc);
++ offset = ll_iov_advance(&iv, &nsegs, offset + rc);
++ bytes = rc;
++ }
++ amount += bytes;
++put_pages:
++ ll_file_put_pages(pages, pages_for_io);
++ first += pages_for_io;
++ /* a short read/write check */
++ if (pos + amount < ((loff_t)first << CFS_PAGE_SHIFT))
++ break;
++ /* Check if we ran out of userspace buffers. (how could that
++ happen?) */
++ if (nsegs == 0)
++ break;
++ }
++ /* NOTE: don't update i_size and KMS in the absence of LDLM locks,
++ * even if the write makes the file larger */
++ file_accessed(file);
++ if (rw == READ && amount < count && rc == 0) {
++ unsigned long not_cleared;
++
++ while (nsegs > 0) {
++ ssize_t to_clear = min_t(ssize_t, count - amount,
++ iv->iov_len - offset);
++ not_cleared = clear_user(iv->iov_base + offset,
++ to_clear);
++ amount += to_clear - not_cleared;
++ if (not_cleared) {
++ rc = -EFAULT;
++ break;
++ }
++ offset = 0;
++ iv++;
++ nsegs--;
++ }
++ }
++ if (amount > 0) {
++ lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
++ (rw == WRITE) ?
++ LPROC_LL_LOCKLESS_WRITE :
++ LPROC_LL_LOCKLESS_READ,
++ (long)amount);
++ *ppos += amount;
++ RETURN(amount);
++ }
++out:
++ RETURN(rc);
++}
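
One last sketch for this file: the page-range chunking that drives the
lockless-I/O loop. The batch limit stands in for PTLRPC_MAX_BRW_PAGES *
lsm_stripe_count; all the numbers here are invented, 4 KiB pages assumed.

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long long pos = 6000, count = 20000;
            unsigned long first = pos >> PAGE_SHIFT;
            unsigned long last = (pos + count - 1) >> PAGE_SHIFT;
            unsigned long max_pages = 256;
            unsigned long batch = last - first + 1 < max_pages ?
                                  last - first + 1 : max_pages;

            /* pages 1..6, so the first batch is 6 pages */
            printf("pages %lu..%lu, first batch of %lu\n", first, last, batch);
            return 0;
    }
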
+diff -urNad lustre~/lustre/llite/symlink.c lustre/lustre/llite/symlink.c
+--- lustre~/lustre/llite/symlink.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/llite/symlink.c 2009-03-12 11:02:51.000000000 +0100
+@@ -177,8 +177,12 @@
+ up(&lli->lli_size_sem);
+ }
+ if (rc) {
++#ifdef HAVE_PATH_RELEASE
+ path_release(nd); /* Kernel assumes that ->follow_link()
+ releases nameidata on error */
++#else
++ path_put(&nd->path);
++#endif
+ GOTO(out, rc);
+ }
+
+diff -urNad lustre~/lustre/lvfs/lvfs_linux.c lustre/lustre/lvfs/lvfs_linux.c
+--- lustre~/lustre/lvfs/lvfs_linux.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/lvfs/lvfs_linux.c 2009-03-12 11:02:51.000000000 +0100
+@@ -148,10 +148,10 @@
+ */
+
+ save->fs = get_fs();
+- LASSERT(atomic_read(&current->fs->pwd->d_count));
++ LASSERT(atomic_read(&cfs_fs_pwd(current->fs)->d_count));
+ LASSERT(atomic_read(&new_ctx->pwd->d_count));
+- save->pwd = dget(current->fs->pwd);
+- save->pwdmnt = mntget(current->fs->pwdmnt);
++ save->pwd = dget(cfs_fs_pwd(current->fs));
++ save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
+ save->luc.luc_umask = current->fs->umask;
+
+ LASSERT(save->pwd);
+@@ -205,10 +205,10 @@
+ atomic_read(&current->fs->pwdmnt->mnt_count));
+ */
+
+- LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
+- current->fs->pwd, new_ctx->pwd);
+- LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
+- current->fs->pwdmnt, new_ctx->pwdmnt);
++ LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
++ cfs_fs_pwd(current->fs), new_ctx->pwd);
++ LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
++ cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
+
+ set_fs(saved->fs);
+ ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
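
These lvfs hunks stop dereferencing current->fs->pwd / ->pwdmnt
directly because struct fs_struct stores the working directory as a
struct path on 2.6.25+ kernels. The cfs_fs_pwd()/cfs_fs_mnt() accessors
are presumably defined along these lines in libcfs (the guard name here
is an assumption):

    #ifdef HAVE_FS_STRUCT_USE_PATH
    # define cfs_fs_pwd(fs)         ((fs)->pwd.dentry)
    # define cfs_fs_mnt(fs)         ((fs)->pwd.mnt)
    #else
    # define cfs_fs_pwd(fs)         ((fs)->pwd)
    # define cfs_fs_mnt(fs)         ((fs)->pwdmnt)
    #endif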
+diff -urNad lustre~/lustre/mgc/mgc_request.c lustre/lustre/mgc/mgc_request.c
+--- lustre~/lustre/mgc/mgc_request.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/mgc/mgc_request.c 2009-03-12 11:02:51.000000000 +0100
+@@ -415,7 +415,7 @@
+ obd->obd_lvfs_ctxt.fs = get_ds();
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+- dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd,
++ dentry = lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+ strlen(MOUNT_CONFIGS_DIR));
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ if (IS_ERR(dentry)) {
+diff -urNad lustre~/lustre/obdclass/linux/linux-module.c lustre/lustre/obdclass/linux/linux-module.c
+--- lustre~/lustre/obdclass/linux/linux-module.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/obdclass/linux/linux-module.c 2009-03-12 11:02:51.000000000 +0100
+@@ -419,13 +419,14 @@
+ ENTRY;
+
+ obd_sysctl_init();
+- proc_lustre_root = proc_mkdir("lustre", proc_root_fs);
++ proc_lustre_root = lprocfs_register("fs/lustre", NULL,
++ lprocfs_base, NULL);
+ if (!proc_lustre_root) {
+ printk(KERN_ERR
+ "LustreError: error registering /proc/fs/lustre\n");
+ RETURN(-ENOMEM);
+ }
+- proc_version = lprocfs_add_vars(proc_lustre_root, lprocfs_base, NULL);
++
+ entry = create_proc_entry("devices", 0444, proc_lustre_root);
+ if (entry == NULL) {
+ CERROR("error registering /proc/fs/lustre/devices\n");
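
This hunk is needed because the proc_root_fs symbol stopped being
available to modules around 2.6.26, so /proc/fs/lustre can no longer be
created with proc_mkdir("lustre", proc_root_fs). Registering the full
"fs/lustre" path lets the proc code resolve the existing "fs" directory
itself; roughly:

    /* sketch: lprocfs_register() resolves "fs/lustre" relative to the
     * /proc root, so no proc_root_fs symbol is required */
    proc_lustre_root = lprocfs_register("fs/lustre", NULL,
                                        lprocfs_base, NULL);
    if (proc_lustre_root == NULL)
            return -ENOMEM;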
+diff -urNad lustre~/lustre/obdclass/linux/linux-sysctl.c lustre/lustre/obdclass/linux/linux-sysctl.c
+--- lustre~/lustre/obdclass/linux/linux-sysctl.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/obdclass/linux/linux-sysctl.c 2009-03-12 11:02:51.000000000 +0100
+@@ -56,7 +56,9 @@
+
+ cfs_sysctl_table_header_t *obd_table_header = NULL;
+
+-#define OBD_SYSCTL 300
++#ifndef HAVE_SYSCTL_UNNUMBERED
++
++#define CTL_LUSTRE 300
+
+ enum {
+ OBD_FAIL_LOC = 1, /* control test failures instrumentation */
+@@ -74,6 +76,23 @@
+ OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
+ OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */
+ };
++#else
++#define CTL_LUSTRE CTL_UNNUMBERED
++#define OBD_FAIL_LOC CTL_UNNUMBERED
++#define OBD_FAIL_VAL CTL_UNNUMBERED
++#define OBD_TIMEOUT CTL_UNNUMBERED
++#define OBD_DUMP_ON_TIMEOUT CTL_UNNUMBERED
++#define OBD_MEMUSED CTL_UNNUMBERED
++#define OBD_PAGESUSED CTL_UNNUMBERED
++#define OBD_MAXMEMUSED CTL_UNNUMBERED
++#define OBD_MAXPAGESUSED CTL_UNNUMBERED
++#define OBD_SYNCFILTER CTL_UNNUMBERED
++#define OBD_LDLM_TIMEOUT CTL_UNNUMBERED
++#define OBD_DUMP_ON_EVICTION CTL_UNNUMBERED
++#define OBD_DEBUG_PEER_ON_TIMEOUT CTL_UNNUMBERED
++#define OBD_ALLOC_FAIL_RATE CTL_UNNUMBERED
++#define OBD_MAX_DIRTY_PAGES CTL_UNNUMBERED
++#endif
+
+ int LL_PROC_PROTO(proc_fail_loc)
+ {
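
The #else branch above targets kernels where binary sysctl numbers are
deprecated (HAVE_SYSCTL_UNNUMBERED, 2.6.24+): every fixed ctl_name is
mapped to CTL_UNNUMBERED so the tables register by procname alone. The
effect on an individual entry, sketched with illustrative field values:

    {
            .ctl_name     = OBD_TIMEOUT,  /* CTL_UNNUMBERED on new kernels */
            .procname     = "timeout",
            .data         = &obd_timeout,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = &proc_dointvec,
    },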
+@@ -120,7 +139,8 @@
+ obd_max_dirty_pages = 4 << (20 - CFS_PAGE_SHIFT);
+ }
+ } else {
+- char buf[21];
++ char buf[22];
++ struct ctl_table dummy;
+ int len;
+
+ len = lprocfs_read_frac_helper(buf, sizeof(buf),
+@@ -129,7 +149,13 @@
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
++
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++
++ rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
++ if (rc)
+ return -EFAULT;
+ *lenp = len;
+ }
+@@ -152,7 +178,8 @@
+ (unsigned int*)table->data,
+ OBD_ALLOC_FAIL_MULT);
+ } else {
+- char buf[21];
++ char buf[22];
++ struct ctl_table dummy;
+ int len;
+
+ len = lprocfs_read_frac_helper(buf, sizeof(buf),
+@@ -161,7 +188,12 @@
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++
++ rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
++ if (rc)
+ return -EFAULT;
+ *lenp = len;
+ }
+@@ -172,6 +204,7 @@
+
+ int LL_PROC_PROTO(proc_memory_alloc)
+ {
++ struct ctl_table dummy;
+ char buf[22];
+ int len;
+ DECLARE_LL_PROC_PPOS_DECL;
+@@ -187,15 +220,17 @@
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
+- return -EFAULT;
+- *lenp = len;
+- *ppos += *lenp;
+- return 0;
++
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++
++ return ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
+ }
+
+ int LL_PROC_PROTO(proc_pages_alloc)
+ {
++ struct ctl_table dummy;
+ char buf[22];
+ int len;
+ DECLARE_LL_PROC_PPOS_DECL;
+@@ -211,15 +246,17 @@
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
+- return -EFAULT;
+- *lenp = len;
+- *ppos += *lenp;
+- return 0;
++
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++
++ return ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
+ }
+
+ int LL_PROC_PROTO(proc_mem_max)
+ {
++ struct ctl_table dummy;
+ char buf[22];
+ int len;
+ DECLARE_LL_PROC_PPOS_DECL;
+@@ -235,17 +272,19 @@
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
+- return -EFAULT;
+- *lenp = len;
+- *ppos += *lenp;
+- return 0;
++
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++
++ return ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
+ }
+
+ int LL_PROC_PROTO(proc_pages_max)
+ {
+ char buf[22];
+ int len;
++ struct ctl_table dummy;
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ if (!*lenp || (*ppos && !write)) {
+@@ -254,16 +293,17 @@
+ }
+ if (write)
+ return -EINVAL;
++ dummy = *table;
++ dummy.data = buf;
++ dummy.maxlen = sizeof(buf);
++ len = snprintf(buf, sizeof(buf), LPU64,
++ obd_pages_max());
+
+- len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max());
+- if (len > *lenp)
+- len = *lenp;
+- buf[len] = '\0';
+- if (copy_to_user(buffer, buf, len))
+- return -EFAULT;
+- *lenp = len;
+- *ppos += *lenp;
+- return 0;
++ if (len > *lenp)
++ len = *lenp;
++ buf[len] = '\0';
++
++ return ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
+ }
+
+ static cfs_sysctl_table_t obd_table[] = {
+@@ -281,7 +321,8 @@
+ .data = &obd_fail_val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = &proc_dointvec
++ .proc_handler = &proc_dointvec,
++ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = OBD_TIMEOUT,
+@@ -297,7 +338,7 @@
+ .data = &obd_debug_peer_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = &proc_dointvec
++ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = OBD_DUMP_ON_TIMEOUT,
+@@ -305,7 +346,7 @@
+ .data = &obd_dump_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = &proc_dointvec
++ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = OBD_DUMP_ON_EVICTION,
+@@ -313,7 +354,7 @@
+ .data = &obd_dump_on_eviction,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = &proc_dointvec
++ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = OBD_MEMUSED,
+@@ -321,7 +362,7 @@
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+- .proc_handler = &proc_memory_alloc
++ .proc_handler = &proc_memory_alloc,
+ },
+ {
+ .ctl_name = OBD_PAGESUSED,
+@@ -329,7 +370,7 @@
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+- .proc_handler = &proc_pages_alloc
++ .proc_handler = &proc_pages_alloc,
+ },
+ {
+ .ctl_name = OBD_MAXMEMUSED,
+@@ -337,7 +378,7 @@
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+- .proc_handler = &proc_mem_max
++ .proc_handler = &proc_mem_max,
+ },
+ {
+ .ctl_name = OBD_MAXPAGESUSED,
+@@ -345,7 +386,7 @@
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+- .proc_handler = &proc_pages_max
++ .proc_handler = &proc_pages_max,
+ },
+ {
+ .ctl_name = OBD_LDLM_TIMEOUT,
+@@ -378,7 +419,7 @@
+
+ static cfs_sysctl_table_t parent_table[] = {
+ {
+- .ctl_name = OBD_SYSCTL,
++ .ctl_name = CTL_LUSTRE,
+ .procname = "lustre",
+ .data = NULL,
+ .maxlen = 0,
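
All of the read-only handlers above are converted from hand-rolled
copy_to_user() calls to one idiom: clone the ctl_table into a stack
`dummy`, point dummy.data at a local buffer, and let ll_proc_dostring()
(assumed to be a compat wrapper around the kernel's proc_dostring) do
the user copy and *ppos bookkeeping for both old- and new-style handler
signatures. Distilled into a minimal sketch (proc_example and the
printed value are illustrative only):

    int LL_PROC_PROTO(proc_example)
    {
            char buf[22];
            struct ctl_table dummy = *table;
            DECLARE_LL_PROC_PPOS_DECL;

            /* read-only: a real handler would reject write here */
            dummy.data   = buf;
            dummy.maxlen = sizeof(buf);
            snprintf(buf, sizeof(buf), "%d\n", 42);
            return ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
    }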
+diff -urNad lustre~/lustre/obdclass/lprocfs_status.c lustre/lustre/obdclass/lprocfs_status.c
+--- lustre~/lustre/obdclass/lprocfs_status.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/obdclass/lprocfs_status.c 2009-03-12 11:02:51.000000000 +0100
+@@ -151,7 +151,7 @@
+
+ LPROCFS_ENTRY();
+ OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
+- if (!dp->deleted && dp->read_proc)
++ if (!LPROCFS_CHECK_DELETED(dp) && dp->read_proc)
+ rc = dp->read_proc(page, &start, *ppos, PAGE_SIZE,
+ &eof, dp->data);
+ LPROCFS_EXIT();
+@@ -191,7 +191,7 @@
+ int rc = -EIO;
+
+ LPROCFS_ENTRY();
+- if (!dp->deleted && dp->write_proc)
++ if (!LPROCFS_CHECK_DELETED(dp) && dp->write_proc)
+ rc = dp->write_proc(f, buf, size, dp->data);
+ LPROCFS_EXIT();
+ return rc;
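
The two lprocfs_status.c hunks swap the direct dp->deleted test for
LPROCFS_CHECK_DELETED(dp) because newer kernels dropped the `deleted`
field from struct proc_dir_entry. The macro is presumably defined along
these lines (the guard name is an assumption):

    #ifdef HAVE_PROCFS_DELETED
    # define LPROCFS_CHECK_DELETED(dp)      ((dp)->deleted)
    #else
    # define LPROCFS_CHECK_DELETED(dp)      (0)
    #endif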
+diff -urNad lustre~/lustre/obdclass/lprocfs_status.c.orig lustre/lustre/obdclass/lprocfs_status.c.orig
+--- lustre~/lustre/obdclass/lprocfs_status.c.orig 1970-01-01 00:00:00.000000000 +0000
++++ lustre/lustre/obdclass/lprocfs_status.c.orig 2009-03-12 10:32:27.000000000 +0100
+@@ -0,0 +1,2062 @@
++/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
++ * vim:expandtab:shiftwidth=8:tabstop=8:
++ *
++ * GPL HEADER START
++ *
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 only,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License version 2 for more details (a copy is included
++ * in the LICENSE file that accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License
++ * version 2 along with this program; If not, see
++ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
++ *
++ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
++ * CA 95054 USA or visit www.sun.com if you need additional information or
++ * have any questions.
++ *
++ * GPL HEADER END
++ */
++/*
++ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
++ * Use is subject to license terms.
++ */
++/*
++ * This file is part of Lustre, http://www.lustre.org/
++ * Lustre is a trademark of Sun Microsystems, Inc.
++ *
++ * lustre/obdclass/lprocfs_status.c
++ *
++ * Author: Hariharan Thantry <thantry at users.sourceforge.net>
++ */
++
++#ifndef EXPORT_SYMTAB
++# define EXPORT_SYMTAB
++#endif
++#define DEBUG_SUBSYSTEM S_CLASS
++
++#ifndef __KERNEL__
++# include <liblustre.h>
++#endif
++
++#include <obd_class.h>
++#include <lprocfs_status.h>
++#include <lustre_fsfilt.h>
++
++#if defined(LPROCFS)
++
++#define MAX_STRING_SIZE 128
++
++/* for bug 10866, global variable */
++DECLARE_RWSEM(_lprocfs_lock);
++EXPORT_SYMBOL(_lprocfs_lock);
++
++int lprocfs_seq_release(struct inode *inode, struct file *file)
++{
++ LPROCFS_EXIT();
++ return seq_release(inode, file);
++}
++EXPORT_SYMBOL(lprocfs_seq_release);
++
++struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head,
++ const char *name)
++{
++ struct proc_dir_entry *temp;
++
++ if (head == NULL)
++ return NULL;
++
++ LPROCFS_ENTRY();
++ temp = head->subdir;
++ while (temp != NULL) {
++ if (strcmp(temp->name, name) == 0) {
++ LPROCFS_EXIT();
++ return temp;
++ }
++
++ temp = temp->next;
++ }
++ LPROCFS_EXIT();
++ return NULL;
++}
++
++/* lprocfs API calls */
++
++/* Function that emulates snprintf but also has the side effect of advancing
++ the page pointer for the next write into the buffer, incrementing the total
++ length written to the buffer, and decrementing the size left in the
++ buffer. */
++static int lprocfs_obd_snprintf(char **page, int end, int *len,
++ const char *format, ...)
++{
++ va_list list;
++ int n;
++
++ if (*len >= end)
++ return 0;
++
++ va_start(list, format);
++ n = vsnprintf(*page, end - *len, format, list);
++ va_end(list);
++
++ *page += n; *len += n;
++ return n;
++}
++
++int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
++ read_proc_t *read_proc, write_proc_t *write_proc,
++ void *data)
++{
++ struct proc_dir_entry *proc;
++ mode_t mode = 0;
++
++ if (root == NULL || name == NULL)
++ return -EINVAL;
++ if (read_proc)
++ mode = 0444;
++ if (write_proc)
++ mode |= 0200;
++ proc = create_proc_entry(name, mode, root);
++ if (!proc) {
++ CERROR("LprocFS: No memory to create /proc entry %s", name);
++ return -ENOMEM;
++ }
++ proc->read_proc = read_proc;
++ proc->write_proc = write_proc;
++ proc->data = data;
++ return 0;
++}
++
++static ssize_t lprocfs_fops_read(struct file *f, char __user *buf, size_t size,
++ loff_t *ppos)
++{
++ struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
++ char *page, *start = NULL;
++ int rc = 0, eof = 1, count;
++
++ if (*ppos >= PAGE_SIZE)
++ return 0;
++
++ page = (char *)__get_free_page(GFP_KERNEL);
++ if (page == NULL)
++ return -ENOMEM;
++
++ LPROCFS_ENTRY();
++ OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10);
++ if (!dp->deleted && dp->read_proc)
++ rc = dp->read_proc(page, &start, *ppos, PAGE_SIZE,
++ &eof, dp->data);
++ LPROCFS_EXIT();
++ if (rc <= 0)
++ goto out;
++
++ /* for lustre proc read, the read count must be less than PAGE_SIZE */
++ LASSERT(eof == 1);
++
++ if (start == NULL) {
++ rc -= *ppos;
++ if (rc < 0)
++ rc = 0;
++ if (rc == 0)
++ goto out;
++ start = page + *ppos;
++ } else if (start < page) {
++ start = page;
++ }
++
++ count = (rc < size) ? rc : size;
++ if (copy_to_user(buf, start, count)) {
++ rc = -EFAULT;
++ goto out;
++ }
++ *ppos += count;
++
++out:
++ free_page((unsigned long)page);
++ return rc;
++}
++
++static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf,
++ size_t size, loff_t *ppos)
++{
++ struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
++ int rc = -EIO;
++
++ LPROCFS_ENTRY();
++ if (!dp->deleted && dp->write_proc)
++ rc = dp->write_proc(f, buf, size, dp->data);
++ LPROCFS_EXIT();
++ return rc;
++}
++
++static struct file_operations lprocfs_generic_fops = {
++ .owner = THIS_MODULE,
++ .read = lprocfs_fops_read,
++ .write = lprocfs_fops_write,
++};
++
++int lprocfs_evict_client_open(struct inode *inode, struct file *f)
++{
++ struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
++ struct obd_device *obd = dp->data;
++
++ atomic_inc(&obd->obd_evict_inprogress);
++
++ return 0;
++}
++
++int lprocfs_evict_client_release(struct inode *inode, struct file *f)
++{
++ struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode);
++ struct obd_device *obd = dp->data;
++
++ atomic_dec(&obd->obd_evict_inprogress);
++ wake_up(&obd->obd_evict_inprogress_waitq);
++
++ return 0;
++}
++
++struct file_operations lprocfs_evict_client_fops = {
++ .owner = THIS_MODULE,
++ .read = lprocfs_fops_read,
++ .write = lprocfs_fops_write,
++ .open = lprocfs_evict_client_open,
++ .release = lprocfs_evict_client_release,
++};
++EXPORT_SYMBOL(lprocfs_evict_client_fops);
++
++/**
++ * Add /proc entries.
++ *
++ * \param root [in] The parent proc entry on which new entry will be added.
++ * \param list [in] Array of proc entries to be added.
++ * \param data [in] The argument to be passed when entries read/write routines
++ * are called through /proc file.
++ *
++ * \retval 0 on success
++ * < 0 on error
++ */
++int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
++ void *data)
++{
++ if (root == NULL || list == NULL)
++ return -EINVAL;
++
++ while (list->name != NULL) {
++ struct proc_dir_entry *cur_root, *proc;
++ char *pathcopy, *cur, *next, pathbuf[64];
++ int pathsize = strlen(list->name) + 1;
++
++ proc = NULL;
++ cur_root = root;
++
++ /* need copy of path for strsep */
++ if (strlen(list->name) > sizeof(pathbuf) - 1) {
++ OBD_ALLOC(pathcopy, pathsize);
++ if (pathcopy == NULL)
++ return -ENOMEM;
++ } else {
++ pathcopy = pathbuf;
++ }
++
++ next = pathcopy;
++ strcpy(pathcopy, list->name);
++
++ while (cur_root != NULL && (cur = strsep(&next, "/"))) {
++ if (*cur =='\0') /* skip double/trailing "/" */
++ continue;
++
++ proc = lprocfs_srch(cur_root, cur);
++ CDEBUG(D_OTHER, "cur_root=%s, cur=%s, next=%s, (%s)\n",
++ cur_root->name, cur, next,
++ (proc ? "exists" : "new"));
++ if (next != NULL) {
++ cur_root = (proc ? proc :
++ proc_mkdir(cur, cur_root));
++ } else if (proc == NULL) {
++ mode_t mode = 0;
++ if (list->proc_mode != 0000) {
++ mode = list->proc_mode;
++ } else {
++ if (list->read_fptr)
++ mode = 0444;
++ if (list->write_fptr)
++ mode |= 0200;
++ }
++ proc = create_proc_entry(cur, mode, cur_root);
++ }
++ }
++
++ if (pathcopy != pathbuf)
++ OBD_FREE(pathcopy, pathsize);
++
++ if (cur_root == NULL || proc == NULL) {
++ CERROR("LprocFS: No memory to create /proc entry %s",
++ list->name);
++ return -ENOMEM;
++ }
++
++ if (list->fops)
++ proc->proc_fops = list->fops;
++ else
++ proc->proc_fops = &lprocfs_generic_fops;
++ proc->read_proc = list->read_fptr;
++ proc->write_proc = list->write_fptr;
++ proc->data = (list->data ? list->data : data);
++ list++;
++ }
++ return 0;
++}
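
lprocfs_add_vars() treats each name as a '/'-separated path under root,
creating intermediate directories on demand and wiring
read_fptr/write_fptr into lprocfs_generic_fops. A minimal caller-side
table, sketched with the { name, read, write, data } layout assumed and
readers borrowed from later in this file:

    static struct lprocfs_vars example_vars[] = {
            { "uuid",        lprocfs_rd_uuid,        NULL, NULL },
            { "num_exports", lprocfs_rd_num_exports, NULL, NULL },
            { NULL }
    };
    /* rc = lprocfs_add_vars(root, example_vars, obd); */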
++
++void lprocfs_remove(struct proc_dir_entry **rooth)
++{
++ struct proc_dir_entry *root = *rooth;
++ struct proc_dir_entry *temp = root;
++ struct proc_dir_entry *rm_entry;
++ struct proc_dir_entry *parent;
++
++ if (!root)
++ return;
++ *rooth = NULL;
++
++ parent = root->parent;
++ LASSERT(parent != NULL);
++ LPROCFS_WRITE_ENTRY(); /* search vs remove race */
++
++ while (1) {
++ while (temp->subdir != NULL)
++ temp = temp->subdir;
++
++ rm_entry = temp;
++ temp = temp->parent;
++
++ /* Memory corruption once caused this to fail, and
++ without this LASSERT we would loop here forever. */
++ LASSERTF(strlen(rm_entry->name) == rm_entry->namelen,
++ "0x%p %s/%s len %d\n", rm_entry, temp->name,
++ rm_entry->name, (int)strlen(rm_entry->name));
++
++ /* Now the rm_entry->deleted flag is protected
++ * by _lprocfs_lock. */
++ rm_entry->data = NULL;
++ remove_proc_entry(rm_entry->name, temp);
++ if (temp == parent)
++ break;
++ }
++ LPROCFS_WRITE_EXIT();
++}
++
++struct proc_dir_entry *lprocfs_register(const char *name,
++ struct proc_dir_entry *parent,
++ struct lprocfs_vars *list, void *data)
++{
++ struct proc_dir_entry *newchild;
++
++ newchild = lprocfs_srch(parent, name);
++ if (newchild != NULL) {
++ CERROR(" Lproc: Attempting to register %s more than once \n",
++ name);
++ return ERR_PTR(-EALREADY);
++ }
++
++ newchild = proc_mkdir(name, parent);
++ if (newchild != NULL && list != NULL) {
++ int rc = lprocfs_add_vars(newchild, list, data);
++ if (rc) {
++ lprocfs_remove(&newchild);
++ return ERR_PTR(rc);
++ }
++ }
++ return newchild;
++}
++
++/* Generic callbacks */
++int lprocfs_rd_uint(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ unsigned int *temp = (unsigned int *)data;
++ return snprintf(page, count, "%u\n", *temp);
++}
++
++int lprocfs_wr_uint(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ unsigned *p = data;
++ char dummy[MAX_STRING_SIZE + 1] = { '\0' }, *end;
++ unsigned long tmp;
++
++ if (count >= sizeof(dummy) || count == 0)
++ return -EINVAL;
++
++ if (copy_from_user(dummy, buffer, count))
++ return -EFAULT;
++
++ tmp = simple_strtoul(dummy, &end, 0);
++ if (dummy == end)
++ return -EINVAL;
++
++ *p = (unsigned int)tmp;
++ return count;
++}
++
++int lprocfs_rd_u64(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ LASSERT(data != NULL);
++ *eof = 1;
++ return snprintf(page, count, LPU64"\n", *(__u64 *)data);
++}
++
++int lprocfs_rd_atomic(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ atomic_t *atom = (atomic_t *)data;
++ LASSERT(atom != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%d\n", atomic_read(atom));
++}
++
++int lprocfs_wr_atomic(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ atomic_t *atm = data;
++ int val = 0;
++ int rc;
++
++ rc = lprocfs_write_helper(buffer, count, &val);
++ if (rc < 0)
++ return rc;
++
++ if (val <= 0)
++ return -ERANGE;
++
++ atomic_set(atm, val);
++ return count;
++}
++
++int lprocfs_rd_uuid(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device*)data;
++
++ LASSERT(obd != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%s\n", obd->obd_uuid.uuid);
++}
++
++int lprocfs_rd_name(char *page, char **start, off_t off, int count,
++ int *eof, void* data)
++{
++ struct obd_device *dev = (struct obd_device *)data;
++
++ LASSERT(dev != NULL);
++ LASSERT(dev->obd_name != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%s\n", dev->obd_name);
++}
++
++int lprocfs_rd_fstype(char *page, char **start, off_t off, int count, int *eof,
++ void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++
++ LASSERT(obd != NULL);
++ LASSERT(obd->obd_fsops != NULL);
++ LASSERT(obd->obd_fsops->fs_type != NULL);
++ return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type);
++}
++
++int lprocfs_rd_blksize(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ *eof = 1;
++ rc = snprintf(page, count, "%u\n", osfs.os_bsize);
++ }
++ return rc;
++}
++
++int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ __u32 blk_size = osfs.os_bsize >> 10;
++ __u64 result = osfs.os_blocks;
++
++ while (blk_size >>= 1)
++ result <<= 1;
++
++ *eof = 1;
++ rc = snprintf(page, count, LPU64"\n", result);
++ }
++ return rc;
++}
++
++int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ __u32 blk_size = osfs.os_bsize >> 10;
++ __u64 result = osfs.os_bfree;
++
++ while (blk_size >>= 1)
++ result <<= 1;
++
++ *eof = 1;
++ rc = snprintf(page, count, LPU64"\n", result);
++ }
++ return rc;
++}
++
++int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ __u32 blk_size = osfs.os_bsize >> 10;
++ __u64 result = osfs.os_bavail;
++
++ while (blk_size >>= 1)
++ result <<= 1;
++
++ *eof = 1;
++ rc = snprintf(page, count, LPU64"\n", result);
++ }
++ return rc;
++}
++
++int lprocfs_rd_filestotal(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ *eof = 1;
++ rc = snprintf(page, count, LPU64"\n", osfs.os_files);
++ }
++
++ return rc;
++}
++
++int lprocfs_rd_filesfree(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_statfs osfs;
++ int rc = obd_statfs(data, &osfs, cfs_time_current_64() - HZ,
++ OBD_STATFS_NODELAY);
++ if (!rc) {
++ *eof = 1;
++ rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
++ }
++ return rc;
++}
++
++int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ struct obd_import *imp;
++ char *imp_state_name = NULL;
++ int rc = 0;
++
++ LASSERT(obd != NULL);
++ LPROCFS_CLIMP_CHECK(obd);
++ imp = obd->u.cli.cl_import;
++ imp_state_name = ptlrpc_import_state_name(imp->imp_state);
++ *eof = 1;
++ rc = snprintf(page, count, "%s\t%s%s\n",
++ obd2cli_tgt(obd), imp_state_name,
++ imp->imp_deactive ? "\tDEACTIVATED" : "");
++
++ LPROCFS_CLIMP_EXIT(obd);
++ return rc;
++}
++
++int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device*)data;
++ struct ptlrpc_connection *conn;
++ int rc = 0;
++
++ LASSERT(obd != NULL);
++ LPROCFS_CLIMP_CHECK(obd);
++ conn = obd->u.cli.cl_import->imp_connection;
++ LASSERT(conn != NULL);
++ *eof = 1;
++ rc = snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid);
++
++ LPROCFS_CLIMP_EXIT(obd);
++ return rc;
++}
++
++#define flag2str(flag) \
++ if (imp->imp_##flag && max - len > 0) \
++ len += snprintf(str + len, max - len, " " #flag);
++
++/**
++ * Append a space separated list of current set flags to str.
++ */
++static int obd_import_flags2str(struct obd_import *imp, char *str,
++ int max)
++{
++ int len = 0;
++
++ if (imp->imp_obd->obd_no_recov)
++ len += snprintf(str, max - len, " no_recov");
++
++ flag2str(invalid);
++ flag2str(deactive);
++ flag2str(replayable);
++ flag2str(pingable);
++ flag2str(recon_bk);
++ flag2str(last_recon);
++ return len;
++}
++#undef flag2str
++
++int lprocfs_rd_import(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ struct obd_import *imp;
++ char *imp_state_name = NULL;
++ int rc = 0;
++
++ LASSERT(obd != NULL);
++ LPROCFS_CLIMP_CHECK(obd);
++ imp = obd->u.cli.cl_import;
++ imp_state_name = ptlrpc_import_state_name(imp->imp_state);
++ *eof = 1;
++
++ rc = snprintf(page, count,
++ "import: %s\n"
++ " target: %s@%s\n"
++ " state: %s\n"
++ " inflight: %u\n"
++ " unregistering: %u\n"
++ " conn_cnt: %u\n"
++ " generation: %u\n"
++ " inval_cnt: %u\n"
++ " last_replay_transno: "LPU64"\n"
++ " peer_committed_transno: "LPU64"\n"
++ " last_trasno_checked: "LPU64"\n"
++ " flags:",
++ obd->obd_name,
++ obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid,
++ imp_state_name,
++ atomic_read(&imp->imp_inflight),
++ atomic_read(&imp->imp_unregistering),
++ imp->imp_conn_cnt,
++ imp->imp_generation,
++ atomic_read(&imp->imp_inval_count),
++ imp->imp_last_replay_transno,
++ imp->imp_peer_committed_transno,
++ imp->imp_last_transno_checked);
++ rc += obd_import_flags2str(imp, page + rc, count - rc);
++ rc += snprintf(page+rc, count - rc, "\n");
++ LPROCFS_CLIMP_EXIT(obd);
++ return rc;
++}
++
++int lprocfs_at_hist_helper(char *page, int count, int rc,
++ struct adaptive_timeout *at)
++{
++ int i;
++ for (i = 0; i < AT_BINS; i++)
++ rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]);
++ rc += snprintf(page + rc, count - rc, "\n");
++ return rc;
++}
++
++/* See also ptlrpc_lprocfs_rd_timeouts */
++int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ struct obd_import *imp;
++ unsigned int cur, worst;
++ time_t now, worstt;
++ struct dhms ts;
++ int i, rc = 0;
++
++ LASSERT(obd != NULL);
++ LPROCFS_CLIMP_CHECK(obd);
++ imp = obd->u.cli.cl_import;
++ *eof = 1;
++
++ now = cfs_time_current_sec();
++
++ /* Some network health info for kicks */
++ s2dhms(&ts, now - imp->imp_last_reply_time);
++ rc += snprintf(page + rc, count - rc,
++ "%-10s : %ld, "DHMS_FMT" ago\n",
++ "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
++
++ cur = at_get(&imp->imp_at.iat_net_latency);
++ worst = imp->imp_at.iat_net_latency.at_worst_ever;
++ worstt = imp->imp_at.iat_net_latency.at_worst_time;
++ s2dhms(&ts, now - worstt);
++ rc += snprintf(page + rc, count - rc,
++ "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
++ "network", cur, worst, worstt, DHMS_VARS(&ts));
++ rc = lprocfs_at_hist_helper(page, count, rc,
++ &imp->imp_at.iat_net_latency);
++
++ for(i = 0; i < IMP_AT_MAX_PORTALS; i++) {
++ if (imp->imp_at.iat_portal[i] == 0)
++ break;
++ cur = at_get(&imp->imp_at.iat_service_estimate[i]);
++ worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
++ worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
++ s2dhms(&ts, now - worstt);
++ rc += snprintf(page + rc, count - rc,
++ "portal %-2d : cur %3u worst %3u (at %ld, "
++ DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
++ cur, worst, worstt, DHMS_VARS(&ts));
++ rc = lprocfs_at_hist_helper(page, count, rc,
++ &imp->imp_at.iat_service_estimate[i]);
++ }
++
++ LPROCFS_CLIMP_EXIT(obd);
++ return rc;
++}
++
++static const char *obd_connect_names[] = {
++ "read_only",
++ "lov_index",
++ "unused",
++ "write_grant",
++ "server_lock",
++ "version",
++ "request_portal",
++ "acl",
++ "xattr",
++ "create_on_write",
++ "truncate_lock",
++ "initial_transno",
++ "inode_bit_locks",
++ "join_file",
++ "getattr_by_fid",
++ "no_oh_for_devices",
++ "local_1.8_client",
++ "remote_1.8_client",
++ "max_byte_per_rpc",
++ "64bit_qdata",
++ "fid_capability",
++ "oss_capability",
++ "early_lock_cancel",
++ "size_on_mds",
++ "adaptive_timeout",
++ "lru_resize",
++ "mds_mds_connection",
++ "real_conn",
++ "change_qunit_size",
++ "alt_checksum_algorithm",
++ "fid_is_enabled",
++ "version_recovery",
++ "pools",
++ NULL
++};
++
++int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ struct obd_device *obd = data;
++ __u64 mask = 1, flags;
++ int i, ret = 0;
++
++ LPROCFS_CLIMP_CHECK(obd);
++ flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
++ ret = snprintf(page, count, "flags="LPX64"\n", flags);
++ for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
++ if (flags & mask)
++ ret += snprintf(page + ret, count - ret, "%s\n",
++ obd_connect_names[i]);
++ }
++ if (flags & ~(mask - 1))
++ ret += snprintf(page + ret, count - ret,
++ "unknown flags "LPX64"\n", flags & ~(mask - 1));
++
++ LPROCFS_CLIMP_EXIT(obd);
++ return ret;
++}
++EXPORT_SYMBOL(lprocfs_rd_connect_flags);
++
++int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device*)data;
++
++ LASSERT(obd != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%u\n", obd->obd_num_exports);
++}
++
++int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_type *class = (struct obd_type*) data;
++
++ LASSERT(class != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%d\n", class->typ_refcnt);
++}
++
++int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
++{
++ int rc = 0;
++
++ LASSERT(obd != NULL);
++ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
++ LASSERT(obd->obd_type->typ_procroot != NULL);
++
++ obd->obd_proc_entry = lprocfs_register(obd->obd_name,
++ obd->obd_type->typ_procroot,
++ list, obd);
++ if (IS_ERR(obd->obd_proc_entry)) {
++ rc = PTR_ERR(obd->obd_proc_entry);
++ CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name);
++ obd->obd_proc_entry = NULL;
++ }
++ return rc;
++}
++
++int lprocfs_obd_cleanup(struct obd_device *obd)
++{
++ if (!obd)
++ return -EINVAL;
++ if (obd->obd_proc_exports_entry) {
++ /* Should be no exports left */
++ LASSERT(obd->obd_proc_exports_entry->subdir == NULL);
++ lprocfs_remove(&obd->obd_proc_exports_entry);
++ }
++ lprocfs_remove(&obd->obd_proc_entry);
++ return 0;
++}
++
++static void lprocfs_free_client_stats(struct nid_stat *client_stat)
++{
++ CDEBUG(D_CONFIG, "stat %p - data %p/%p/%p\n", client_stat,
++ client_stat->nid_proc, client_stat->nid_stats,
++ client_stat->nid_brw_stats);
++
++ LASSERTF(client_stat->nid_exp_ref_count == 0, "count %d\n",
++ client_stat->nid_exp_ref_count);
++
++ hlist_del_init(&client_stat->nid_hash);
++
++ if (client_stat->nid_proc)
++ lprocfs_remove(&client_stat->nid_proc);
++
++ if (client_stat->nid_stats)
++ lprocfs_free_stats(&client_stat->nid_stats);
++
++ if (client_stat->nid_brw_stats)
++ OBD_FREE_PTR(client_stat->nid_brw_stats);
++
++ if (client_stat->nid_ldlm_stats)
++ lprocfs_free_stats(&client_stat->nid_ldlm_stats);
++
++ OBD_FREE_PTR(client_stat);
++ return;
++
++}
++
++void lprocfs_free_per_client_stats(struct obd_device *obd)
++{
++ struct nid_stat *stat;
++ ENTRY;
++
++ /* we need an extra list because hash_exit is called too early */
++ /* no locking is needed because all clients have died */
++ while(!list_empty(&obd->obd_nid_stats)) {
++ stat = list_entry(obd->obd_nid_stats.next,
++ struct nid_stat, nid_list);
++ list_del_init(&stat->nid_list);
++ lprocfs_free_client_stats(stat);
++ }
++
++ EXIT;
++}
++
++struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
++ enum lprocfs_stats_flags flags)
++{
++ struct lprocfs_stats *stats;
++ unsigned int percpusize;
++ unsigned int i, j;
++ unsigned int num_cpu;
++
++ if (num == 0)
++ return NULL;
++
++ if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
++ num_cpu = 1;
++ else
++ num_cpu = num_possible_cpus();
++
++ OBD_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_cpu]));
++ if (stats == NULL)
++ return NULL;
++
++ if (flags & LPROCFS_STATS_FLAG_NOPERCPU) {
++ stats->ls_flags = flags;
++ spin_lock_init(&stats->ls_lock);
++ /* Use this lock only if there are no percpu areas */
++ } else {
++ stats->ls_flags = 0;
++ }
++
++ percpusize = offsetof(struct lprocfs_percpu, lp_cntr[num]);
++ if (num_cpu > 1)
++ percpusize = L1_CACHE_ALIGN(percpusize);
++
++ for (i = 0; i < num_cpu; i++) {
++ OBD_ALLOC(stats->ls_percpu[i], percpusize);
++ if (stats->ls_percpu[i] == NULL) {
++ for (j = 0; j < i; j++) {
++ OBD_FREE(stats->ls_percpu[j], percpusize);
++ stats->ls_percpu[j] = NULL;
++ }
++ break;
++ }
++ }
++ if (stats->ls_percpu[0] == NULL) {
++ OBD_FREE(stats, offsetof(typeof(*stats),
++ ls_percpu[num_cpu]));
++ return NULL;
++ }
++
++ stats->ls_num = num;
++ return stats;
++}
++
++void lprocfs_free_stats(struct lprocfs_stats **statsh)
++{
++ struct lprocfs_stats *stats = *statsh;
++ unsigned int num_cpu;
++ unsigned int percpusize;
++ unsigned int i;
++
++ if (!stats || (stats->ls_num == 0))
++ return;
++ *statsh = NULL;
++ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
++ num_cpu = 1;
++ else
++ num_cpu = num_possible_cpus();
++
++ percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
++ if (num_cpu > 1)
++ percpusize = L1_CACHE_ALIGN(percpusize);
++ for (i = 0; i < num_cpu; i++)
++ OBD_FREE(stats->ls_percpu[i], percpusize);
++ OBD_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_cpu]));
++}
++
++void lprocfs_clear_stats(struct lprocfs_stats *stats)
++{
++ struct lprocfs_counter *percpu_cntr;
++ int i, j;
++ unsigned int num_cpu;
++
++ num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU);
++
++ for (i = 0; i < num_cpu; i++) {
++ for (j = 0; j < stats->ls_num; j++) {
++ percpu_cntr = &(stats->ls_percpu[i])->lp_cntr[j];
++ atomic_inc(&percpu_cntr->lc_cntl.la_entry);
++ percpu_cntr->lc_count = 0;
++ percpu_cntr->lc_sum = 0;
++ percpu_cntr->lc_min = LC_MIN_INIT;
++ percpu_cntr->lc_max = 0;
++ percpu_cntr->lc_sumsquare = 0;
++ atomic_inc(&percpu_cntr->lc_cntl.la_exit);
++ }
++ }
++
++ lprocfs_stats_unlock(stats);
++}
++
++static ssize_t lprocfs_stats_seq_write(struct file *file, const char *buf,
++ size_t len, loff_t *off)
++{
++ struct seq_file *seq = file->private_data;
++ struct lprocfs_stats *stats = seq->private;
++
++ lprocfs_clear_stats(stats);
++
++ return len;
++}
++
++static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
++{
++ struct lprocfs_stats *stats = p->private;
++ /* return 1st cpu location */
++ return (*pos >= stats->ls_num) ? NULL :
++ &(stats->ls_percpu[0]->lp_cntr[*pos]);
++}
++
++static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
++{
++}
++
++static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
++{
++ struct lprocfs_stats *stats = p->private;
++ ++*pos;
++ return (*pos >= stats->ls_num) ? NULL :
++ &(stats->ls_percpu[0]->lp_cntr[*pos]);
++}
++
++/* seq file export of one lprocfs counter */
++static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
++{
++ struct lprocfs_stats *stats = p->private;
++ struct lprocfs_counter *cntr = v;
++ struct lprocfs_counter t, ret = { .lc_min = LC_MIN_INIT };
++ int i, idx, rc = 0;
++ unsigned int num_cpu;
++
++ if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) {
++ struct timeval now;
++ do_gettimeofday(&now);
++ rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
++ "snapshot_time", now.tv_sec, now.tv_usec);
++ if (rc < 0)
++ return rc;
++ }
++ idx = cntr - &(stats->ls_percpu[0])->lp_cntr[0];
++
++ if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
++ num_cpu = 1;
++ else
++ num_cpu = num_possible_cpus();
++
++ for (i = 0; i < num_cpu; i++) {
++ struct lprocfs_counter *percpu_cntr =
++ &(stats->ls_percpu[i])->lp_cntr[idx];
++ int centry;
++
++ do {
++ centry = atomic_read(&percpu_cntr->lc_cntl.la_entry);
++ t.lc_count = percpu_cntr->lc_count;
++ t.lc_sum = percpu_cntr->lc_sum;
++ t.lc_min = percpu_cntr->lc_min;
++ t.lc_max = percpu_cntr->lc_max;
++ t.lc_sumsquare = percpu_cntr->lc_sumsquare;
++ } while (centry != atomic_read(&percpu_cntr->lc_cntl.la_entry) &&
++ centry != atomic_read(&percpu_cntr->lc_cntl.la_exit));
++ ret.lc_count += t.lc_count;
++ ret.lc_sum += t.lc_sum;
++ if (t.lc_min < ret.lc_min)
++ ret.lc_min = t.lc_min;
++ if (t.lc_max > ret.lc_max)
++ ret.lc_max = t.lc_max;
++ ret.lc_sumsquare += t.lc_sumsquare;
++ }
++
++ if (ret.lc_count == 0)
++ goto out;
++
++ rc = seq_printf(p, "%-25s "LPD64" samples [%s]", cntr->lc_name,
++ ret.lc_count, cntr->lc_units);
++ if (rc < 0)
++ goto out;
++
++ if ((cntr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ret.lc_count > 0)) {
++ rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
++ ret.lc_min, ret.lc_max, ret.lc_sum);
++ if (rc < 0)
++ goto out;
++ if (cntr->lc_config & LPROCFS_CNTR_STDDEV)
++ rc = seq_printf(p, " "LPD64, ret.lc_sumsquare);
++ if (rc < 0)
++ goto out;
++ }
++ rc = seq_printf(p, "\n");
++ out:
++ return (rc < 0) ? rc : 0;
++}
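
lprocfs_stats_seq_show() merges the per-CPU counters without locking:
each counter carries la_entry/la_exit generation counts, and the reader
re-copies until its snapshot was not straddled by a writer. The writer
side (lprocfs_counter_add(), defined elsewhere) is assumed to bracket
its update like this sketch:

    /* hedged sketch of the writer half of the la_entry/la_exit protocol */
    atomic_inc(&percpu_cntr->lc_cntl.la_entry);     /* update begins */
    percpu_cntr->lc_count++;
    percpu_cntr->lc_sum += amount;
    /* ... min/max/sumsquare updates elided ... */
    atomic_inc(&percpu_cntr->lc_cntl.la_exit);      /* update ends */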
++
++struct seq_operations lprocfs_stats_seq_sops = {
++ start: lprocfs_stats_seq_start,
++ stop: lprocfs_stats_seq_stop,
++ next: lprocfs_stats_seq_next,
++ show: lprocfs_stats_seq_show,
++};
++
++static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
++{
++ struct proc_dir_entry *dp = PDE(inode);
++ struct seq_file *seq;
++ int rc;
++
++ LPROCFS_ENTRY_AND_CHECK(dp);
++ rc = seq_open(file, &lprocfs_stats_seq_sops);
++ if (rc) {
++ LPROCFS_EXIT();
++ return rc;
++ }
++
++ seq = file->private_data;
++ seq->private = dp->data;
++ return 0;
++}
++
++struct file_operations lprocfs_stats_seq_fops = {
++ .owner = THIS_MODULE,
++ .open = lprocfs_stats_seq_open,
++ .read = seq_read,
++ .write = lprocfs_stats_seq_write,
++ .llseek = seq_lseek,
++ .release = lprocfs_seq_release,
++};
++
++int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
++ struct lprocfs_stats *stats)
++{
++ struct proc_dir_entry *entry;
++ LASSERT(root != NULL);
++
++ entry = create_proc_entry(name, 0644, root);
++ if (entry == NULL)
++ return -ENOMEM;
++ entry->proc_fops = &lprocfs_stats_seq_fops;
++ entry->data = (void *)stats;
++ return 0;
++}
++
++void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
++ unsigned conf, const char *name, const char *units)
++{
++ struct lprocfs_counter *c;
++ int i;
++ unsigned int num_cpu;
++
++ LASSERT(stats != NULL);
++
++ num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU);
++
++ for (i = 0; i < num_cpu; i++) {
++ c = &(stats->ls_percpu[i]->lp_cntr[index]);
++ c->lc_config = conf;
++ c->lc_count = 0;
++ c->lc_sum = 0;
++ c->lc_min = LC_MIN_INIT;
++ c->lc_max = 0;
++ c->lc_name = name;
++ c->lc_units = units;
++ }
++
++ lprocfs_stats_unlock(stats);
++}
++EXPORT_SYMBOL(lprocfs_counter_init);
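
Taken together, lprocfs_alloc_stats(), lprocfs_register_stats() and
lprocfs_counter_init() form the usual three-step setup. A minimal
sketch (`parent` is an assumed proc directory):

    struct lprocfs_stats *stats = lprocfs_alloc_stats(1, 0);
    if (stats == NULL)
            return -ENOMEM;
    lprocfs_counter_init(stats, 0, LPROCFS_CNTR_AVGMINMAX,
                         "example", "reqs");
    rc = lprocfs_register_stats(parent, "stats", stats);
    if (rc)
            lprocfs_free_stats(&stats);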
++
++#define LPROCFS_OBD_OP_INIT(base, stats, op) \
++do { \
++ unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \
++ LASSERT(coffset < stats->ls_num); \
++ lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \
++} while (0)
++
++void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
++{
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, checkmd);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw_async);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, prep_async_page);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, reget_short_lock);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, release_short_lock);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_async_io);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_group_io);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, trigger_group_io);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_async_flags);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, teardown_async_page);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, match);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, join_lru);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quota_adjust_qunit);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_page_removal_cb);
++ LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb);
++ LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb);
++}
++
++void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
++{
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_ENQUEUE - LDLM_FIRST_OPC,
++ 0, "ldlm_enqueue", "reqs");
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_CONVERT - LDLM_FIRST_OPC,
++ 0, "ldlm_convert", "reqs");
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_CANCEL - LDLM_FIRST_OPC,
++ 0, "ldlm_cancel", "reqs");
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
++ 0, "ldlm_bl_callback", "reqs");
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
++ 0, "ldlm_cp_callback", "reqs");
++ lprocfs_counter_init(ldlm_stats,
++ LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
++ 0, "ldlm_gl_callback", "reqs");
++}
++
++int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
++{
++ struct lprocfs_stats *stats;
++ unsigned int num_stats;
++ int rc, i;
++
++ LASSERT(obd->obd_stats == NULL);
++ LASSERT(obd->obd_proc_entry != NULL);
++ LASSERT(obd->obd_cntr_base == 0);
++
++ num_stats = ((int)sizeof(*obd->obd_type->typ_ops) / sizeof(void *)) +
++ num_private_stats - 1 /* o_owner */;
++ stats = lprocfs_alloc_stats(num_stats, 0);
++ if (stats == NULL)
++ return -ENOMEM;
++
++ lprocfs_init_ops_stats(num_private_stats, stats);
++
++ for (i = num_private_stats; i < num_stats; i++) {
++ /* If this LBUGs, it is likely that an obd
++ * operation was added to struct obd_ops in
++ * <obd.h>, and that the corresponding line item
++ * LPROCFS_OBD_OP_INIT(.., .., opname)
++ * is missing from the list above. */
++ LASSERTF(stats->ls_percpu[0]->lp_cntr[i].lc_name != NULL,
++ "Missing obd_stat initializer obd_op "
++ "operation at offset %d.\n", i - num_private_stats);
++ }
++ rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
++ if (rc < 0) {
++ lprocfs_free_stats(&stats);
++ } else {
++ obd->obd_stats = stats;
++ obd->obd_cntr_base = num_private_stats;
++ }
++ return rc;
++}
++
++void lprocfs_free_obd_stats(struct obd_device *obd)
++{
++ if (obd->obd_stats)
++ lprocfs_free_stats(&obd->obd_stats);
++}
++
++int lprocfs_exp_rd_nid(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct obd_export *exp = (struct obd_export*)data;
++ LASSERT(exp != NULL);
++ *eof = 1;
++ return snprintf(page, count, "%s\n", obd_export_nid2str(exp));
++}
++
++struct exp_uuid_cb_data {
++ char *page;
++ int count;
++ int *eof;
++ int *len;
++};
++
++static void
++lprocfs_exp_rd_cb_data_init(struct exp_uuid_cb_data *cb_data, char *page,
++ int count, int *eof, int *len)
++{
++ cb_data->page = page;
++ cb_data->count = count;
++ cb_data->eof = eof;
++ cb_data->len = len;
++}
++
++void lprocfs_exp_print_uuid(void *obj, void *cb_data)
++{
++ struct obd_export *exp = (struct obd_export *)obj;
++ struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data;
++
++ if (exp->exp_nid_stats)
++ *data->len += snprintf((data->page + *data->len),
++ data->count, "%s\n",
++ obd_uuid2str(&exp->exp_client_uuid));
++}
++
++int lprocfs_exp_rd_uuid(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct nid_stat *stats = (struct nid_stat *)data;
++ struct exp_uuid_cb_data cb_data;
++ struct obd_device *obd = stats->nid_obd;
++ int len = 0;
++
++ *eof = 1;
++ page[0] = '\0';
++ lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
++ lustre_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
++ lprocfs_exp_print_uuid, &cb_data);
++ return (*cb_data.len);
++}
++
++void lprocfs_exp_print_hash(void *obj, void *cb_data)
++{
++ struct obd_export *exp = (struct obd_export *)obj;
++ struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data;
++ lustre_hash_t *lh;
++
++ lh = exp->exp_lock_hash;
++ if (lh) {
++ if (!*data->len)
++ *data->len += lustre_hash_debug_header(data->page,
++ data->count);
++
++ *data->len += lustre_hash_debug_str(lh, data->page +
++ *data->len,
++ data->count);
++ }
++}
++
++int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ struct nid_stat *stats = (struct nid_stat *)data;
++ struct exp_uuid_cb_data cb_data;
++ struct obd_device *obd = stats->nid_obd;
++ int len = 0;
++
++ *eof = 1;
++ page[0] = '\0';
++ lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len);
++ lustre_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
++ lprocfs_exp_print_hash, &cb_data);
++ return (*cb_data.len);
++}
++
++int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ *eof = 1;
++ return snprintf(page, count, "%s\n",
++ "Write into this file to clear all nid stats and "
++ "stale nid entries");
++}
++EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
++
++void lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
++{
++ struct nid_stat *stat = obj;
++ int i;
++
++ /* the object holds only the hash + iterate_all references;
++ * add/delete are blocked by the hash bucket lock */
++ CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count);
++ if (stat->nid_exp_ref_count == 2) {
++ hlist_del_init(&stat->nid_hash);
++ stat->nid_exp_ref_count--;
++ spin_lock(&stat->nid_obd->obd_nid_lock);
++ list_del_init(&stat->nid_list);
++ spin_unlock(&stat->nid_obd->obd_nid_lock);
++ list_add(&stat->nid_list, data);
++ EXIT;
++ return;
++ }
++ /* we still hold a reference to the object - only clear its data */
++ if (stat->nid_stats)
++ lprocfs_clear_stats(stat->nid_stats);
++
++ if (stat->nid_brw_stats) {
++ for (i = 0; i < BRW_LAST; i++)
++ lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]);
++ }
++ EXIT;
++ return;
++}
++
++int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ struct nid_stat *client_stat;
++ CFS_LIST_HEAD(free_list);
++
++ lustre_hash_for_each(obd->obd_nid_stats_hash,
++ lprocfs_nid_stats_clear_write_cb, &free_list);
++
++ while (!list_empty(&free_list)) {
++ client_stat = list_entry(free_list.next, struct nid_stat,
++ nid_list);
++ list_del_init(&client_stat->nid_list);
++ lprocfs_free_client_stats(client_stat);
++ }
++
++ return count;
++}
++EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
++
++int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
++{
++ struct nid_stat *new_stat, *old_stat;
++ struct nid_stat_uuid *new_ns_uuid;
++ struct obd_device *obd;
++ int rc = 0;
++ ENTRY;
++
++ *newnid = 0;
++
++ if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
++ !exp->exp_obd->obd_nid_stats_hash)
++ RETURN(-EINVAL);
++
++ /* do not test against zero because, as Eric says:
++ * you may only test a nid against another nid, or LNET_NID_ANY.
++ * Anything else is nonsense. */
++ if (!nid || *nid == LNET_NID_ANY)
++ RETURN(0);
++
++ obd = exp->exp_obd;
++
++ CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
++
++ OBD_ALLOC_PTR(new_stat);
++ if (new_stat == NULL)
++ RETURN(-ENOMEM);
++
++ OBD_ALLOC_PTR(new_ns_uuid);
++ if (new_ns_uuid == NULL) {
++ OBD_FREE_PTR(new_stat);
++ RETURN(-ENOMEM);
++ }
++ CFS_INIT_LIST_HEAD(&new_ns_uuid->ns_uuid_list);
++ strncpy(new_ns_uuid->ns_uuid.uuid, exp->exp_client_uuid.uuid,
++ sizeof(struct obd_uuid));
++
++ CFS_INIT_LIST_HEAD(&new_stat->nid_uuid_list);
++ new_stat->nid = *nid;
++ new_stat->nid_obd = exp->exp_obd;
++ /* must stay in the hash after the export is destroyed */
++ new_stat->nid_exp_ref_count = 1;
++
++ old_stat = lustre_hash_findadd_unique(obd->obd_nid_stats_hash,
++ nid, &new_stat->nid_hash);
++ CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
++ old_stat, libcfs_nid2str(*nid), new_stat->nid_exp_ref_count);
++
++ /* Return -EALREADY here so that we know the /proc
++ * entry has already been created */
++ if (old_stat != new_stat) {
++ struct nid_stat_uuid *tmp_uuid;
++ int found = 0;
++
++ exp->exp_nid_stats = old_stat;
++
++ /* We need to decrement the refcount if the uuid was
++ * already in our list */
++ spin_lock(&obd->obd_nid_lock);
++ list_for_each_entry(tmp_uuid, &old_stat->nid_uuid_list,
++ ns_uuid_list) {
++ if (tmp_uuid && obd_uuid_equals(&tmp_uuid->ns_uuid,
++ &exp->exp_client_uuid)){
++ found = 1;
++ --old_stat->nid_exp_ref_count;
++ break;
++ }
++ }
++
++ if (!found)
++ list_add(&new_ns_uuid->ns_uuid_list,
++ &old_stat->nid_uuid_list);
++ else
++ OBD_FREE_PTR(new_ns_uuid);
++ spin_unlock(&obd->obd_nid_lock);
++
++ GOTO(destroy_new, rc = -EALREADY);
++ }
++ /* not found - create */
++ new_stat->nid_proc = proc_mkdir(libcfs_nid2str(*nid),
++ obd->obd_proc_exports_entry);
++ if (!new_stat->nid_proc) {
++ CERROR("Error making export directory for"
++ " nid %s\n", libcfs_nid2str(*nid));
++ GOTO(destroy_new_ns, rc = -ENOMEM);
++ }
++
++ /* Add in uuid to our nid_stats list */
++ spin_lock(&obd->obd_nid_lock);
++ list_add(&new_ns_uuid->ns_uuid_list, &new_stat->nid_uuid_list);
++ spin_unlock(&obd->obd_nid_lock);
++
++ rc = lprocfs_add_simple(new_stat->nid_proc, "uuid",
++ lprocfs_exp_rd_uuid, NULL, new_stat);
++ if (rc) {
++ CWARN("Error adding the uuid file\n");
++ GOTO(destroy_new_ns, rc);
++ }
++
++ rc = lprocfs_add_simple(new_stat->nid_proc, "hash",
++ lprocfs_exp_rd_hash, NULL, new_stat);
++ if (rc) {
++ CWARN("Error adding the hash file\n");
++ lprocfs_remove(&new_stat->nid_proc);
++ GOTO(destroy_new_ns, rc);
++ }
++
++ exp->exp_nid_stats = new_stat;
++ *newnid = 1;
++ /* protect against concurrent list additions; no locking needed on destroy */
++ spin_lock(&obd->obd_nid_lock);
++ list_add(&new_stat->nid_list, &obd->obd_nid_stats);
++ spin_unlock(&obd->obd_nid_lock);
++
++ RETURN(rc);
++
++destroy_new_ns:
++ lustre_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
++ OBD_FREE_PTR(new_ns_uuid);
++
++destroy_new:
++ OBD_FREE_PTR(new_stat);
++ RETURN(rc);
++}
++
++int lprocfs_exp_cleanup(struct obd_export *exp)
++{
++ struct nid_stat *stat = exp->exp_nid_stats;
++ struct nid_stat_uuid *cursor, *tmp;
++ int found = 0;
++
++ if(!stat || !exp->exp_obd)
++ RETURN(0);
++
++ spin_lock(&exp->exp_obd->obd_nid_lock);
++ list_for_each_entry_safe(cursor, tmp,
++ &stat->nid_uuid_list,
++ ns_uuid_list) {
++ if (cursor && obd_uuid_equals(&cursor->ns_uuid,
++ &exp->exp_client_uuid)) {
++ found = 1;
++ list_del(&cursor->ns_uuid_list);
++ OBD_FREE_PTR(cursor);
++ --stat->nid_exp_ref_count;
++ CDEBUG(D_INFO, "Put stat %p - %d\n", stat,
++ stat->nid_exp_ref_count);
++ break;
++ }
++ }
++ spin_unlock(&exp->exp_obd->obd_nid_lock);
++ if (!found)
++ CERROR("obd_export's client uuid %s are not found in its "
++ "nid_stats list\n", exp->exp_client_uuid.uuid);
++
++ exp->exp_nid_stats = NULL;
++ lprocfs_free_stats(&exp->exp_ops_stats);
++
++ return 0;
++}
++
++int lprocfs_write_helper(const char *buffer, unsigned long count,
++ int *val)
++{
++ return lprocfs_write_frac_helper(buffer, count, val, 1);
++}
++
++int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
++ int *val, int mult)
++{
++ char kernbuf[20], *end, *pbuf;
++
++ if (count > (sizeof(kernbuf) - 1))
++ return -EINVAL;
++
++ if (copy_from_user(kernbuf, buffer, count))
++ return -EFAULT;
++
++ kernbuf[count] = '\0';
++ pbuf = kernbuf;
++ if (*pbuf == '-') {
++ mult = -mult;
++ pbuf++;
++ }
++
++ *val = (int)simple_strtoul(pbuf, &end, 10) * mult;
++ if (pbuf == end)
++ return -EINVAL;
++
++ if (end != NULL && *end == '.') {
++ int temp_val, pow = 1;
++ int i;
++
++ pbuf = end + 1;
++ if (strlen(pbuf) > 5)
++ pbuf[5] = '\0'; /* only allow 5 fractional digits */
++
++ temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
++
++ if (pbuf < end) {
++ for (i = 0; i < (end - pbuf); i++)
++ pow *= 10;
++
++ *val += temp_val / pow;
++ }
++ }
++ return 0;
++}
++
++int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
++ int mult)
++{
++ long decimal_val, frac_val;
++ int prtn;
++
++ if (count < 10)
++ return -EINVAL;
++
++ decimal_val = val / mult;
++ prtn = snprintf(buffer, count, "%ld", decimal_val);
++ frac_val = val % mult;
++
++ if (prtn < (count - 4) && frac_val > 0) {
++ long temp_frac;
++ int i, temp_mult = 1, frac_bits = 0;
++
++ temp_frac = frac_val * 10;
++ buffer[prtn++] = '.';
++ while (frac_bits < 2 && (temp_frac / mult) < 1 ) {
++ /* only keep 2 fractional digits */
++ buffer[prtn++] ='0';
++ temp_frac *= 10;
++ frac_bits++;
++ }
++ /*
++ Cases to consider:
++ 1. #echo x.00 > /proc/xxx output: x
++ 2. #echo x.0x > /proc/xxx output: x.0x
++ 3. #echo x.x0 > /proc/xxx output: x.x
++ 4. #echo x.xx > /proc/xxx output: x.xx
++ Only 2 fractional digits are kept.
++ */
++ for (i = 0; i < (5 - prtn); i++)
++ temp_mult *= 10;
++
++ frac_bits = min((int)count - prtn, 3 - frac_bits);
++ prtn += snprintf(buffer + prtn, frac_bits, "%ld",
++ frac_val * temp_mult / mult);
++
++ prtn--;
++ while (buffer[prtn] < '1' || buffer[prtn] > '9') {
++ prtn--;
++ if (buffer[prtn] == '.') {
++ prtn--;
++ break;
++ }
++ }
++ prtn++;
++ }
++ buffer[prtn++] = '\n';
++ return prtn;
++}
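
Hand-tracing lprocfs_read_frac_helper confirms the four cases the in-code comment lists; the worked examples below (assuming mult == 1000) are illustrative only:

    /* Worked examples for lprocfs_read_frac_helper with mult == 1000:
     *   val = 3000 -> "3\n"      (case 1: a fraction of zero is dropped)
     *   val = 3050 -> "3.05\n"   (case 2: the leading zero is kept)
     *   val = 3500 -> "3.5\n"    (case 3: the trailing zero is stripped)
     *   val = 3550 -> "3.55\n"   (case 4: both digits are significant)
     */
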
++
++int lprocfs_write_u64_helper(const char *buffer, unsigned long count, __u64 *val)
++{
++ return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
++}
++
++int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
++ __u64 *val, int mult)
++{
++ char kernbuf[22], *end, *pbuf;
++ __u64 whole, frac = 0, units;
++ unsigned frac_d = 1;
++
++ if (count > (sizeof(kernbuf) - 1))
++ return -EINVAL;
++
++ if (copy_from_user(kernbuf, buffer, count))
++ return -EFAULT;
++
++ kernbuf[count] = '\0';
++ pbuf = kernbuf;
++ if (*pbuf == '-') {
++ mult = -mult;
++ pbuf++;
++ }
++
++ whole = simple_strtoull(pbuf, &end, 10);
++ if (pbuf == end)
++ return -EINVAL;
++
++ if (end != NULL && *end == '.') {
++ int i;
++ pbuf = end + 1;
++
++ /* need to limit frac_d to a __u32 */
++ if (strlen(pbuf) > 10)
++ pbuf[10] = '\0';
++
++ frac = simple_strtoull(pbuf, &end, 10);
++ /* count decimal places */
++ for (i = 0; i < (end - pbuf); i++)
++ frac_d *= 10;
++ }
++
++ units = 1;
++ switch (*end) {
++ case 'p': case 'P':
++ units <<= 10; /* fall through */
++ case 't': case 'T':
++ units <<= 10; /* fall through */
++ case 'g': case 'G':
++ units <<= 10; /* fall through */
++ case 'm': case 'M':
++ units <<= 10; /* fall through */
++ case 'k': case 'K':
++ units <<= 10;
++ }
++ /* An explicit unit suffix overrides the multiplier; units is 1
++ when no suffix was given, so only override for units > 1. */
++ if (units > 1)
++ mult = mult < 0 ? -units : units;
++
++ frac *= mult;
++ do_div(frac, frac_d);
++ *val = whole * mult + frac;
++ return 0;
++}
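
The cascaded shifts above build binary unit multipliers through deliberate case fall-through. A self-contained sketch of the same idea, with the hypothetical helper name unit_multiplier:

    #include <assert.h>

    /* Stand-alone version of the suffix handling above: each case
     * falls through, accumulating one <<10 per size step. */
    static unsigned long long unit_multiplier(char suffix)
    {
            unsigned long long units = 1;

            switch (suffix) {
            case 'p': case 'P': units <<= 10; /* fall through */
            case 't': case 'T': units <<= 10; /* fall through */
            case 'g': case 'G': units <<= 10; /* fall through */
            case 'm': case 'M': units <<= 10; /* fall through */
            case 'k': case 'K': units <<= 10;
            }
            return units;
    }

    int main(void)
    {
            assert(unit_multiplier('k') == 1ULL << 10);
            assert(unit_multiplier('G') == 1ULL << 30);
            assert(unit_multiplier('P') == 1ULL << 50);
            assert(unit_multiplier('x') == 1);  /* no suffix: multiplier 1 */
            return 0;
    }
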
++
++int lprocfs_seq_create(cfs_proc_dir_entry_t *parent,
++ char *name, mode_t mode,
++ struct file_operations *seq_fops, void *data)
++{
++ struct proc_dir_entry *entry;
++ ENTRY;
++
++ entry = create_proc_entry(name, mode, parent);
++ if (entry == NULL)
++ RETURN(-ENOMEM);
++ entry->proc_fops = seq_fops;
++ entry->data = data;
++
++ RETURN(0);
++}
++EXPORT_SYMBOL(lprocfs_seq_create);
++
++__inline__ int lprocfs_obd_seq_create(struct obd_device *dev, char *name,
++ mode_t mode,
++ struct file_operations *seq_fops,
++ void *data)
++{
++ return (lprocfs_seq_create(dev->obd_proc_entry, name,
++ mode, seq_fops, data));
++}
++EXPORT_SYMBOL(lprocfs_obd_seq_create);
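
For context, a caller of lprocfs_obd_seq_create() would pair it with the usual 2.6-era seq_file boilerplate. A minimal sketch — the foo_* names are hypothetical, not part of this patch:

    #include <linux/seq_file.h>
    #include <linux/proc_fs.h>

    /* Hypothetical single-record seq_file handler. */
    static int foo_seq_show(struct seq_file *m, void *v)
    {
            struct obd_device *obd = m->private;

            seq_printf(m, "%s\n", obd->obd_name);
            return 0;
    }

    static int foo_seq_open(struct inode *inode, struct file *file)
    {
            /* entry->data set by lprocfs_seq_create() is reachable via PDE() */
            return single_open(file, foo_seq_show, PDE(inode)->data);
    }

    static struct file_operations foo_seq_fops = {
            .open    = foo_seq_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    /* ... in obd setup:
     *   lprocfs_obd_seq_create(obd, "foo", 0444, &foo_seq_fops, obd); */
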
++
++void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
++{
++ if (value >= OBD_HIST_MAX)
++ value = OBD_HIST_MAX - 1;
++
++ spin_lock(&oh->oh_lock);
++ oh->oh_buckets[value]++;
++ spin_unlock(&oh->oh_lock);
++}
++EXPORT_SYMBOL(lprocfs_oh_tally);
++
++void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
++{
++ unsigned int val;
++
++ for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++)
++ ;
++
++ lprocfs_oh_tally(oh, val);
++}
++EXPORT_SYMBOL(lprocfs_oh_tally_log2);
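
lprocfs_oh_tally_log2() picks the smallest val with (1 << val) >= value, i.e. the bucket ceil(log2(value)); a few traced examples:

    /* Bucket chosen by lprocfs_oh_tally_log2:
     *   value 1    -> bucket 0
     *   value 2    -> bucket 1
     *   value 3    -> bucket 2
     *   value 4096 -> bucket 12
     * Anything at or past OBD_HIST_MAX is clamped into the last
     * bucket by lprocfs_oh_tally(). */
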
++
++unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
++{
++ unsigned long ret = 0;
++ int i;
++
++ for (i = 0; i < OBD_HIST_MAX; i++)
++ ret += oh->oh_buckets[i];
++ return ret;
++}
++EXPORT_SYMBOL(lprocfs_oh_sum);
++
++void lprocfs_oh_clear(struct obd_histogram *oh)
++{
++ spin_lock(&oh->oh_lock);
++ memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets));
++ spin_unlock(&oh->oh_lock);
++}
++EXPORT_SYMBOL(lprocfs_oh_clear);
++
++int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ struct obd_device *obd = data;
++ int len = 0, size;
++
++ LASSERT(obd != NULL);
++ LASSERT(count >= 0);
++
++ /* Set the start of the user data returned to page + off,
++ since the user may have requested to read much less than
++ the full status output. */
++ *start = page + off;
++
++ /* We know a full page was allocated for us, and that this
++ function never needs to write more than a page, so we can
++ safely truncate at CFS_PAGE_SIZE. */
++ size = min(count + (int)off + 1, (int)CFS_PAGE_SIZE);
++
++ /* Initialize the page */
++ memset(page, 0, size);
++
++ if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0)
++ goto out;
++ if (obd->obd_max_recoverable_clients == 0) {
++ if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0)
++ goto out;
++
++ goto fclose;
++ }
++
++ /* sampled unlocked, but really... */
++ if (obd->obd_recovering == 0) {
++ if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,
++ "recovery_start: %lu\n",
++ obd->obd_recovery_start) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,
++ "recovery_duration: %lu\n",
++ obd->obd_recovery_end -
++ obd->obd_recovery_start) <= 0)
++ goto out;
++ /* Number of clients that have completed recovery */
++ if (lprocfs_obd_snprintf(&page, size, &len,
++ "completed_clients: %d/%d\n",
++ obd->obd_max_recoverable_clients -
++ obd->obd_recoverable_clients,
++ obd->obd_max_recoverable_clients) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,
++ "replayed_requests: %d\n",
++ obd->obd_replayed_requests) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,
++ "last_transno: "LPD64"\n",
++ obd->obd_next_recovery_transno - 1) <= 0)
++ goto out;
++ goto fclose;
++ }
++
++ if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n",
++ obd->obd_recovery_start) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n",
++ cfs_time_current_sec() >= obd->obd_recovery_end ? 0 :
++ obd->obd_recovery_end - cfs_time_current_sec()) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,"connected_clients: %d/%d\n",
++ obd->obd_connected_clients,
++ obd->obd_max_recoverable_clients) <= 0)
++ goto out;
++ /* Number of clients that have completed recovery */
++ if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d/%d\n",
++ obd->obd_max_recoverable_clients -
++ obd->obd_recoverable_clients,
++ obd->obd_max_recoverable_clients) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d/??\n",
++ obd->obd_replayed_requests) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n",
++ obd->obd_requests_queued_for_recovery) <= 0)
++ goto out;
++ if (lprocfs_obd_snprintf(&page, size, &len, "next_transno: "LPD64"\n",
++ obd->obd_next_recovery_transno) <= 0)
++ goto out;
++
++fclose:
++ *eof = 1;
++out:
++ return min(count, len - (int)off);
++}
++EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status);
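
The handler above follows the classic read_proc contract: fill the page, set *start and *eof, and return the number of bytes still owed. For output that always fits in one page, the common simpler pattern looks like this sketch (foo_read_proc is hypothetical; when *start is left untouched, the proc layer handles the offset itself):

    static int foo_read_proc(char *page, char **start, off_t off,
                             int count, int *eof, void *data)
    {
            struct obd_device *obd = data;
            int len;

            /* Produce everything in one shot and signal completion. */
            len = snprintf(page, count, "%s\n", obd->obd_name);
            *eof = 1;
            return len;
    }
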
++
++int lprocfs_obd_rd_hash(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ struct obd_device *obd = data;
++ int c = 0;
++
++ if (obd == NULL)
++ return 0;
++
++ c += lustre_hash_debug_header(page, count);
++ c += lustre_hash_debug_str(obd->obd_uuid_hash, page + c, count - c);
++ c += lustre_hash_debug_str(obd->obd_nid_hash, page + c, count - c);
++ c += lustre_hash_debug_str(obd->obd_nid_stats_hash, page+c, count-c);
++
++ return c;
++}
++EXPORT_SYMBOL(lprocfs_obd_rd_hash);
++
++#ifdef CRAY_XT3
++int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ LASSERT(obd != NULL);
++
++ return snprintf(page, count, "%lu\n",
++ obd->obd_recovery_max_time);
++}
++EXPORT_SYMBOL(lprocfs_obd_rd_recovery_maxtime);
++
++int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ struct obd_device *obd = (struct obd_device *)data;
++ int val, rc;
++ LASSERT(obd != NULL);
++
++ rc = lprocfs_write_helper(buffer, count, &val);
++ if (rc)
++ return rc;
++
++ obd->obd_recovery_max_time = val;
++ return count;
++}
++EXPORT_SYMBOL(lprocfs_obd_wr_recovery_maxtime);
++#endif /* CRAY_XT3 */
++
++EXPORT_SYMBOL(lprocfs_register);
++EXPORT_SYMBOL(lprocfs_srch);
++EXPORT_SYMBOL(lprocfs_remove);
++EXPORT_SYMBOL(lprocfs_add_vars);
++EXPORT_SYMBOL(lprocfs_obd_setup);
++EXPORT_SYMBOL(lprocfs_obd_cleanup);
++EXPORT_SYMBOL(lprocfs_add_simple);
++EXPORT_SYMBOL(lprocfs_free_per_client_stats);
++EXPORT_SYMBOL(lprocfs_alloc_stats);
++EXPORT_SYMBOL(lprocfs_free_stats);
++EXPORT_SYMBOL(lprocfs_clear_stats);
++EXPORT_SYMBOL(lprocfs_register_stats);
++EXPORT_SYMBOL(lprocfs_init_ops_stats);
++EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
++EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
++EXPORT_SYMBOL(lprocfs_free_obd_stats);
++EXPORT_SYMBOL(lprocfs_exp_setup);
++EXPORT_SYMBOL(lprocfs_exp_cleanup);
++
++EXPORT_SYMBOL(lprocfs_rd_u64);
++EXPORT_SYMBOL(lprocfs_rd_atomic);
++EXPORT_SYMBOL(lprocfs_wr_atomic);
++EXPORT_SYMBOL(lprocfs_rd_uint);
++EXPORT_SYMBOL(lprocfs_wr_uint);
++EXPORT_SYMBOL(lprocfs_rd_uuid);
++EXPORT_SYMBOL(lprocfs_rd_name);
++EXPORT_SYMBOL(lprocfs_rd_fstype);
++EXPORT_SYMBOL(lprocfs_rd_server_uuid);
++EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
++EXPORT_SYMBOL(lprocfs_rd_num_exports);
++EXPORT_SYMBOL(lprocfs_rd_numrefs);
++EXPORT_SYMBOL(lprocfs_at_hist_helper);
++EXPORT_SYMBOL(lprocfs_rd_import);
++EXPORT_SYMBOL(lprocfs_rd_timeouts);
++EXPORT_SYMBOL(lprocfs_rd_blksize);
++EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
++EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
++EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
++EXPORT_SYMBOL(lprocfs_rd_filestotal);
++EXPORT_SYMBOL(lprocfs_rd_filesfree);
++
++EXPORT_SYMBOL(lprocfs_write_helper);
++EXPORT_SYMBOL(lprocfs_write_frac_helper);
++EXPORT_SYMBOL(lprocfs_read_frac_helper);
++EXPORT_SYMBOL(lprocfs_write_u64_helper);
++EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
++#endif /* LPROCFS*/
+diff -urNad lustre~/lustre/ptlrpc/service.c lustre/lustre/ptlrpc/service.c
+--- lustre~/lustre/ptlrpc/service.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/ptlrpc/service.c 2009-03-12 11:02:51.000000000 +0100
+@@ -1501,7 +1501,7 @@
+ cfs_daemonize(name);
+ exit_fs(cfs_current());
+ current->fs = fs;
+- ll_set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
++ ll_set_fs_pwd(current->fs, cfs_fs_mnt(init_task.fs), cfs_fs_pwd(init_task.fs));
+ }
+
+ static void
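
For reference, cfs_fs_mnt()/cfs_fs_pwd() exist because kernels around 2.6.25 replaced the separate pwdmnt/pwd members of fs_struct with a single struct path. A plausible shape for the compat shims — the HAVE_FS_STRUCT_USE_PATH symbol is an assumption, not verified against this patch:

    /* Sketch of the libcfs compat accessors, assuming a configure
     * probe named HAVE_FS_STRUCT_USE_PATH detects the new layout. */
    #ifdef HAVE_FS_STRUCT_USE_PATH
    # define cfs_fs_pwd(fs) ((fs)->pwd.dentry)
    # define cfs_fs_mnt(fs) ((fs)->pwd.mnt)
    #else
    # define cfs_fs_pwd(fs) ((fs)->pwd)
    # define cfs_fs_mnt(fs) ((fs)->pwdmnt)
    #endif
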
diff --git a/debian/patches/posix_acl.dpatch b/debian/patches/posix_acl.dpatch
index 1ab99e0..c837e3b 100755
--- a/debian/patches/posix_acl.dpatch
+++ b/debian/patches/posix_acl.dpatch
@@ -5,22 +5,21 @@
## DP: Patch from Q-Leap Networks
@DPATCH@
-
-diff --git a/lustre/llite/file.c b/lustre/llite/file.c
---- a/lustre/llite/file.c
-+++ b/lustre/llite/file.c
-@@ -27,6 +27,7 @@
+diff -urNad lustre~/lustre/llite/file.c lustre/lustre/llite/file.c
+--- lustre~/lustre/llite/file.c 2009-03-12 10:33:45.000000000 +0100
++++ lustre/lustre/llite/file.c 2009-03-12 10:41:51.000000000 +0100
+@@ -45,6 +45,7 @@
#include <lustre_lite.h>
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/posix_acl.h>
- #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
- #include <linux/lustre_compat25.h>
- #endif
-diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c
---- a/lustre/llite/xattr.c
-+++ b/lustre/llite/xattr.c
-@@ -23,6 +23,7 @@
+ #include "llite_internal.h"
+ #include <lustre/ll_fiemap.h>
+
+diff -urNad lustre~/lustre/llite/xattr.c lustre/lustre/llite/xattr.c
+--- lustre~/lustre/llite/xattr.c 2009-03-12 10:27:57.000000000 +0100
++++ lustre/lustre/llite/xattr.c 2009-03-12 10:40:31.000000000 +0100
+@@ -38,6 +38,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp_lock.h>
@@ -28,10 +27,10 @@ diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c
#define DEBUG_SUBSYSTEM S_LLITE
-diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c
---- a/lustre/mdc/mdc_locks.c
-+++ b/lustre/mdc/mdc_locks.c
-@@ -32,6 +32,7 @@
+diff -urNad lustre~/lustre/mdc/mdc_locks.c lustre/lustre/mdc/mdc_locks.c
+--- lustre~/lustre/mdc/mdc_locks.c 2009-03-12 10:27:57.000000000 +0100
++++ lustre/lustre/mdc/mdc_locks.c 2009-03-12 10:40:31.000000000 +0100
+@@ -44,6 +44,7 @@
# include <linux/pagemap.h>
# include <linux/miscdevice.h>
# include <linux/init.h>
@@ -39,10 +38,10 @@ diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c
#else
# include <liblustre.h>
#endif
-diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
---- a/lustre/mdc/mdc_request.c
-+++ b/lustre/mdc/mdc_request.c
-@@ -32,6 +32,7 @@
+diff -urNad lustre~/lustre/mdc/mdc_request.c lustre/lustre/mdc/mdc_request.c
+--- lustre~/lustre/mdc/mdc_request.c 2009-03-12 10:27:57.000000000 +0100
++++ lustre/lustre/mdc/mdc_request.c 2009-03-12 10:40:31.000000000 +0100
+@@ -44,6 +44,7 @@
# include <linux/pagemap.h>
# include <linux/miscdevice.h>
# include <linux/init.h>
@@ -50,14 +49,14 @@ diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c
#else
# include <liblustre.h>
#endif
-diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
---- a/lustre/mds/handler.c
-+++ b/lustre/mds/handler.c
-@@ -40,6 +40,7 @@
+diff -urNad lustre~/lustre/mds/handler.c lustre/lustre/mds/handler.c
+--- lustre~/lustre/mds/handler.c 2009-03-12 10:40:31.000000000 +0100
++++ lustre/lustre/mds/handler.c 2009-03-12 10:42:18.000000000 +0100
+@@ -52,6 +52,7 @@
#include <linux/random.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+#include <linux/posix_acl_xattr.h>
- #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
# include <linux/smp_lock.h>
# include <linux/buffer_head.h>
+ # include <linux/workqueue.h>
diff --git a/debian/patches/remove-set_tunables.dpatch b/debian/patches/remove-set_tunables.dpatch
index c7ff1c9..471e062 100755
--- a/debian/patches/remove-set_tunables.dpatch
+++ b/debian/patches/remove-set_tunables.dpatch
@@ -5,10 +5,10 @@
## DP: removed set tunables from mount.lustre since it doesn't work under etch
@DPATCH@
-diff -urNad lustre-1.6.6~/lustre/utils/mount_lustre.c lustre-1.6.6/lustre/utils/mount_lustre.c
---- lustre-1.6.6~/lustre/utils/mount_lustre.c 2008-11-26 10:50:51.000000000 +0100
-+++ lustre-1.6.6/lustre/utils/mount_lustre.c 2008-11-26 10:56:17.000000000 +0100
-@@ -305,7 +305,10 @@
+diff -urNad lustre~/lustre/utils/mount_lustre.c lustre/lustre/utils/mount_lustre.c
+--- lustre~/lustre/utils/mount_lustre.c 2009-03-12 10:32:27.000000000 +0100
++++ lustre/lustre/utils/mount_lustre.c 2009-03-12 10:44:12.000000000 +0100
+@@ -320,7 +320,10 @@
/* This is to tune the kernel for good SCSI performance.
* For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
@@ -20,7 +20,7 @@ diff -urNad lustre-1.6.6~/lustre/utils/mount_lustre.c lustre-1.6.6/lustre/utils/
int set_tunables(char *source, int src_len)
{
glob_t glob_info;
-@@ -359,10 +362,10 @@
+@@ -374,10 +377,10 @@
if (rc != ENOENT)
return rc;
@@ -33,7 +33,7 @@ diff -urNad lustre-1.6.6~/lustre/utils/mount_lustre.c lustre-1.6.6/lustre/utils/
dev = real_path + src_len - 1;
while (dev > real_path) {
if (isdigit(*dev))
-@@ -434,7 +437,7 @@
+@@ -449,7 +452,7 @@
out:
globfree(&glob_info);
return rc;
@@ -42,12 +42,13 @@ diff -urNad lustre-1.6.6~/lustre/utils/mount_lustre.c lustre-1.6.6/lustre/utils/
int main(int argc, char *const argv[])
{
-@@ -567,10 +570,10 @@
+@@ -582,11 +585,11 @@
printf("mounting device %s at %s, flags=%#x options=%s\n",
source, target, flags, optcopy);
-- if (set_tunables(source, strlen(source)) && verbose)
-+ /*if (set_tunables(source, strlen(source)) && verbose)
+- if (!strstr(usource, ":/") && set_tunables(source, strlen(source)) &&
++ /*if (!strstr(usource, ":/") && set_tunables(source, strlen(source)) &&
+ verbose)
fprintf(stderr, "%s: unable to set tunables for %s"
" (may cause reduced IO performance)\n",
- argv[0], source);
--
Lustre Debian Packaging