All of the above --- fs/Kconfig | 14 fs/exec.c | 12 fs/namei.c | 21 + fs/namespace.c | 38 ++ fs/nfs/Makefile | 3 fs/nfs/delegation.c | 25 + fs/nfs/delegation.h | 1 fs/nfs/dir.c | 70 ++-- fs/nfs/direct.c | 10 fs/nfs/file.c | 22 - fs/nfs/idmap.c | 1 fs/nfs/inode.c | 612 ++++++++++++++++++++++++++++++++++++++-- fs/nfs/namespace.c | 111 +++++++ fs/nfs/nfs2xdr.c | 3 fs/nfs/nfs3proc.c | 2 fs/nfs/nfs3xdr.c | 3 fs/nfs/nfs4_fs.h | 29 + fs/nfs/nfs4proc.c | 425 ++++++++++++++++----------- fs/nfs/nfs4state.c | 81 ++++- fs/nfs/nfs4xdr.c | 178 +++++++++++ fs/nfs/nfs_iostat.h | 87 +++++ fs/nfs/pagelist.c | 7 fs/nfs/proc.c | 2 fs/nfs/read.c | 9 fs/nfs/write.c | 11 fs/open.c | 14 fs/proc/base.c | 40 ++ fs/super.c | 22 - include/linux/fs.h | 1 include/linux/iosem.h | 110 +++++++ include/linux/mount.h | 5 include/linux/namei.h | 10 include/linux/nfs4.h | 1 include/linux/nfs_fs.h | 18 + include/linux/nfs_fs_sb.h | 7 include/linux/nfs_page.h | 1 include/linux/nfs_xdr.h | 55 +++ include/linux/sunrpc/auth_gss.h | 3 include/linux/sunrpc/clnt.h | 11 include/linux/sunrpc/sched.h | 2 include/linux/sunrpc/xdr.h | 1 include/linux/sunrpc/xprt.h | 2 lib/Makefile | 2 lib/iosem.c | 177 +++++++++++ net/sunrpc/auth_gss/auth_gss.c | 323 +++++++++++++++++++-- net/sunrpc/clnt.c | 65 +++- net/sunrpc/pmap_clnt.c | 2 net/sunrpc/rpc_pipe.c | 6 net/sunrpc/sched.c | 2 net/sunrpc/xdr.c | 28 + net/sunrpc/xprt.c | 12 51 files changed, 2359 insertions(+), 338 deletions(-) Index: linux-2.6.13-rc1/fs/Kconfig =================================================================== --- linux-2.6.13-rc1.orig/fs/Kconfig +++ linux-2.6.13-rc1/fs/Kconfig @@ -1526,6 +1526,20 @@ config RPCSEC_GSS_SPKM3 If unsure, say N. +config RPCSEC_GSS_KEYRING + bool "Secure RPC: keyring support (EXPERIMENTAL)" + depends on SUNRPC_GSS && KEYS && EXPERIMENTAL + help + Use the new RPCSEC_GSS upcall mechanism based on keyrings. 
+ This allows individual threads, processes or groups of + processes to specify their own authentication tokens, + providing much the same functionality that AFS pags used to. + + Note: requires the new helper program /sbin/request-key, as + well as an updated rpc.gssd daemon in order to work. + + If unsure, say N + config SMB_FS tristate "SMB file system support (to mount Windows shares etc.)" depends on INET Index: linux-2.6.13-rc1/fs/exec.c =================================================================== --- linux-2.6.13-rc1.orig/fs/exec.c +++ linux-2.6.13-rc1/fs/exec.c @@ -126,7 +126,7 @@ asmlinkage long sys_uselib(const char __ struct nameidata nd; int error; - nd.intent.open.flags = FMODE_READ; + nd_init_open_intent(&nd, FMODE_READ, 0); error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); if (error) goto out; @@ -139,7 +139,7 @@ asmlinkage long sys_uselib(const char __ if (error) goto exit; - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = nd_open_file(&nd, O_RDONLY); error = PTR_ERR(file); if (IS_ERR(file)) goto out; @@ -167,7 +167,7 @@ asmlinkage long sys_uselib(const char __ out: return error; exit: - path_release(&nd); + path_release_open_intent(&nd); goto out; } @@ -495,7 +495,7 @@ struct file *open_exec(const char *name) int err; struct file *file; - nd.intent.open.flags = FMODE_READ; + nd_init_open_intent(&nd, FMODE_READ, 0); err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); file = ERR_PTR(err); @@ -509,7 +509,7 @@ struct file *open_exec(const char *name) err = -EACCES; file = ERR_PTR(err); if (!err) { - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = nd_open_file(&nd, O_RDONLY); if (!IS_ERR(file)) { err = deny_write_access(file); if (err) { @@ -521,7 +521,7 @@ out: return file; } } - path_release(&nd); + path_release_open_intent(&nd); } goto out; } Index: linux-2.6.13-rc1/fs/namei.c =================================================================== --- linux-2.6.13-rc1.orig/fs/namei.c +++ 
linux-2.6.13-rc1/fs/namei.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -318,6 +319,19 @@ void path_release_on_umount(struct namei } /* + * Open intents have to release any file pointer that was allocated + * but not used by the VFS. + */ +void path_release_open_intent(struct nameidata *nd) +{ + if ((nd->flags & LOOKUP_OPEN) && nd->intent.open.file != NULL) { + fput(nd->intent.open.file); + nd->intent.open.file = NULL; + } + path_release(nd); +} + +/* * Internal lookup() using the new generic dcache. * SMP-safe */ @@ -733,6 +747,7 @@ static fastcall int __link_path_walk(con struct qstr this; unsigned int c; + nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode, nd); if (err == -EAGAIN) { err = permission(inode, MAY_EXEC, nd); @@ -785,7 +800,6 @@ static fastcall int __link_path_walk(con if (err < 0) break; } - nd->flags |= LOOKUP_CONTINUE; /* This does the actual lookups.. */ err = do_lookup(nd, &this, &next); if (err) @@ -1426,8 +1440,7 @@ int open_namei(const char * pathname, in acc_mode |= MAY_APPEND; /* Fill in the open() intent data */ - nd->intent.open.flags = flag; - nd->intent.open.create_mode = mode; + nd_init_open_intent(nd, flag, mode); /* * The simplest case - just a plain lookup. 
@@ -1523,7 +1536,7 @@ exit_dput: if (nd->mnt != path.mnt) mntput(path.mnt); exit: - path_release(nd); + path_release_open_intent(nd); return error; do_link: Index: linux-2.6.13-rc1/fs/namespace.c =================================================================== --- linux-2.6.13-rc1.orig/fs/namespace.c +++ linux-2.6.13-rc1/fs/namespace.c @@ -265,6 +265,44 @@ struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int show_vfsstat(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = v; + int err = 0; + + /* device */ + if (mnt->mnt_devname) { + seq_puts(m, "device "); + mangle(m, mnt->mnt_devname); + } else + seq_puts(m, "no device"); + + /* mount point */ + seq_puts(m, " mounted on "); + seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); + seq_putc(m, ' '); + + /* file system type */ + seq_puts(m, "with fstype "); + mangle(m, mnt->mnt_sb->s_type->name); + + /* optional statistics */ + if (mnt->mnt_sb->s_op->show_stats) { + seq_putc(m, ' '); + err = mnt->mnt_sb->s_op->show_stats(m, mnt); + } + + seq_putc(m, '\n'); + return err; +} + +struct seq_operations mountstats_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_vfsstat, +}; + /** * may_umount_tree - check if a mount tree is busy * @mnt: root of mount tree Index: linux-2.6.13-rc1/fs/nfs/Makefile =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/Makefile +++ linux-2.6.13-rc1/fs/nfs/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := dir.o file.o inode.o nfs2xdr.o pagelist.o \ - proc.o read.o symlink.o unlink.o write.o + proc.o read.o symlink.o unlink.o write.o \ + namespace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o Index: linux-2.6.13-rc1/fs/nfs/delegation.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/delegation.c +++ linux-2.6.13-rc1/fs/nfs/delegation.c @@ -195,6 
+195,31 @@ restart: } /* + * Return a delegation for clear_inode() + */ +void nfs_inode_clear_delegation(struct inode *inode) +{ + struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + + down_read(&clp->cl_sem); + spin_lock(&clp->cl_lock); + delegation = nfsi->delegation; + if (delegation != NULL) { + list_del_init(&delegation->super_list); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + } + spin_unlock(&clp->cl_lock); + up_read(&clp->cl_sem); + if (delegation != NULL) { + nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); + nfs_free_delegation(delegation); + } +} + +/* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. */ void nfs_handle_cb_pathdown(struct nfs4_client *clp) Index: linux-2.6.13-rc1/fs/nfs/delegation.h =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/delegation.h +++ linux-2.6.13-rc1/fs/nfs/delegation.h @@ -27,6 +27,7 @@ int nfs_inode_set_delegation(struct inod void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_inode_clear_delegation(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); void nfs_return_all_delegations(struct super_block *sb); Index: linux-2.6.13-rc1/fs/nfs/dir.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/dir.c +++ linux-2.6.13-rc1/fs/nfs/dir.c @@ -27,6 +27,7 @@ #include #include #include +#include "nfs_iostat.h" #include #include #include @@ -503,6 +504,8 @@ static int nfs_readdir(struct file *filp struct nfs_fattr fattr; long res; + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + lock_kernel(); res = 
nfs_revalidate_inode(NFS_SERVER(inode), inode); @@ -710,6 +713,7 @@ static int nfs_lookup_revalidate(struct parent = dget_parent(dentry); lock_kernel(); dir = parent->d_inode; + nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; if (!inode) { @@ -830,6 +834,17 @@ int nfs_is_exclusive_create(struct inode return (nd->intent.open.flags & O_EXCL) != 0; } +static inline int nfs_reval_fsid(struct inode *dir, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct nfs_server *server = NFS_SERVER(dir); + + if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) + /* Revalidate fsid on root dir */ + return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode); + return 0; +} + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct dentry *res; @@ -840,6 +855,7 @@ static struct dentry *nfs_lookup(struct dfprintk(VFS, "NFS: lookup(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); res = ERR_PTR(-ENAMETOOLONG); if (dentry->d_name.len > NFS_SERVER(dir)->namelen) @@ -851,10 +867,8 @@ static struct dentry *nfs_lookup(struct lock_kernel(); /* Revalidate parent directory attribute cache */ error = nfs_revalidate_inode(NFS_SERVER(dir), dir); - if (error < 0) { - res = ERR_PTR(error); - goto out_unlock; - } + if (error < 0) + goto out_err; /* If we're doing an exclusive create, optimize away the lookup */ if (nfs_is_exclusive_create(dir, nd)) @@ -863,10 +877,11 @@ static struct dentry *nfs_lookup(struct error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error == -ENOENT) goto no_entry; - if (error < 0) { - res = ERR_PTR(error); - goto out_unlock; - } + if (error < 0) + goto out_err; + error = nfs_reval_fsid(dir, &fhandle, &fattr); + if (error < 0) + goto out_err; res = ERR_PTR(-EACCES); inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); if (!inode) @@ -881,6 +896,9 @@ out_unlock: unlock_kernel(); out: return res; +out_err: + res = 
ERR_PTR(error); + goto out_unlock; } #ifdef CONFIG_NFS_V4 @@ -912,7 +930,6 @@ static int is_atomic_open(struct inode * static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *res = NULL; - struct inode *inode = NULL; int error; /* Check that we are indeed trying to open this file */ @@ -926,8 +943,10 @@ static struct dentry *nfs_atomic_lookup( dentry->d_op = NFS_PROTO(dir)->dentry_ops; /* Let vfs_create() deal with O_EXCL */ - if (nd->intent.open.flags & O_EXCL) - goto no_entry; + if (nd->intent.open.flags & O_EXCL) { + d_add(dentry, NULL); + goto out; + } /* Open the file on the server */ lock_kernel(); @@ -940,32 +959,30 @@ static struct dentry *nfs_atomic_lookup( if (nd->intent.open.flags & O_CREAT) { nfs_begin_data_update(dir); - inode = nfs4_atomic_open(dir, dentry, nd); + res = nfs4_atomic_open(dir, dentry, nd); nfs_end_data_update(dir); } else - inode = nfs4_atomic_open(dir, dentry, nd); + res = nfs4_atomic_open(dir, dentry, nd); unlock_kernel(); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); + if (IS_ERR(res)) { + error = PTR_ERR(res); switch (error) { /* Make a negative dentry */ case -ENOENT: - inode = NULL; - break; + res = NULL; + goto out; /* This turned out not to be a regular file */ + case -EISDIR: + case -ENOTDIR: + goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: - res = ERR_PTR(error); goto out; } - } -no_entry: - res = d_add_unique(dentry, inode); - if (res != NULL) + } else if (res != NULL) dentry = res; nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -1009,7 +1026,7 @@ static int nfs_open_revalidate(struct de */ lock_kernel(); verifier = nfs_save_change_attribute(dir); - ret = nfs4_open_revalidate(dir, dentry, openflags); + ret = nfs4_open_revalidate(dir, dentry, openflags, nd); if (!ret) nfs_set_verifier(dentry, verifier); unlock_kernel(); @@ -1132,7 
+1149,7 @@ static int nfs_create(struct inode *dir, lock_kernel(); nfs_begin_data_update(dir); - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); nfs_end_data_update(dir); if (error != 0) goto out_err; @@ -1243,6 +1260,7 @@ static int nfs_sillyrename(struct inode dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, atomic_read(&dentry->d_count)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); #ifdef NFS_PARANOIA if (!dentry->d_inode) @@ -1627,6 +1645,8 @@ int nfs_permission(struct inode *inode, struct rpc_cred *cred; int res = 0; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); + if (mask == 0) goto out; /* Is this sys_access() ? */ Index: linux-2.6.13-rc1/fs/nfs/direct.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/direct.c +++ linux-2.6.13-rc1/fs/nfs/direct.c @@ -47,6 +47,7 @@ #include #include +#include "nfs_iostat.h" #include #include @@ -66,6 +67,7 @@ struct nfs_direct_req { struct kref kref; /* release manager */ struct list_head list; /* nfs_read_data structs */ wait_queue_head_t wait; /* wait for i/o completion */ + struct inode * inode; /* target file of I/O */ struct page ** pages; /* pages in our buffer */ unsigned int npages; /* count of pages */ atomic_t complete, /* i/os we're waiting for */ @@ -207,6 +209,8 @@ static void nfs_direct_read_result(struc { struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + nfs_add_stats(dreq->inode, NFSIOS_SERVERREADBYTES, data->res.count); + if (likely(status >= 0)) atomic_add(data->res.count, &dreq->count); else @@ -347,6 +351,7 @@ static ssize_t nfs_direct_read_seg(struc dreq->pages = pages; dreq->npages = nr_pages; + dreq->inode = inode; rpc_clnt_sigmask(clnt, &oldset); nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, @@ -354,6 +359,8 @@ static ssize_t nfs_direct_read_seg(struc result = 
nfs_direct_read_wait(dreq, clnt->cl_intr); rpc_clnt_sigunmask(clnt, &oldset); + if (result > 0) + nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, result); return result; } @@ -571,11 +578,14 @@ static ssize_t nfs_direct_write(struct i break; return result; } + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, result); tot_bytes += result; file_offset += result; if (result < size) break; } + if (tot_bytes > 0) + nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, tot_bytes); return tot_bytes; } Index: linux-2.6.13-rc1/fs/nfs/file.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/file.c +++ linux-2.6.13-rc1/fs/nfs/file.c @@ -22,6 +22,7 @@ #include #include #include +#include "nfs_iostat.h" #include #include #include @@ -102,18 +103,15 @@ static int nfs_check_flags(int flags) static int nfs_file_open(struct inode *inode, struct file *filp) { - struct nfs_server *server = NFS_SERVER(inode); - int (*open)(struct inode *, struct file *); int res; res = nfs_check_flags(filp->f_flags); if (res) return res; + nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); - /* Do NFSv4 open() call */ - if ((open = server->rpc_ops->file_open) != NULL) - res = open(inode, filp); + res = NFS_SERVER(inode)->rpc_ops->file_open(inode, filp); unlock_kernel(); return res; } @@ -124,6 +122,7 @@ nfs_file_release(struct inode *inode, st /* Ensure that dirty pages are flushed out with the right creds */ if (filp->f_mode & FMODE_WRITE) filemap_fdatawrite(filp->f_mapping); + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return NFS_PROTO(inode)->file_release(inode, filp); } @@ -197,6 +196,7 @@ nfs_file_flush(struct file *file) if ((file->f_mode & FMODE_WRITE) == 0) return 0; + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); lock_kernel(); /* Ensure that data+attribute caches are up to date after close() */ status = nfs_wb_all(inode); @@ -229,6 +229,8 @@ nfs_file_read(struct kiocb *iocb, char _ result = nfs_revalidate_file(inode, iocb->ki_filp); if (!result) result = 
generic_file_aio_read(iocb, buf, count, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); return result; } @@ -280,6 +282,7 @@ nfs_fsync(struct file *file, struct dent dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); lock_kernel(); status = nfs_wb_all(inode); if (!status) { @@ -364,6 +367,8 @@ nfs_file_write(struct kiocb *iocb, const goto out; result = generic_file_aio_write(iocb, buf, count, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, result); out: return result; @@ -504,13 +509,14 @@ static int nfs_lock(struct file *filp, i { struct inode * inode = filp->f_mapping->host; + if (!inode) + return -EINVAL; + dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", inode->i_sb->s_id, inode->i_ino, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); - - if (!inode) - return -EINVAL; + nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) Index: linux-2.6.13-rc1/fs/nfs/idmap.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/idmap.c +++ linux-2.6.13-rc1/fs/nfs/idmap.c @@ -46,7 +46,6 @@ #include #include -#include #include #include Index: linux-2.6.13-rc1/fs/nfs/inode.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/inode.c +++ linux-2.6.13-rc1/fs/nfs/inode.c @@ -27,11 +27,13 @@ #include #include #include +#include "nfs_iostat.h" #include #include #include #include #include +#include #include #include #include @@ -64,6 +66,7 @@ static void nfs_clear_inode(struct inode static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct super_block *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_stats(struct seq_file *, struct vfsmount *); static void nfs_zap_acl_cache(struct 
inode *); static struct rpc_program nfs_program; @@ -77,6 +80,7 @@ static struct super_operations nfs_sops .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_stats = nfs_show_stats, }; /* @@ -123,6 +127,60 @@ struct rpc_program nfsacl_program = { }; #endif /* CONFIG_NFS_V3_ACL */ +#ifdef CONFIG_SYSCTL +/* Follow the established convention in NLM */ +#define CTL_UNNUMBERED -2 + +static ctl_table nfs_sysctls[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_mountpoint_timeout", + .data = &nfs_mountpoint_expiry_timeout, + .maxlen = sizeof(nfs_mountpoint_expiry_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nfs_sysctl_dir[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs", + .mode = 0555, + .child = nfs_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table nfs_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = nfs_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *nfs_sysctl_table; + +static inline int nfs_init_sysctl(void) +{ + nfs_sysctl_table = register_sysctl_table(nfs_sysctl_root, 0); + return nfs_sysctl_table != NULL ? 0 : -ENOMEM; +} + +static inline void nfs_destroy_sysctl(void) +{ + unregister_sysctl_table(nfs_sysctl_table); +} +#else +#define nfs_init_sysctl() (0) +#define nfs_destroy_sysctl() do { } while(0) +#endif /* CONFIG_SYSCTL */ + static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { @@ -227,6 +285,14 @@ nfs_block_size(unsigned long bsize, unsi return nfs_block_bits(bsize, nrbitsp); } +static inline void +nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + /* * Obtain the root inode of the file system. 
*/ @@ -243,6 +309,7 @@ nfs_get_root(struct super_block *sb, str return ERR_PTR(error); } + server->fsid = fsinfo->fattr->fsid; rooti = nfs_fhget(sb, rootfh, fsinfo->fattr); if (!rooti) return ERR_PTR(-ENOMEM); @@ -287,6 +354,12 @@ nfs_sb_init(struct super_block *sb, rpc_ } sb->s_root->d_op = server->rpc_ops->dentry_ops; + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + no_root_error = -ENOMEM; + goto out_no_root; + } + /* Get some general file system info */ if (server->namelen == 0 && server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) @@ -338,9 +411,7 @@ nfs_sb_init(struct super_block *sb, rpc_ } server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - sb->s_maxbytes = fsinfo.maxfilesize; - if (sb->s_maxbytes > MAX_LFS_FILESIZE) - sb->s_maxbytes = MAX_LFS_FILESIZE; + nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0; server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0; @@ -378,6 +449,9 @@ nfs_create_client(struct nfs_server *ser if (!timeparms.to_retries) timeparms.to_retries = 5; + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + /* create transport and client */ xprt = xprt_create_proto(tcp ? 
IPPROTO_TCP : IPPROTO_UDP, &server->addr, &timeparms); @@ -564,7 +638,7 @@ nfs_statfs(struct super_block *sb, struc } -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { static struct proc_nfs_info { int flag; @@ -572,37 +646,120 @@ static int nfs_show_options(struct seq_f char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, - { NFS_MOUNT_INTR, ",intr", "" }, - { NFS_MOUNT_POSIX, ",posix", "" }, + { NFS_MOUNT_INTR, ",intr", ",nointr" }, { NFS_MOUNT_TCP, ",tcp", ",udp" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, - { NFS_MOUNT_NONLM, ",nolock", ",lock" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { 0, NULL, NULL } }; struct proc_nfs_info *nfs_infop; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); - seq_printf(m, ",v%d", nfss->rpc_ops->version); + seq_printf(m, ",vers=%d", nfss->rpc_ops->version); seq_printf(m, ",rsize=%d", nfss->rsize); seq_printf(m, ",wsize=%d", nfss->wsize); - if (nfss->acregmin != 3*HZ) + if (nfss->acregmin != 3*HZ || showdefaults) seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ) + if (nfss->acregmax != 60*HZ || showdefaults) seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ) + if (nfss->acdirmin != 30*HZ || showdefaults) seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ) + if (nfss->acdirmax != 60*HZ || showdefaults) seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); + if (nfss->flags & NFS_MOUNT_TCP) + seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); + seq_printf(m, ",retrans=%u", nfss->retrans_count); + for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); else seq_puts(m, nfs_infop->nostr); } +} + +static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct 
nfs_server *nfss = NFS_SB(mnt->mnt_sb); + + nfs_show_mount_options(m, nfss, 0); + seq_puts(m, ",addr="); seq_escape(m, nfss->hostname, " \t\n\\"); + + return 0; +} + +static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + * need ro/rw, sync/async + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%d", nfss->wtmult); + seq_printf(m, ",dtsize=%d", nfss->dtsize); + seq_printf(m, ",bsize=%d", nfss->bsize); + seq_printf(m, ",namelen=%d", nfss->namelen); + +#ifdef CONFIG_NFS_V4 + if (nfss->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct nfs_iostats *stats; + + if (!cpu_possible(cpu)) + continue; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) 
+ seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + return 0; } @@ -615,6 +772,8 @@ nfs_zap_caches(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; @@ -740,6 +899,11 @@ nfs_fhget(struct super_block *sb, struct if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && fattr->size <= NFS_LIMIT_READDIRPLUS) NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + /* Deal with crossing mountpoints */ + if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; else @@ -827,6 +991,7 @@ nfs_setattr(struct dentry *dentry, struc if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); inode->i_size = attr->ia_size; vmtruncate(inode, attr->ia_size); } @@ -847,14 +1012,17 @@ nfs_wait_on_inode(struct inode *inode, i { struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_inode *nfsi = NFS_I(inode); + unsigned long start = jiffies; + int error = 0; - int error; - if (!(NFS_FLAGS(inode) & flag)) - return 0; - atomic_inc(&inode->i_count); - error = nfs_wait_event(clnt, nfsi->nfs_i_wait, + if ((NFS_FLAGS(inode) & flag)) { + atomic_inc(&inode->i_count); + error = nfs_wait_event(clnt, nfsi->nfs_i_wait, !(NFS_FLAGS(inode) & flag)); - iput(inode); + nfs_add_stats(inode, NFSIOS_WAITEVENTJIFFIES, (jiffies - start)); + nfs_inc_stats(inode, NFSIOS_WAITEVENT); + iput(inode); + } return error; } @@ -1096,6 +1264,7 @@ int nfs_attribute_timeout(struct inode * */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 
if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) && !nfs_attribute_timeout(inode)) return NFS_STALE(inode) ? -ESTALE : 0; @@ -1112,6 +1281,7 @@ void nfs_revalidate_mapping(struct inode struct nfs_inode *nfsi = NFS_I(inode); if (nfsi->flags & NFS_INO_INVALID_DATA) { + nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); if (S_ISREG(inode->i_mode)) { if (filemap_fdatawrite(mapping) == 0) filemap_fdatawait(mapping); @@ -1257,6 +1427,7 @@ int nfs_refresh_inode(struct inode *inod */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier) { + struct nfs_server *server; struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; unsigned int invalid = 0; @@ -1284,6 +1455,12 @@ static int nfs_update_inode(struct inode if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; + server = NFS_SERVER(inode); + /* Update the fsid if and only if this is the root directory */ + if (inode == inode->i_sb->s_root->d_inode + && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + server->fsid = fattr->fsid; + /* * Update the read time so we don't revalidate too often. */ @@ -1361,6 +1538,7 @@ static int nfs_update_inode(struct inode /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { + nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; } else if (time_after(jiffies, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { @@ -1396,9 +1574,170 @@ static int nfs_update_inode(struct inode } /* + * nfs_try_migrate_filehandle - Check if we can migrate the inode filehandle + * @inode - pointer to inode + * @fh - the filehandle resulting from lookup() + * @fattr - attributes associated with the new filehandle + * + * Do our very best to update existing inodes when the user wants to migrate + * this filesystem to a replica server. 
+ * + * Note that here be HUGE dragons, with endless possibilities for causing + * trouble... + */ +int nfs_try_migrate_filehandle(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr, uint32_t generation) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + /* Argh! The basic file type has changed */ + if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + goto out_bad; + /* Fileid + filehandle are the same. Good! */ + if (nfs_compare_fh(&nfsi->fh, fh) == 0 && nfsi->fileid == fattr->fileid) + goto out_good; + if (fattr->valid && NFS_ATTR_FATTR_V4) { + /* Do the NFSv4 change attributes match our cached value? */ + if (nfsi->change_attr != fattr->change_attr) + goto out_bad; + } else { + /* Does the ctime match? */ + if (!timespec_equal(&fattr->ctime, &inode->i_ctime)) + goto out_bad; + } + /* Does the file size match? */ + if (nfs_size_to_loff_t(fattr->size) != inode->i_size) + goto out_bad; + /* FIXME: Here lie the biggest dragons: + * Try considering all possible races w.r.t. iget5_locked() + */ + nfs_copy_fh(&nfsi->fh, fh); + if (nfsi->fileid != fattr->fileid) { + /* The very concept of migrating to a new hash bucket + * is so full of holes and races that it defies belief! + */ + remove_inode_hash(inode); + nfsi->fileid = fattr->fileid; + inode->i_ino = nfs_fattr_to_ino_t(fattr); + insert_inode_hash(inode); + } +out_good: + inode->i_generation = generation; + return 0; +out_bad: + return -EIO; +} + +/* + * nfs_try_migrate_inode - Update an inode's filehandle after migration + * @inode - pointer to inode to migrate + * @dentry - pointer to dentry + */ +int nfs_try_migrate_inode(struct inode *inode, struct dentry *dentry) +{ + struct nfs_fh fh; + struct nfs_fattr fattr; + struct dentry *next, *next_parent; + uint32_t generation; + int status; + + if (dentry == NULL) { + status = -ENOENT; + dentry = d_find_alias(inode); + if (dentry == NULL) + goto out; + } else + dget(dentry); +repeat: + /* Has this inode already been revalidated? 
*/ + status = 0; + generation = NFS_SERVER(inode)->generation; + if ((long)generation - (long)inode->i_generation <= 0) + goto out; + /* No. Search for a previously revalidated path element */ + next = dget(dentry); + next_parent = dget_parent(dentry); + while((long)generation - (long)next_parent->d_inode->i_generation > 0) { + BUG_ON(IS_ROOT(next_parent)); + dput(next); + next = next_parent; + next_parent = dget_parent(next); + } + status = NFS_PROTO(inode)->lookup(next_parent->d_inode, &next->d_name, + &fh, &fattr); + if (status == 0) + status = nfs_try_migrate_filehandle(next->d_inode, &fh, &fattr, generation); + switch (status) { + case -ESTALE: + if (IS_ROOT(next_parent)) + break; + case 0: + if (dentry->d_inode == inode) + break; + dput(next_parent); + dput(next); + goto repeat; + default: + d_drop(next); + } + dput(next_parent); + dput(next); +out: + dput(dentry); + dprintk("%s: returned error %d\n", __FUNCTION__, status); + return status; +} + +/* * File system information */ +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - arbitrary string to prepend to the path + * @dentry - pointer to dentry + * @buffer - result buffer + * @buflen - length of buffer + * + * Helper function for constructing the path from the + * root dentry to an arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition. 
+ */ +static char *nfs_path(const char *base, const struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *end = buffer+buflen; + int namelen; + + *--end = '\0'; + buflen--; + spin_lock(&dcache_lock); + while (!IS_ROOT(dentry)) { + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + break; /* must not return with dcache_lock held: unlock below, then the buflen < 0 check fails with -ENAMETOOLONG */ + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + dentry = dentry->d_parent; + } + spin_unlock(&dcache_lock); + namelen = strlen(base); + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + buflen -= namelen; + if (buflen < 0) + goto Elong; + end -= namelen; + memcpy(end, base, namelen); + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + static int nfs_set_super(struct super_block *s, void *data) { s->s_fs_info = data; @@ -1547,6 +1886,7 @@ static void nfs_kill_super(struct super_ if (server->hostname != NULL) kfree(server->hostname); kfree(server); + nfs_release_automount_timer(); } static struct file_system_type nfs_fs_type = { @@ -1571,6 +1911,7 @@ static struct super_operations nfs4_sops .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_stats = nfs_show_stats, }; /* @@ -1582,9 +1923,6 @@ static void nfs4_clear_inode(struct inod { struct nfs_inode *nfsi = NFS_I(inode); - /* If we are holding a delegation, return it! */ - if (nfsi->delegation != NULL) - nfs_inode_return_delegation(inode); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); /* Now clear out any remaining state */ @@ -1602,6 +1940,9 @@ static void nfs4_clear_inode(struct inod BUG_ON(atomic_read(&state->count) != 1); nfs4_close_state(state, state->state); } + /* If we are holding a delegation, return it!
*/ + if (nfsi->delegation != NULL) + nfs_inode_clear_delegation(inode); } @@ -1656,6 +1997,9 @@ static int nfs4_fill_super(struct super_ return -EINVAL; } + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + clp = nfs4_get_client(&server->addr.sin_addr); if (!clp) { dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); @@ -1883,6 +2227,7 @@ out_free: kfree(server->mnt_path); if (server->hostname) kfree(server->hostname); + nfs_free_iostats(server->io_stats); kfree(server); return s; } @@ -1905,6 +2250,7 @@ static void nfs4_kill_super(struct super if (server->hostname != NULL) kfree(server->hostname); kfree(server); + nfs_release_automount_timer(); } static struct file_system_type nfs4_fs_type = { @@ -1915,6 +2261,59 @@ static struct file_system_type nfs4_fs_t .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; +/* Constructs the SERVER-side path */ +static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen) +{ + return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen); +} + +static inline char *nfs4_dup_path(const struct dentry *dentry) +{ + char *page = (char *) __get_free_page(GFP_USER); + char *path; + + path = page ? nfs4_path(dentry, page, PAGE_SIZE) : ERR_PTR(-ENOMEM); /* scratch page allocation can fail; never build the path into a NULL buffer */ + if (!IS_ERR(path)) { + int len = PAGE_SIZE + page - path; + char *tmp = path; + + path = kmalloc(len, GFP_KERNEL); + if (path) + memcpy(path, tmp, len); + else + path = ERR_PTR(-ENOMEM); + } + free_page((unsigned long)page); + return path; +} + +static struct super_block *nfs4_clone_client(struct nfs_server *server, const struct dentry *dentry) +{ + struct nfs4_client *clp = server->nfs4_state; + struct super_block *sb; + + server->mnt_path = nfs4_dup_path(dentry); + if (IS_ERR(server->mnt_path)) { + sb = (struct super_block *)server->mnt_path; + goto err; + } + sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server); + if (IS_ERR(sb) || sb->s_root) + goto free_path; +
nfs4_server_capabilities(server, &server->fh); + + down_write(&clp->cl_sem); + atomic_inc(&clp->cl_count); + list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); + up_write(&clp->cl_sem); + return sb; +free_path: + kfree(server->mnt_path); +err: + server->mnt_path = NULL; + return sb; +} + #define nfs4_init_once(nfsi) \ do { \ INIT_LIST_HEAD(&(nfsi)->open_states); \ @@ -1925,12 +2324,174 @@ static struct file_system_type nfs4_fs_t #define register_nfs4fs() register_filesystem(&nfs4_fs_type) #define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type) #else +#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL) #define nfs4_init_once(nfsi) \ do { } while (0) #define register_nfs4fs() (0) #define unregister_nfs4fs() #endif +static inline struct super_block *nfs_clone_client(struct nfs_server *server) +{ + struct super_block *sb; + + sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM)) + lockd_up(); + return sb; +} + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; +}; + +static struct super_block *clone_nfs_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + struct nfs_server *server; + struct nfs_server *parent = NFS_SB(data->sb); + struct super_block *sb = ERR_PTR(-EINVAL); + void *err = ERR_PTR(-ENOMEM); + struct inode *root_inode; + struct nfs_fsinfo fsinfo; + int len; + + server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (server == NULL) + goto out_err; + memcpy(server, parent, sizeof(*server)); + len = strlen(parent->hostname) + 1; + server->hostname = kmalloc(len, GFP_KERNEL); + if (server->hostname == NULL) + goto free_server; + memcpy(server->hostname, parent->hostname, len); + server->fsid = data->fattr->fsid; + nfs_copy_fh(&server->fh, data->fh); + server->io_stats = 
nfs_alloc_iostats(); + if (server->io_stats == NULL) + goto free_hostname; + if (rpciod_up() != 0) + goto free_iostats; + + switch (parent->rpc_ops->version) { + case 2: + case 3: + sb = nfs_clone_client(server); + break; + case 4: + sb = nfs4_clone_client(server, data->dentry); + break; + default: + BUG(); + } + if (IS_ERR((err = sb)) || sb->s_root) + goto kill_rpciod; + sb->s_op = data->sb->s_op; + sb->s_blocksize = data->sb->s_blocksize; + sb->s_blocksize_bits = data->sb->s_blocksize_bits; + sb->s_maxbytes = data->sb->s_maxbytes; + + server->client_sys = server->client_acl = ERR_PTR(-EINVAL); + server->client = rpc_clone_client(parent->client); + if (IS_ERR((err = server->client))) + goto out_deactivate; + if (!IS_ERR(parent->client_sys)) { + server->client_sys = rpc_clone_client(parent->client_sys); + if (IS_ERR((err = server->client_sys))) + goto out_deactivate; + } + if (!IS_ERR(parent->client_acl)) { + server->client_acl = rpc_clone_client(parent->client_acl); + if (IS_ERR((err = server->client_acl))) + goto out_deactivate; + } + root_inode = nfs_fhget(sb, data->fh, data->fattr); + if (!root_inode) + goto out_deactivate; + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_put_root; + fsinfo.fattr = data->fattr; + if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0) + nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); + sb->s_root->d_op = server->rpc_ops->dentry_ops; + sb->s_flags |= MS_ACTIVE; + return sb; +out_put_root: + iput(root_inode); +out_deactivate: + up_write(&sb->s_umount); + deactivate_super(sb); + return (struct super_block *)(IS_ERR(err) ? err : ERR_PTR(-ENOMEM)); /* on the root inode/dentry failure paths err still holds a valid rpc client pointer */ +kill_rpciod: + rpciod_down(); +free_iostats: + nfs_free_iostats(server->io_stats); +free_hostname: + kfree(server->hostname); +free_server: + kfree(server); +out_err: + return (struct super_block *)err; +} + +static struct file_system_type clone_nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .get_sb = clone_nfs_sb, + .kill_sb = nfs_kill_super, + .fs_flags =
FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static inline char *nfs_devname(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen); +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @mnt_parent - mountpoint of parent directory + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * + */ +struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, + const struct dentry *dentry, struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_clone_mount mountdata = { + .sb = mnt_parent->mnt_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("%s: submounting on %s/%s\n", __FUNCTION__, + dentry->d_parent->d_name.name, + dentry->d_name.name); + if (page == NULL) + goto out; + devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); + if (!IS_ERR(devname)) + mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata); + else + mnt = (struct vfsmount *)devname; + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __FUNCTION__); + return mnt; +} + extern int nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int nfs_init_readpagecache(void); @@ -2012,6 +2573,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_init_sysctl(); + if (err) + goto out5; + err = nfs_init_nfspagecache(); if (err) goto out4; @@ -2059,6 +2624,8 @@ out2: out3: nfs_destroy_nfspagecache(); out4: + nfs_destroy_sysctl(); +out5: return err; } @@ -2074,6 +2641,7 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif + nfs_destroy_sysctl(); unregister_filesystem(&nfs_fs_type); unregister_nfs4fs(); } Index: linux-2.6.13-rc1/fs/nfs/namespace.c 
=================================================================== --- /dev/null +++ linux-2.6.13-rc1/fs/nfs/namespace.c @@ -0,0 +1,111 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust + * + * NFS namespace + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define NFSDBG_FACILITY NFSDBG_VFS + +static LIST_HEAD(nfs_automount_list); +static void nfs_expire_automounts(void *list); +static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +/* + * nfs_follow_mountpoint - handle crossing a mountpoint on the server + * @dentry - dentry of mountpoint + * @nd - nameidata info + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. 
+ */ +static int nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct dentry *parent; + struct nfs_fh fh; + struct nfs_fattr fattr; + int err; + + BUG_ON(IS_ROOT(dentry)); + dprintk("%s: enter\n", __FUNCTION__); + dput(nd->dentry); + nd->dentry = dget(dentry); + if (d_mountpoint(nd->dentry)) + goto out_follow; + /* Look it up again */ + parent = dget_parent(nd->dentry); + err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr); + dput(parent); + if (err != 0) + goto out_err; + mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out_err; + } + mntget(mnt); + err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, &nfs_automount_list); + if (err < 0) { + mntput(mnt); + if (err == -EBUSY) + goto out_follow; + goto out_err; + } + mntput(nd->mnt); + dput(nd->dentry); + nd->mnt = mnt; + nd->dentry = dget(mnt->mnt_root); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +out: + dprintk("%s: done, returned %d\n", __FUNCTION__, err); + return err; +out_err: + path_release(nd); + goto out; +out_follow: + while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) + ; + err = 0; + goto out; +} + +struct inode_operations nfs_mountpoint_inode_operations = { + .follow_link = nfs_follow_mountpoint, + .getattr = nfs_getattr, +}; + +static void nfs_expire_automounts(void *data) +{ + struct list_head *list = (struct list_head *)data; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) { + cancel_delayed_work(&nfs_automount_task); + flush_scheduled_work(); + } +} Index: linux-2.6.13-rc1/fs/nfs/nfs2xdr.c =================================================================== --- 
linux-2.6.13-rc1.orig/fs/nfs/nfs2xdr.c +++ linux-2.6.13-rc1/fs/nfs/nfs2xdr.c @@ -131,7 +131,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fatt fattr->du.nfs2.blocksize = ntohl(*p++); rdev = ntohl(*p++); fattr->du.nfs2.blocks = ntohl(*p++); - fattr->fsid_u.nfs3 = ntohl(*p++); + fattr->fsid.major = ntohl(*p++); + fattr->fsid.minor = 0; fattr->fileid = ntohl(*p++); p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); Index: linux-2.6.13-rc1/fs/nfs/nfs3proc.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs3proc.c +++ linux-2.6.13-rc1/fs/nfs/nfs3proc.c @@ -298,7 +298,7 @@ static int nfs3_proc_commit(struct nfs_w */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs_fh fhandle; struct nfs_fattr fattr; Index: linux-2.6.13-rc1/fs/nfs/nfs3xdr.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs3xdr.c +++ linux-2.6.13-rc1/fs/nfs/nfs3xdr.c @@ -166,7 +166,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fatt if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) fattr->rdev = 0; - p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3); + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; p = xdr_decode_hyper(p, &fattr->fileid); p = xdr_decode_time3(p, &fattr->atime); p = xdr_decode_time3(p, &fattr->mtime); Index: linux-2.6.13-rc1/fs/nfs/nfs4_fs.h =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs4_fs.h +++ linux-2.6.13-rc1/fs/nfs/nfs4_fs.h @@ -11,6 +11,8 @@ #ifdef CONFIG_NFS_V4 +#include + struct idmap; /* @@ -97,7 +99,7 @@ struct nfs4_client { * sequences of RPC calls. Their sole purpose is to provide once-only * semantics by allowing the server to identify replayed requests. 
* - * The ->so_sema is held during all state_owner seqid-mutating operations: + * The ->so_iosem is held during all state_owner seqid-mutating operations: * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize * so_seqid. */ @@ -105,8 +107,8 @@ struct nfs4_state_owner { struct list_head so_list; /* per-clientid list of state_owners */ struct nfs4_client *so_client; u32 so_id; /* 32-bit identifier, unique */ - struct semaphore so_sema; - u32 so_seqid; /* protected by so_sema */ + struct iosem so_iosem; + u32 so_seqid; /* protected by so_iosem */ atomic_t so_count; struct rpc_cred *so_cred; /* Associated cred */ @@ -153,7 +155,7 @@ struct nfs4_state { struct inode *inode; /* Pointer to the inode */ unsigned long flags; /* Do we hold any locks? */ - struct semaphore lock_sema; /* Serializes file locking operations */ + struct iosem lock_iosem; /* Serializes file locking operations */ spinlock_t state_lock; /* Protects the lock_states list */ nfs4_stateid stateid; @@ -191,8 +193,11 @@ extern int nfs4_proc_setclientid_confirm extern int nfs4_proc_async_renew(struct nfs4_client *); extern int nfs4_proc_renew(struct nfs4_client *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); -extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); -extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); +extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); +extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); +extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, + struct nfs4_fs_locations *fs_locations, struct page *page); extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; @@ -228,10 +233,22 @@ extern void 
nfs4_increment_seqid(int sta extern void nfs4_schedule_state_recovery(struct nfs4_client *); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls); +extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t); +extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); extern const nfs4_stateid zero_stateid; +static void inline nfs_lock_state_owner(struct nfs4_state_owner *sp) +{ + iosem_lock(&sp->so_iosem); +} + +static void inline nfs_unlock_state_owner(struct nfs4_state_owner *sp) +{ + iosem_unlock(&sp->so_iosem); +} + /* nfs4xdr.c */ extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; Index: linux-2.6.13-rc1/fs/nfs/nfs4proc.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs4proc.c +++ linux-2.6.13-rc1/fs/nfs/nfs4proc.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" @@ -302,7 +303,7 @@ static int _nfs4_open_delegation_recall( }; int status = 0; - down(&sp->so_sema); + nfs_lock_state_owner(sp); if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) goto out; if (state->state == 0) @@ -318,7 +319,7 @@ static int _nfs4_open_delegation_recall( clear_bit(NFS_DELEGATED_STATE, &state->flags); } out: - up(&sp->so_sema); + nfs_unlock_state_owner(sp); dput(parent); return status; } @@ -455,6 +456,7 @@ static int _nfs4_open_expired(struct nfs .f_attr = &f_attr, .server = server, }; + uint32_t generation; int status = 0; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { @@ -465,23 +467,17 @@ static int _nfs4_open_expired(struct nfs set_bit(NFS_DELEGATED_STATE, &state->flags); goto out; } + /* If we are in a failover situation, recover path first */ + 
status = nfs_try_migrate_inode(dir, parent); + if (status != 0) + goto out_nodeleg; + generation = server->generation; status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_nodeleg; - /* Check if files differ */ - if ((f_attr.mode & S_IFMT) != (inode->i_mode & S_IFMT)) + status = nfs_try_migrate_filehandle(inode, &o_res.fh, o_res.f_attr, generation); + if (status != 0) goto out_stale; - /* Has the file handle changed? */ - if (nfs_compare_fh(&o_res.fh, NFS_FH(inode)) != 0) { - /* Verify if the change attributes are the same */ - if (f_attr.change_attr != NFS_I(inode)->change_attr) - goto out_stale; - if (nfs_size_to_loff_t(f_attr.size) != inode->i_size) - goto out_stale; - /* Lets just pretend that this is the same file */ - nfs_copy_fh(NFS_FH(inode), &o_res.fh); - NFS_I(inode)->fileid = f_attr.fileid; - } memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); if (o_res.delegation_type != 0) { if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) @@ -495,7 +491,6 @@ out: dput(parent); return status; out_stale: - status = -ESTALE; /* Invalidate the state owner so we don't ever use it again */ nfs4_drop_state_owner(sp); d_drop(dentry); @@ -564,7 +559,7 @@ static int _nfs4_open_delegated(struct i dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); goto out_err; } - down(&sp->so_sema); + nfs_lock_state_owner(sp); state = nfs4_get_open_state(inode, sp); if (state == NULL) goto out_err; @@ -589,7 +584,7 @@ static int _nfs4_open_delegated(struct i set_bit(NFS_DELEGATED_STATE, &state->flags); update_open_stateid(state, &delegation->stateid, open_flags); out_ok: - up(&sp->so_sema); + nfs_unlock_state_owner(sp); nfs4_put_state_owner(sp); up_read(&nfsi->rwsem); up_read(&clp->cl_sem); @@ -600,7 +595,7 @@ out_err: if (sp != NULL) { if (state != NULL) nfs4_put_open_state(state); - up(&sp->so_sema); + nfs_unlock_state_owner(sp); nfs4_put_state_owner(sp); } up_read(&nfsi->rwsem); @@ -665,11 +660,25 @@ static int _nfs4_do_open(struct 
inode *d } else o_arg.u.attrs = sattr; /* Serialization for the sequence id */ - down(&sp->so_sema); + nfs_lock_state_owner(sp); status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); if (status != 0) goto out_err; + /* OPEN on anything except a regular file is disallowed in NFSv4 */ + switch (f_attr.mode & S_IFMT) { + case S_IFREG: + break; + case S_IFLNK: + status = -ELOOP; + goto out_err; + case S_IFDIR: + status = -EISDIR; + goto out_err; + default: + status = -ENOTDIR; + goto out_err; + } status = -ENOMEM; inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr); @@ -681,7 +690,7 @@ static int _nfs4_do_open(struct inode *d update_open_stateid(state, &o_res.stateid, flags); if (o_res.delegation_type != 0) nfs_inode_set_delegation(inode, cred, &o_res); - up(&sp->so_sema); + nfs_unlock_state_owner(sp); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); *res = state; @@ -690,7 +699,7 @@ out_err: if (sp != NULL) { if (state != NULL) nfs4_put_open_state(state); - up(&sp->so_sema); + nfs_unlock_state_owner(sp); nfs4_put_state_owner(sp); } /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ @@ -816,7 +825,7 @@ static void nfs4_close_done(struct rpc_t } state->state = calldata->arg.open_flags; nfs4_put_open_state(state); - up(&sp->so_sema); + nfs_unlock_state_owner(sp); nfs4_put_state_owner(sp); up_read(&server->nfs4_state->cl_sem); kfree(calldata); @@ -873,15 +882,46 @@ int nfs4_do_close(struct inode *inode, s * caller that an asynchronous RPC call has been launched, and * that it will release the semaphores on completion. */ - return (status == 0) ? 
-EINPROGRESS : status; + if (status == 0) + return -EINPROGRESS; + kfree(calldata); + return status; } -struct inode * +static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +{ + struct file *filp; + int flags; + + flags = nd->intent.open.flags & ~(FMODE_READ|FMODE_WRITE); + switch (nd->intent.open.flags & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + flags |= O_RDWR; + break; + case FMODE_WRITE: + flags |= O_WRONLY; + break; + case FMODE_READ: + flags |= O_RDONLY; + } + + filp = dentry_open(dget(dentry), mntget(nd->mnt), flags); + if (!IS_ERR(filp)) { + struct nfs_open_context *ctx; + ctx = (struct nfs_open_context *)filp->private_data; + ctx->state = state; + nd->intent.open.file = filp; + } else + nfs4_close_state(state, nd->intent.open.flags); +} + +struct dentry * nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct iattr attr; struct rpc_cred *cred; struct nfs4_state *state; + struct dentry *res; if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -895,16 +935,23 @@ nfs4_atomic_open(struct inode *dir, stru cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); if (IS_ERR(cred)) - return (struct inode *)cred; + return (struct dentry *)cred; state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); put_rpccred(cred); - if (IS_ERR(state)) - return (struct inode *)state; - return state->inode; + if (IS_ERR(state)) { + if (PTR_ERR(state) == -ENOENT) + d_add(dentry, NULL); + return (struct dentry *)state; + } + res = d_add_unique(dentry, state->inode); + if (res != NULL) + dentry = res; + nfs4_intent_set_file(nd, dentry, state); + return res; } int -nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags) +nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) { struct rpc_cred *cred; struct nfs4_state *state; @@ -917,18 +964,18 @@ 
nfs4_open_revalidate(struct inode *dir, if (IS_ERR(state)) state = nfs4_do_open(dir, dentry, openflags, NULL, cred); put_rpccred(cred); - if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0) + if (state == ERR_PTR(-ENOENT) && dentry->d_inode == NULL) return 1; if (IS_ERR(state)) return 0; inode = state->inode; + iput(inode); if (inode == dentry->d_inode) { - iput(inode); + nfs4_intent_set_file(nd, dentry, state); return 1; } d_drop(dentry); nfs4_close_state(state, openflags); - iput(inode); return 0; } @@ -957,7 +1004,7 @@ static int _nfs4_server_capabilities(str return status; } -static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { struct nfs4_exception exception = { }; int err; @@ -1427,7 +1474,7 @@ static int nfs4_proc_commit(struct nfs_w static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs4_state *state; struct rpc_cred *cred; @@ -1449,11 +1496,11 @@ nfs4_proc_create(struct inode *dir, stru struct nfs_fattr fattr; status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, NFS_FH(state->inode), sattr, state); - if (status == 0) - goto out; - } else if (flags != 0) - goto out; - nfs4_close_state(state, flags); + } + if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) + nfs4_intent_set_file(nd, dentry, state); + else + nfs4_close_state(state, flags); out: return status; } @@ -2100,65 +2147,6 @@ nfs4_proc_renew(struct nfs4_client *clp) return 0; } -/* - * We will need to arrange for the VFS layer to provide an atomic open. - * Until then, this open method is prone to inefficiency and race conditions - * due to the lookup, potential create, and open VFS calls from sys_open() - * placed on the wire. 
- */ -static int -nfs4_proc_file_open(struct inode *inode, struct file *filp) -{ - struct dentry *dentry = filp->f_dentry; - struct nfs_open_context *ctx; - struct nfs4_state *state = NULL; - struct rpc_cred *cred; - int status = -ENOMEM; - - dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n", - (int)dentry->d_parent->d_name.len, - dentry->d_parent->d_name.name, - (int)dentry->d_name.len, dentry->d_name.name); - - - /* Find our open stateid */ - cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_context(dentry, cred); - put_rpccred(cred); - if (unlikely(ctx == NULL)) - return -ENOMEM; - status = -EIO; /* ERACE actually */ - state = nfs4_find_state(inode, cred, filp->f_mode); - if (unlikely(state == NULL)) - goto no_state; - ctx->state = state; - nfs4_close_state(state, filp->f_mode); - ctx->mode = filp->f_mode; - nfs_file_set_open_context(filp, ctx); - put_nfs_open_context(ctx); - if (filp->f_mode & FMODE_WRITE) - nfs_begin_data_update(inode); - return 0; -no_state: - printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__); - put_nfs_open_context(ctx); - return status; -} - -/* - * Release our state - */ -static int -nfs4_proc_file_release(struct inode *inode, struct file *filp) -{ - if (filp->f_mode & FMODE_WRITE) - nfs_end_data_update(inode); - nfs_file_clear_open_context(filp); - return 0; -} - static inline int nfs4_server_supports_acls(struct nfs_server *server) { return (server->caps & NFS_CAP_ACLS) @@ -2529,38 +2517,56 @@ nfs4_proc_setclientid_confirm(struct nfs return status; } -static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) +struct nfs4_delegreturn_data { + struct nfs_server *server; + struct nfs_fh fhandle; + nfs4_stateid stateid; + struct nfs4_delegreturnargs args; +}; + +static void nfs4_delegreturn_done(struct rpc_task *task) { - struct nfs4_delegreturnargs args = { - .fhandle = NFS_FH(inode), - 
.stateid = stateid, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], - .rpc_argp = &args, - .rpc_cred = cred, - }; + struct nfs4_delegreturn_data *calldata = (struct nfs4_delegreturn_data *)task->tk_calldata; - return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_schedule_state_recovery(calldata->server->nfs4_state); + break; + default: + if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { + rpc_restart_call(task); + return; + } + } + kfree(calldata); } int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) { - struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_delegreturn(inode, cred, stateid); - switch (err) { - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs4_state); - case 0: - return 0; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + struct nfs4_delegreturn_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], + .rpc_cred = cred, + }; + int status = -ENOMEM; + + calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->server = NFS_SERVER(inode); + nfs_copy_fh(&calldata->fhandle, NFS_FH(inode)); + memcpy(&calldata->stateid, stateid, sizeof(calldata->stateid)); + calldata->args.fhandle = &calldata->fhandle; + calldata->args.stateid = &calldata->stateid; + msg.rpc_argp = &calldata->args; + status = rpc_call_async(NFS_CLIENT(inode), &msg, 0, nfs4_delegreturn_done, calldata); + if (status != 0) + kfree(calldata); +out: + return status; } #define NFS4_LOCK_MINTIMEOUT (1 * HZ) @@ -2630,7 +2636,7 @@ static int _nfs4_proc_getlk(struct nfs4_ down_read(&clp->cl_sem); nlo.clientid = clp->cl_clientid; 
- down(&state->lock_sema); + iosem_lock(&state->lock_iosem); status = nfs4_set_lock_state(state, request); if (status != 0) goto out; @@ -2657,7 +2663,7 @@ static int _nfs4_proc_getlk(struct nfs4_ status = 0; } out: - up(&state->lock_sema); + iosem_unlock(&state->lock_iosem); up_read(&clp->cl_sem); return status; } @@ -2694,54 +2700,93 @@ static int do_vfs_lock(struct file *file return res; } -static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) +struct nfs4_unlockdata { + struct nfs_lockargs arg; + struct nfs_lockres res; + struct nfs_locku_opargs luargs; + struct nfs4_lock_state *lsp; + struct nfs4_state *state; + struct nfs_open_context *ctx; + struct file_lock fl; + struct iosem_work worker; +}; + +static void release_calldata(struct nfs4_unlockdata *calldata) { - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_client *clp = server->nfs4_state; - struct nfs_lockargs arg = { - .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), - }; - struct nfs_lockres res = { - .server = server, - }; + nfs4_put_lock_state(calldata->lsp); + iosem_unlock(&calldata->state->lock_iosem); + up_read(&calldata->state->owner->so_client->cl_sem); + put_nfs_open_context(calldata->ctx); + kfree(calldata); +} + +static void nfs4_locku_done(struct rpc_task *task) +{ + struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + + nfs4_increment_lock_seqid(task->tk_status, calldata->lsp); + if (task->tk_status == 0) + memcpy(&calldata->lsp->ls_stateid, &calldata->res.u.stateid, + sizeof(calldata->lsp->ls_stateid)); + release_calldata(calldata); +} + +static void nfs4_do_unlock_func(void *data) +{ + struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)data; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = 
state->owner->so_cred, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], + .rpc_argp = &calldata->arg, + .rpc_resp = &calldata->res, + .rpc_cred = calldata->state->owner->so_cred, }; + struct nfs4_lock_state *lsp = calldata->lsp; + + calldata->luargs.seqid = lsp->ls_seqid; + memcpy(&calldata->luargs.stateid, &lsp->ls_stateid, sizeof(calldata->luargs.stateid)); + + if (rpc_call_async(NFS_SERVER(calldata->state->inode)->client, + &msg, 0, nfs4_locku_done, calldata) != 0) + release_calldata(calldata); +} + +static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *fl) +{ + struct nfs4_unlockdata *calldata; struct nfs4_lock_state *lsp; - struct nfs_locku_opargs luargs; - int status; - - down_read(&clp->cl_sem); - down(&state->lock_sema); - status = nfs4_set_lock_state(state, request); - if (status != 0) + int err = 0; + + if (!test_bit(LK_STATE_IN_USE, &state->flags)) goto out; - lsp = request->fl_u.nfs4_fl.owner; - /* We might have lost the locks! */ - if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) + lsp = nfs4_find_lock_state(state, fl->fl_owner); + if (lsp == NULL) goto out; - luargs.seqid = lsp->ls_seqid; - memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); - arg.u.locku = &luargs; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_lock_seqid(status, lsp); - - if (status == 0) - memcpy(&lsp->ls_stateid, &res.u.stateid, - sizeof(lsp->ls_stateid)); + if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) + goto out_put; + err = -ENOMEM; + calldata = (struct nfs4_unlockdata *)kmalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + goto out_put; + calldata->arg.fh = NFS_FH(state->inode); + calldata->arg.type = nfs4_lck_type(cmd, fl); + calldata->arg.offset = fl->fl_start; + calldata->arg.length = nfs4_lck_length(fl); + calldata->res.server = NFS_SERVER(state->inode); + calldata->arg.u.locku = &calldata->luargs; + calldata->lsp = lsp; + calldata->state = state; + calldata->ctx = 
get_nfs_open_context((struct nfs_open_context*)fl->fl_file->private_data); + memcpy(&calldata->fl, fl, sizeof(calldata->fl)); + iosem_work_init(&calldata->worker, nfs4_do_unlock_func, calldata); + down_read(&state->owner->so_client->cl_sem); + iosem_lock_and_schedule_work(&state->lock_iosem, &calldata->worker); + /* Note: We do the VFS unlock now! */ + do_vfs_lock(fl->fl_file, fl); out: - up(&state->lock_sema); - if (status == 0) - do_vfs_lock(request->fl_file, request); - up_read(&clp->cl_sem); - return status; + return 0; +out_put: + nfs4_put_lock_state(lsp); + return err; } static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) @@ -2797,13 +2842,13 @@ static int _nfs4_do_setlk(struct nfs4_st largs.u.open_lock = &otl; largs.new_lock_owner = 1; arg.u.lock = &largs; - down(&owner->so_sema); + nfs_lock_state_owner(owner); otl.open_seqid = owner->so_seqid; status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); /* increment open_owner seqid on success, and * seqid mutating errors */ nfs4_increment_seqid(status, owner); - up(&owner->so_sema); + nfs_unlock_state_owner(owner); if (status == 0) { lsp->ls_flags |= NFS_LOCK_INITIALIZED; lsp->ls_seqid++; @@ -2863,11 +2908,11 @@ static int _nfs4_proc_setlk(struct nfs4_ int status; down_read(&clp->cl_sem); - down(&state->lock_sema); + iosem_lock(&state->lock_iosem); status = nfs4_set_lock_state(state, request); if (status == 0) status = _nfs4_do_setlk(state, cmd, request, 0); - up(&state->lock_sema); + iosem_unlock(&state->lock_iosem); if (status == 0) { /* Note: we always want to sleep here! 
*/ request->fl_flags |= FL_SLEEP; @@ -2903,6 +2948,9 @@ nfs4_proc_lock(struct file *filp, int cm ctx = (struct nfs_open_context *)filp->private_data; state = ctx->state; + if (state == NULL) + return -ENOLCK; + if (request->fl_start < 0 || request->fl_end < 0) return -EINVAL; @@ -2971,6 +3019,35 @@ ssize_t nfs4_listxattr(struct dentry *de return len; } +int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, + struct nfs4_fs_locations *fs_locations, struct page *page) +{ + struct nfs_server *server = NFS_SERVER(dir); + u32 bitmask[2] = { + [0] = server->attr_bitmask[0] | FATTR4_WORD0_FS_LOCATIONS, + [1] = server->attr_bitmask[1], + }; + struct nfs4_fs_locations_arg args = { + .dir_fh = NFS_FH(dir), + .name = &dentry->d_name, + .page = page, + .bitmask = bitmask, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = fs_locations, + }; + int status; + + dprintk("%s: start\n", __FUNCTION__); + fs_locations->fattr.valid = 0; + fs_locations->server = server; + status = rpc_call_sync(server->client, &msg, 0); + dprintk("%s: returned status = %d\n", __FUNCTION__, status); + return status; +} + struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, @@ -3022,8 +3099,8 @@ struct nfs_rpc_ops nfs_v4_clientops = { .read_setup = nfs4_proc_read_setup, .write_setup = nfs4_proc_write_setup, .commit_setup = nfs4_proc_commit_setup, - .file_open = nfs4_proc_file_open, - .file_release = nfs4_proc_file_release, + .file_open = nfs_open, + .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, }; Index: linux-2.6.13-rc1/fs/nfs/nfs4state.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs4state.c +++ linux-2.6.13-rc1/fs/nfs/nfs4state.c @@ -267,7 +267,7 @@ nfs4_alloc_state_owner(void) sp = kmalloc(sizeof(*sp),GFP_KERNEL); if (!sp) return 
NULL; - init_MUTEX(&sp->so_sema); + iosem_init(&sp->so_iosem); sp->so_seqid = 0; /* arbitrary */ INIT_LIST_HEAD(&sp->so_states); INIT_LIST_HEAD(&sp->so_delegations); @@ -359,7 +359,7 @@ nfs4_alloc_open_state(void) memset(state->stateid.data, 0, sizeof(state->stateid.data)); atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); - init_MUTEX(&state->lock_sema); + iosem_init(&state->lock_iosem); spin_lock_init(&state->state_lock); return state; } @@ -461,7 +461,7 @@ out: /* * Beware! Caller must be holding exactly one - * reference to clp->cl_sem and owner->so_sema! + * reference to clp->cl_sem and the state owner lock! */ void nfs4_put_open_state(struct nfs4_state *state) { @@ -480,20 +480,19 @@ void nfs4_put_open_state(struct nfs4_sta nfs4_put_state_owner(owner); } -/* - * Beware! Caller must be holding no references to clp->cl_sem! - * of owner->so_sema! - */ -void nfs4_close_state(struct nfs4_state *state, mode_t mode) +struct nfs4_close_state_args { + struct iosem_work work; + struct nfs4_state *state; + mode_t mode; +}; + +static inline void __nfs4_close_state(struct nfs4_state *state, mode_t mode) { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; struct nfs4_client *clp = owner->so_client; int newstate; - atomic_inc(&owner->so_count); - down_read(&clp->cl_sem); - down(&owner->so_sema); /* Protect against nfs4_find_state() */ spin_lock(&inode->i_lock); if (mode & FMODE_READ) @@ -520,11 +519,42 @@ void nfs4_close_state(struct nfs4_state } out: nfs4_put_open_state(state); - up(&owner->so_sema); + nfs_unlock_state_owner(owner); nfs4_put_state_owner(owner); up_read(&clp->cl_sem); } +static void nfs4_close_state_func(void *data) +{ + struct nfs4_close_state_args *args = (struct nfs4_close_state_args *)data; + + __nfs4_close_state(args->state, args->mode); + kfree(args); +} + +/* + * Beware! Caller must be holding no references to clp->cl_sem! + * or state owner lock! 
+ */ +void nfs4_close_state(struct nfs4_state *state, mode_t mode) +{ + struct nfs4_state_owner *owner = state->owner; + struct nfs4_client *clp = owner->so_client; + struct nfs4_close_state_args *args; + + args = kmalloc(sizeof(*args), GFP_NOFS); + if (args == NULL) { + printk("%s: failed. Out of memory\n", __FUNCTION__); + return; + } + atomic_inc(&owner->so_count); + down_read(&clp->cl_sem); + args->state = state; + args->mode = mode; + iosem_work_init(&args->work, nfs4_close_state_func, args); + iosem_lock_and_schedule_work(&owner->so_iosem, &args->work); +} + /* * Search the state->lock_states for an existing lock_owner * that is compatible with current->files @@ -542,11 +572,21 @@ __nfs4_find_lock_state(struct nfs4_state return NULL; } +struct nfs4_lock_state * +nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +{ + struct nfs4_lock_state *lsp; + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, fl_owner); + spin_unlock(&state->state_lock); + return lsp; +} + /* * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. * - * The caller must be holding state->lock_sema + * The caller must be holding state->lock_iosem */ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) { @@ -572,7 +612,7 @@ static struct nfs4_lock_state *nfs4_allo * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. 
* - * The caller must be holding state->lock_sema and clp->cl_sem + * The caller must be holding state->lock_iosem and clp->cl_sem */ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) { @@ -605,7 +645,7 @@ static struct nfs4_lock_state *nfs4_get_ * Release reference to lock_state, and free it if we see that * it is no longer in use */ -static void nfs4_put_lock_state(struct nfs4_lock_state *lsp) +void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { struct nfs4_state *state; @@ -674,7 +714,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst } /* -* Called with state->lock_sema and clp->cl_sem held. +* Called with state->lock_iosem and clp->cl_sem held. */ void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) { @@ -683,7 +723,7 @@ void nfs4_increment_lock_seqid(int statu } /* -* Called with sp->so_sema and clp->cl_sem held. +* Called with state owner lock and clp->cl_sem held. * * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - @@ -791,8 +831,11 @@ static int nfs4_reclaim_open_state(struc if (state->state == 0) continue; status = ops->recover_open(sp, state); - list_for_each_entry(lock, &state->lock_states, ls_locks) + /* Reset lockowner state */ + list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + lock->ls_seqid = 0; + } if (status >= 0) { status = nfs4_reclaim_locks(ops, state); if (status < 0) @@ -871,6 +914,8 @@ restart_loop: nfs_delegation_mark_reclaim(clp); /* Note: list is protected by exclusive lock on cl->cl_sem */ list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + /* Reset open owner state */ + sp->so_seqid = 0; status = nfs4_reclaim_open_state(ops, sp); if (status < 0) { if (status == -NFS4ERR_NO_GRACE) { Index: linux-2.6.13-rc1/fs/nfs/nfs4xdr.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/nfs4xdr.c +++ 
linux-2.6.13-rc1/fs/nfs/nfs4xdr.c @@ -379,6 +379,15 @@ static int nfs_stat_to_errno(int); #define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define NFS4_enc_fs_locations_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_fs_locations_sz \ + (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz) static struct { unsigned int mode; @@ -1901,6 +1910,38 @@ static int nfs4_xdr_enc_delegreturn(stru } /* + * Encode FS_LOCATIONS request + */ +static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 3, + }; + struct rpc_auth *auth = req->rq_task->tk_auth; + int replen; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) + goto out; + if ((status = encode_lookup(&xdr, args->name)) != 0) + goto out; + if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) + goto out; + /* set up reply + * toplevel_status + taglen + rescount + OP_PUTFH + status + * + OP_LOOKUP + status + OP_GETATTR + status = 7 + */ + replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; + xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, + 0, PAGE_SIZE); +out: + return status; +} + +/* * START OF "GENERIC" DECODE ROUTINES. 
* These may look a little ugly since they are imported from a "generic" * set of XDR encode/decode routines which are intended to be shared by @@ -1934,7 +1975,7 @@ static int nfs4_xdr_enc_delegreturn(stru } \ } while (0) -static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string) +static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { uint32_t *p; @@ -1985,7 +2026,7 @@ static int decode_op_hdr(struct xdr_stre static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) { uint32_t *p; - uint32_t strlen; + unsigned int strlen; char *str; READ_BUF(12); @@ -2115,7 +2156,7 @@ static int decode_attr_symlink_support(s return 0; } -static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid) +static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { uint32_t *p; @@ -2234,6 +2275,107 @@ static int decode_attr_files_total(struc return status; } +static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) +{ + int n; + uint32_t *p; + int status = 0; + + READ_BUF(4); + READ32(n); + if (n <= 0) + goto out_eio; + dprintk("path "); + path->ncomponents = 0; + while (path->ncomponents < n) { + struct nfs4_string *component = &path->components[path->ncomponents]; + status = decode_opaque_inline(xdr, &component->len, &component->data); + if (unlikely(status != 0)) + goto out_eio; + if (path->ncomponents != n) + dprintk("/"); + dprintk("%s", component->data); + if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) + path->ncomponents++; + else { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + } +out: + dprintk("\n"); + return status; +out_eio: + dprintk(" status %d", status); + status = -EIO; + goto out; +} + +static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) +{ + int n; + uint32_t *p; + int status = -EIO; + + if 
(unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) + goto out; + status = 0; + if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) + goto out; + dprintk("%s: fsroot ", __FUNCTION__); + status = decode_pathname(xdr, &res->fs_path); + if (unlikely(status != 0)) + goto out; + READ_BUF(4); + READ32(n); + if (n <= 0) + goto out_eio; + res->nlocations = 0; + while (res->nlocations < n) { + int m; + struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + + READ_BUF(4); + READ32(m); + if (m <= 0) + goto out_eio; + + loc->nservers = 0; + dprintk("%s: servers ", __FUNCTION__); + while (loc->nservers < m) { + struct nfs4_string *server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); + if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) + loc->nservers++; + else { + int i; + dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations); + for (i = loc->nservers; i < m; i++) { + int len; + char *data; + status = decode_opaque_inline(xdr, &len, &data); + if (unlikely(status != 0)) + goto out_eio; + } + } + } + status = decode_pathname(xdr, &loc->rootpath); + if (unlikely(status != 0)) + goto out_eio; + if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) + res->nlocations++; + } +out: + dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status); + return status; +out_eio: + status = -EIO; + goto out; +} + static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) { uint32_t *p; @@ -2764,10 +2906,14 @@ static int decode_getfattr(struct xdr_st goto xdr_error; if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) goto xdr_error; - if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0) + if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) goto xdr_error; if ((status = 
decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) goto xdr_error; + if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + struct nfs4_fs_locations, + fattr))) != 0) + goto xdr_error; if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) goto xdr_error; fattr->mode |= fmode; @@ -4047,6 +4193,29 @@ static int nfs4_xdr_dec_delegreturn(stru return status; } +/* + * FS_LOCATIONS request + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + if ((status = decode_putfh(&xdr)) != 0) + goto out; + if ((status = decode_lookup(&xdr)) != 0) + goto out; + xdr_enter_page(&xdr, PAGE_SIZE); + status = decode_getfattr(&xdr, &res->fattr, res->server); +out: + return status; +} + uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) { uint32_t bitmap[2] = {0}; @@ -4216,6 +4385,7 @@ struct rpc_procinfo nfs4_procedures[] = PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), }; struct rpc_version nfs_version4 = { Index: linux-2.6.13-rc1/fs/nfs/nfs_iostat.h =================================================================== --- /dev/null +++ linux-2.6.13-rc1/fs/nfs/nfs_iostat.h @@ -0,0 +1,87 @@ +#ifndef _NFS_IOSTAT +#define _NFS_IOSTAT + +#define NFS_IOSTAT_VERS "1.0" + +enum nfs_stat_bytecounters { + NFSIOS_NORMALREADBYTES = 0, + NFSIOS_NORMALWRITTENBYTES, + NFSIOS_DIRECTREADBYTES, + NFSIOS_DIRECTWRITTENBYTES, + NFSIOS_SERVERREADBYTES, + NFSIOS_SERVERWRITTENBYTES, + NFSIOS_WAITEVENTJIFFIES, + __NFSIOS_BYTESMAX, +}; + +enum nfs_stat_eventcounters { + NFSIOS_WAITEVENT = 0, + NFSIOS_INODEREVALIDATE, + NFSIOS_DENTRYREVALIDATE, + NFSIOS_DATAINVALIDATE, + 
NFSIOS_ATTRINVALIDATE, + NFSIOS_VFSOPEN, + NFSIOS_VFSLOOKUP, + NFSIOS_VFSACCESS, + NFSIOS_VFSREADPAGE, + NFSIOS_VFSREADPAGES, + NFSIOS_VFSWRITEPAGE, + NFSIOS_VFSWRITEPAGES, + NFSIOS_VFSGETDENTS, + NFSIOS_VFSFLUSH, + NFSIOS_VFSFSYNC, + NFSIOS_VFSLOCK, + NFSIOS_VFSRELEASE, + NFSIOS_SETATTRTRUNC, + NFSIOS_EXTENDWRITE, + NFSIOS_SILLYRENAME, + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + __NFSIOS_COUNTSMAX, +}; + +#ifdef __KERNEL__ + +#include +#include + +struct nfs_iostats { + unsigned long long bytes[__NFSIOS_BYTESMAX]; + unsigned long events[__NFSIOS_COUNTSMAX]; +} ____cacheline_aligned; + +static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->events[stat] ++; + put_cpu_no_resched(); +} + +static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +{ + struct nfs_iostats *iostats; + int cpu; + + cpu = get_cpu(); + iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); + iostats->bytes[stat] += addend; + put_cpu_no_resched(); +} + +static inline struct nfs_iostats *nfs_alloc_iostats(void) +{ + return alloc_percpu(struct nfs_iostats); +} + +static inline void nfs_free_iostats(struct nfs_iostats *stats) +{ + if (stats != NULL) + free_percpu(stats); +} + +#endif +#endif Index: linux-2.6.13-rc1/fs/nfs/pagelist.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/pagelist.c +++ linux-2.6.13-rc1/fs/nfs/pagelist.c @@ -17,6 +17,7 @@ #include #include #include +#include "nfs_iostat.h" #include #define NFS_PARANOIA 1 @@ -198,8 +199,10 @@ static int nfs_wait_bit_interruptible(vo int nfs_wait_on_request(struct nfs_page *req) { - struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->dentry->d_inode); + struct inode *inode = req->wb_context->dentry->d_inode; + struct rpc_clnt *clnt = NFS_CLIENT(inode); sigset_t oldmask; 
+ unsigned long start = jiffies; int ret = 0; if (!test_bit(PG_BUSY, &req->wb_flags)) @@ -211,6 +214,8 @@ nfs_wait_on_request(struct nfs_page *req rpc_clnt_sigmask(clnt, &oldmask); ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE); + nfs_add_stats(inode, NFSIOS_WAITEVENTJIFFIES, (jiffies - start)); + nfs_inc_stats(inode, NFSIOS_WAITEVENT); rpc_clnt_sigunmask(clnt, &oldmask); out: return ret; Index: linux-2.6.13-rc1/fs/nfs/proc.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/proc.c +++ linux-2.6.13-rc1/fs/nfs/proc.c @@ -214,7 +214,7 @@ static int nfs_proc_write(struct nfs_wri static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags) + int flags, struct nameidata *nd) { struct nfs_fh fhandle; struct nfs_fattr fattr; Index: linux-2.6.13-rc1/fs/nfs/read.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/read.c +++ linux-2.6.13-rc1/fs/nfs/read.c @@ -26,6 +26,7 @@ #include #include #include +#include "nfs_iostat.h" #include #include @@ -134,6 +135,8 @@ static int nfs_readpage_sync(struct nfs_ } count -= result; rdata->args.pgbase += result; + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, result); + /* Note: result == 0 should only happen if we're caching * a write that extends the file and punches a hole. */ @@ -460,8 +463,11 @@ void nfs_readpage_result(struct rpc_task dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", task->tk_pid, status); + nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count); + /* Is this a short read? */ if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { + nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? 
*/ if (resp->count != 0) { /* Yes, so retry the read at the end of the data */ @@ -491,6 +497,8 @@ int nfs_readpage(struct file *file, stru dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page->index); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); + /* * Try to flush any pending writes to the file.. * @@ -570,6 +578,7 @@ int nfs_readpages(struct file *filp, str inode->i_sb->s_id, (long long)NFS_FILEID(inode), nr_pages); + nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); if (filp == NULL) { desc.ctx = nfs_find_open_context(inode, FMODE_READ); Index: linux-2.6.13-rc1/fs/nfs/write.c =================================================================== --- linux-2.6.13-rc1.orig/fs/nfs/write.c +++ linux-2.6.13-rc1/fs/nfs/write.c @@ -57,6 +57,7 @@ #include #include +#include "nfs_iostat.h" #include #include #include @@ -122,6 +123,7 @@ static void nfs_grow_file(struct page *p end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); if (i_size >= end) return; + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); i_size_write(inode, end); } @@ -210,6 +212,7 @@ static int nfs_writepage_sync(struct nfs wdata->args.pgbase += result; written += result; count -= result; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, result); } while (count); /* Update file length */ nfs_grow_file(page, offset, written); @@ -268,6 +271,8 @@ int nfs_writepage(struct page *page, str int priority = wb_priority(wbc); int err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + /* * Note: We need to ensure that we have a reference to the inode * if we are to do asynchronous writes. 
If not, waiting @@ -335,6 +340,8 @@ int nfs_writepages(struct address_space struct inode *inode = mapping->host; int err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + err = generic_writepages(mapping, wbc); if (err) return err; @@ -1144,6 +1151,8 @@ void nfs_writeback_done(struct rpc_task dprintk("NFS: %4d nfs_writeback_done (status %d)\n", task->tk_pid, task->tk_status); + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (resp->verf->committed < argp->stable && task->tk_status >= 0) { /* We tried a write call, but the server did not @@ -1169,6 +1178,8 @@ void nfs_writeback_done(struct rpc_task if (task->tk_status >= 0 && resp->count < argp->count) { static unsigned long complain; + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + /* Has the server at least made some progress? */ if (resp->count != 0) { /* Was this an NFSv2 write or an NFSv3 stable write? */ Index: linux-2.6.13-rc1/fs/open.c =================================================================== --- linux-2.6.13-rc1.orig/fs/open.c +++ linux-2.6.13-rc1/fs/open.c @@ -764,13 +764,25 @@ struct file *filp_open(const char * file error = open_namei(filename, namei_flags, mode, &nd); if (!error) - return dentry_open(nd.dentry, nd.mnt, flags); + return nd_open_file(&nd, flags); return ERR_PTR(error); } EXPORT_SYMBOL(filp_open); +struct file *nd_open_file(struct nameidata *nd, int flags) +{ + struct file *filp; + + if ((nd->flags & LOOKUP_OPEN) && nd->intent.open.file != NULL) { + filp = nd->intent.open.file; + path_release(nd); + } else + filp = dentry_open(nd->dentry, nd->mnt, flags); + return filp; +} + struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) { struct file * f; Index: linux-2.6.13-rc1/fs/proc/base.c =================================================================== --- linux-2.6.13-rc1.orig/fs/proc/base.c +++ linux-2.6.13-rc1/fs/proc/base.c @@ -66,6 +66,7 @@ enum pid_directory_inos { 
PROC_TGID_STATM, PROC_TGID_MAPS, PROC_TGID_MOUNTS, + PROC_TGID_MOUNTSTATS, PROC_TGID_WCHAN, #ifdef CONFIG_SCHEDSTATS PROC_TGID_SCHEDSTAT, @@ -103,6 +104,7 @@ enum pid_directory_inos { PROC_TID_STATM, PROC_TID_MAPS, PROC_TID_MOUNTS, + PROC_TID_MOUNTSTATS, PROC_TID_WCHAN, #ifdef CONFIG_SCHEDSTATS PROC_TID_SCHEDSTAT, @@ -152,6 +154,7 @@ static struct pid_entry tgid_base_stuff[ E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO), + E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUGO), #ifdef CONFIG_SECURITY E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), #endif @@ -188,6 +191,7 @@ static struct pid_entry tid_base_stuff[] E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO), + E(PROC_TID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUGO), #ifdef CONFIG_SECURITY E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), #endif @@ -555,6 +559,38 @@ static struct file_operations proc_mount .release = mounts_release, }; +extern struct seq_operations mountstats_op; +static int mountstats_open(struct inode *inode, struct file *file) +{ + struct task_struct *task = proc_task(inode); + int ret = seq_open(file, &mountstats_op); + + if (!ret) { + struct seq_file *m = file->private_data; + struct namespace *namespace; + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + + if (namespace) + m->private = namespace; + else { + seq_release(inode, file); + ret = -EINVAL; + } + } + return ret; +} + +static struct file_operations proc_mountstats_operations = { + .open = mountstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, +}; + #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ static ssize_t proc_info_read(struct file * file, char __user * buf, @@ -1539,6 +1575,10 @@ static struct 
dentry *proc_pident_lookup case PROC_TGID_MOUNTS: inode->i_fop = &proc_mounts_operations; break; + case PROC_TID_MOUNTSTATS: + case PROC_TGID_MOUNTSTATS: + inode->i_fop = &proc_mountstats_operations; + break; #ifdef CONFIG_SECURITY case PROC_TID_ATTR: inode->i_nlink = 2; Index: linux-2.6.13-rc1/fs/super.c =================================================================== --- linux-2.6.13-rc1.orig/fs/super.c +++ linux-2.6.13-rc1/fs/super.c @@ -801,17 +801,13 @@ struct super_block *get_sb_single(struct EXPORT_SYMBOL(get_sb_single); struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct file_system_type *type = get_fs_type(fstype); struct super_block *sb = ERR_PTR(-ENOMEM); struct vfsmount *mnt; int error; char *secdata = NULL; - if (!type) - return ERR_PTR(-ENODEV); - mnt = alloc_vfsmnt(name); if (!mnt) goto out; @@ -843,7 +839,6 @@ do_kern_mount(const char *fstype, int fl mnt->mnt_namespace = current->namespace; up_write(&sb->s_umount); free_secdata(secdata); - put_filesystem(type); return mnt; out_sb: up_write(&sb->s_umount); @@ -854,10 +849,23 @@ out_free_secdata: out_mnt: free_vfsmnt(mnt); out: - put_filesystem(type); return (struct vfsmount *)sb; } +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); + put_filesystem(type); + return mnt; +} + EXPORT_SYMBOL_GPL(do_kern_mount); struct vfsmount *kern_mount(struct file_system_type *type) Index: linux-2.6.13-rc1/include/linux/fs.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/fs.h +++ linux-2.6.13-rc1/include/linux/fs.h @@ -1033,6 +1033,7 @@ struct super_operations { void 
(*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_stats)(struct seq_file *, struct vfsmount *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); Index: linux-2.6.13-rc1/include/linux/iosem.h =================================================================== --- /dev/null +++ linux-2.6.13-rc1/include/linux/iosem.h @@ -0,0 +1,110 @@ +/* + * include/linux/iosem.h + * + * Copyright (C) 2005 Trond Myklebust + * + * Definitions for iosems. These can act as mutexes, but unlike + * semaphores, their code is 100% arch-independent, and can therefore + * easily be expanded in order to provide for things like + * asynchronous I/O. + */ + +#ifndef __LINUX_SEM_LOCK_H +#define __LINUX_SEM_LOCK_H + +#ifdef __KERNEL__ +#include +#include + +/* + * struct iosem: iosem mutex + * state: bitmask - currently only signals whether or not an exclusive + * lock has been taken + * wait: FIFO wait queue + */ +struct iosem { +#define IOSEM_LOCK_EXCLUSIVE (31) +/* #define IOSEM_LOCK_SHARED (30) */ + unsigned long state; + wait_queue_head_t wait; +}; + + + +/* + * struct iosem_wait: acts as a request for a lock on the iosem + * lock: backpointer to the iosem + * wait: wait queue entry. note that the callback function + * defines what to do when the lock has been granted + */ +struct iosem_wait { + struct iosem *lock; + wait_queue_t wait; +}; + +/* + * struct iosem_work: used by asynchronous waiters. + * + * work: work to schedule once the iosem has been granted. The + * function containing the critical code that needs to + * run under the protection of the lock should be placed here. + * The same function is responsible for calling iosem_unlock() + * when done. 
+ * waiter: iosem waitqueue entry + */ +struct iosem_work { + struct work_struct work; + struct iosem_wait waiter; +}; + +/* + * Functions for synchronous i/o + */ + +/* Synchronously grab an iosem. + * These functions act in pretty much the same way down()/up() + * do for semaphores. + */ +extern void FASTCALL(iosem_lock(struct iosem *lk)); +extern void FASTCALL(iosem_unlock(struct iosem *lk)); + +/* + * Callback function to wake up the sleeping task once + * it has been granted an exclusive lock + */ +extern int iosem_lock_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); + +/* Initialize a struct iosem in the "unlocked" state */ +static inline void iosem_init(struct iosem *lk) +{ + lk->state = 0; + init_waitqueue_head(&lk->wait); +} + +/* Initializes a lock request */ +static inline void iosem_waiter_init(struct iosem_wait *waiter) +{ + waiter->lock = NULL; + init_waitqueue_entry(&waiter->wait, current); + INIT_LIST_HEAD(&waiter->wait.task_list); +} + +/* + * Functions for asynchronous I/O. + */ + +/* Requests an exclusive lock on the iosem on behalf of a workqueue entry "wk". + * Schedule wk->work for execution as soon as the lock is granted. 
*/ +extern int FASTCALL(iosem_lock_and_schedule_work(struct iosem *lk, struct iosem_work *wk)); + +/* Waitqueue notifier that schedules work once the exclusive lock has + * been granted */ +extern int iosem_lock_and_schedule_function(wait_queue_t *wait, unsigned mode, int sync, void *key); + +static inline void iosem_work_init(struct iosem_work *wk, void (*func)(void *), void *data) +{ + INIT_WORK(&wk->work, func, data); +} + +#endif /* __KERNEL__ */ +#endif /* __LINUX_SEM_LOCK_H */ Index: linux-2.6.13-rc1/include/linux/mount.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/mount.h +++ linux-2.6.13-rc1/include/linux/mount.h @@ -68,6 +68,11 @@ extern struct vfsmount *alloc_vfsmnt(con extern struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data); +struct file_system_type; +extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data); + struct nameidata; extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, Index: linux-2.6.13-rc1/include/linux/namei.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/namei.h +++ linux-2.6.13-rc1/include/linux/namei.h @@ -8,6 +8,7 @@ struct vfsmount; struct open_intent { int flags; int create_mode; + struct file *file; }; enum { MAX_NESTED_LINKS = 5 }; @@ -64,6 +65,15 @@ extern int FASTCALL(path_walk(const char extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); extern void path_release(struct nameidata *); extern void path_release_on_umount(struct nameidata *); +extern void path_release_open_intent(struct nameidata *); + +extern struct file *nd_open_file(struct nameidata *nd, int flags); +static inline void nd_init_open_intent(struct nameidata *nd, int flags, int mode) +{ + nd->intent.open.flags = flags; + nd->intent.open.create_mode = mode; + nd->intent.open.file = NULL; +} extern struct 
dentry * lookup_one_len(const char *, struct dentry *, int); extern struct dentry * lookup_hash(struct qstr *, struct dentry *); Index: linux-2.6.13-rc1/include/linux/nfs4.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/nfs4.h +++ linux-2.6.13-rc1/include/linux/nfs4.h @@ -384,6 +384,7 @@ enum { NFSPROC4_CLNT_DELEGRETURN, NFSPROC4_CLNT_GETACL, NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, }; #endif Index: linux-2.6.13-rc1/include/linux/nfs_fs.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/nfs_fs.h +++ linux-2.6.13-rc1/include/linux/nfs_fs.h @@ -16,8 +16,6 @@ #include #include -#include - #include #include #include @@ -27,6 +25,9 @@ #include #include #include + +#include + #include #include @@ -302,6 +303,12 @@ extern void put_nfs_open_context(struct extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode); extern void nfs_file_clear_open_context(struct file *filp); +extern struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr); +extern int nfs_try_migrate_inode(struct inode *dir, struct dentry *parent); +extern int nfs_try_migrate_filehandle(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr, uint32_t generation); /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ extern u32 root_nfs_parse_addr(char *name); /*__init*/ @@ -371,6 +378,13 @@ extern int nfs_instantiate(struct dentry extern struct inode_operations nfs_symlink_inode_operations; /* + * linux/fs/nfs/namespace.c + */ +extern struct inode_operations nfs_mountpoint_inode_operations; +extern int nfs_mountpoint_expiry_timeout; +extern void nfs_release_automount_timer(void); + +/* * linux/fs/nfs/unlink.c */ extern int nfs_async_unlink(struct dentry *); Index: linux-2.6.13-rc1/include/linux/nfs_fs_sb.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/nfs_fs_sb.h +++ linux-2.6.13-rc1/include/linux/nfs_fs_sb.h @@ -4,6 +4,8 @@ #include #include +struct nfs_iostats; + /* * NFS client parameters stored in the superblock. */ @@ -12,6 +14,7 @@ struct nfs_server { struct rpc_clnt * client_sys; /* 2nd handle for FSINFO */ struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ + struct nfs_iostats * io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; int flags; /* various flags */ unsigned int caps; /* server capabilities */ @@ -26,10 +29,14 @@ struct nfs_server { unsigned int acregmax; unsigned int acdirmin; unsigned int acdirmax; + unsigned long retrans_timeo; /* retransmit timeout */ + unsigned int retrans_count; /* number of retransmit tries */ unsigned int namelen; char * hostname; /* remote hostname */ struct nfs_fh fh; struct sockaddr_in addr; + struct nfs_fsid fsid; + uint32_t generation; #ifdef CONFIG_NFS_V4 /* Our own IP address, as a null-terminated string. * This is used to generate the clientid, and the callback address. 
Index: linux-2.6.13-rc1/include/linux/nfs_page.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/nfs_page.h +++ linux-2.6.13-rc1/include/linux/nfs_page.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include Index: linux-2.6.13-rc1/include/linux/nfs_xdr.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/nfs_xdr.h +++ linux-2.6.13-rc1/include/linux/nfs_xdr.h @@ -4,11 +4,19 @@ #include #include -struct nfs4_fsid { - __u64 major; - __u64 minor; +struct nfs_fsid { + uint64_t major; + uint64_t minor; }; +/* + * Helper for checking equality between 2 fsids. + */ +static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid *b) +{ + return a->major == b->major && a->minor == b->minor; +} + struct nfs_fattr { unsigned short valid; /* which fields are valid */ __u64 pre_size; /* pre_op_attr.size */ @@ -30,10 +38,7 @@ struct nfs_fattr { } nfs3; } du; dev_t rdev; - union { - __u64 nfs3; /* also nfs2 */ - struct nfs4_fsid nfs4; - } fsid_u; + struct nfs_fsid fsid; __u64 fileid; struct timespec atime; struct timespec mtime; @@ -657,6 +662,40 @@ struct nfs4_server_caps_res { u32 has_symlinks; }; +struct nfs4_string { + unsigned int len; + char *data; +}; + +#define NFS4_PATHNAME_MAXCOMPONENTS 512 +struct nfs4_pathname { + unsigned int ncomponents; + struct nfs4_string components[NFS4_PATHNAME_MAXCOMPONENTS]; +}; + +#define NFS4_FS_LOCATION_MAXSERVERS 10 +struct nfs4_fs_location { + unsigned int nservers; + struct nfs4_string servers[NFS4_FS_LOCATION_MAXSERVERS]; + struct nfs4_pathname rootpath; +}; + +#define NFS4_FS_LOCATIONS_MAXENTRIES 10 +struct nfs4_fs_locations { + struct nfs_fattr fattr; + const struct nfs_server *server; + struct nfs4_pathname fs_path; + int nlocations; + struct nfs4_fs_location locations[NFS4_FS_LOCATIONS_MAXENTRIES]; +}; + +struct nfs4_fs_locations_arg { + const struct nfs_fh *dir_fh; + const struct 
qstr *name; + struct page *page; + const u32 *bitmask; +}; + #endif /* CONFIG_NFS_V4 */ struct nfs_page; @@ -722,7 +761,7 @@ struct nfs_rpc_ops { int (*write) (struct nfs_write_data *); int (*commit) (struct nfs_write_data *); int (*create) (struct inode *, struct dentry *, - struct iattr *, int); + struct iattr *, int, struct nameidata *); int (*remove) (struct inode *, struct qstr *); int (*unlink_setup) (struct rpc_message *, struct dentry *, struct qstr *); Index: linux-2.6.13-rc1/include/linux/sunrpc/auth_gss.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/sunrpc/auth_gss.h +++ linux-2.6.13-rc1/include/linux/sunrpc/auth_gss.h @@ -75,14 +75,17 @@ struct gss_cl_ctx { struct xdr_netobj gc_wire_ctx; u32 gc_win; unsigned long gc_expiry; + char gc_principal[0]; }; struct gss_upcall_msg; +struct key; struct gss_cred { struct rpc_cred gc_base; enum rpc_gss_svc gc_service; struct gss_cl_ctx *gc_ctx; struct gss_upcall_msg *gc_upcall; + struct key *gc_key; }; #define gc_uid gc_base.cr_uid Index: linux-2.6.13-rc1/include/linux/sunrpc/clnt.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/sunrpc/clnt.h +++ linux-2.6.13-rc1/include/linux/sunrpc/clnt.h @@ -9,12 +9,9 @@ #ifndef _LINUX_SUNRPC_CLNT_H #define _LINUX_SUNRPC_CLNT_H -#include -#include -#include #include +#include #include -#include #include #include @@ -66,7 +63,6 @@ struct rpc_clnt { struct rpc_portmap cl_pmap_default; char cl_inline_name[32]; }; -#define cl_timeout cl_xprt->timeout #define cl_prog cl_pmap->pm_prog #define cl_vers cl_pmap->pm_vers #define cl_port cl_pmap->pm_port @@ -103,7 +99,6 @@ struct rpc_procinfo { unsigned int p_timer; /* Which RTT timer to use */ }; -#define RPC_CONGESTED(clnt) (RPCXPRT_CONGESTED((clnt)->cl_xprt)) #define RPC_PEERADDR(clnt) (&(clnt)->cl_xprt->addr) #ifdef __KERNEL__ @@ -136,6 +131,10 @@ void rpc_setbufsize(struct rpc_clnt *, size_t 
rpc_max_payload(struct rpc_clnt *); int rpc_ping(struct rpc_clnt *clnt, int flags); +struct rpc_xprt *rpc_client_get_xprt(struct rpc_clnt *clnt); +void rpc_client_set_xprt(struct rpc_clnt *, struct rpc_xprt *); +void rpc_put_xprt(struct rpc_xprt *xprt); + static __inline__ int rpc_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) { Index: linux-2.6.13-rc1/include/linux/sunrpc/sched.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/sunrpc/sched.h +++ linux-2.6.13-rc1/include/linux/sunrpc/sched.h @@ -43,6 +43,7 @@ struct rpc_task { #endif struct list_head tk_task; /* global list of tasks */ struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_xprt * tk_xprt; /* RPC transport */ struct rpc_rqst * tk_rqstp; /* RPC request */ int tk_status; /* result of last operation */ @@ -93,7 +94,6 @@ struct rpc_task { #endif }; #define tk_auth tk_client->cl_auth -#define tk_xprt tk_client->cl_xprt /* support walking a list of tasks on a wait queue */ #define task_for_each(task, pos, head) \ Index: linux-2.6.13-rc1/include/linux/sunrpc/xdr.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/sunrpc/xdr.h +++ linux-2.6.13-rc1/include/linux/sunrpc/xdr.h @@ -203,6 +203,7 @@ extern void xdr_write_pages(struct xdr_s extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p); extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); +extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); #endif /* __KERNEL__ */ Index: linux-2.6.13-rc1/include/linux/sunrpc/xprt.h =================================================================== --- linux-2.6.13-rc1.orig/include/linux/sunrpc/xprt.h +++ linux-2.6.13-rc1/include/linux/sunrpc/xprt.h @@ -9,7 +9,6 @@ #ifndef _LINUX_SUNRPC_XPRT_H #define _LINUX_SUNRPC_XPRT_H -#include #include
#include #include @@ -127,6 +126,7 @@ struct rpc_rqst { #define XPRT_COPY_DATA (1 << 3) struct rpc_xprt { + atomic_t count; /* Reference counter */ struct socket * sock; /* BSD socket layer */ struct sock * inet; /* INET layer */ Index: linux-2.6.13-rc1/lib/Makefile =================================================================== --- linux-2.6.13-rc1.orig/lib/Makefile +++ linux-2.6.13-rc1/lib/Makefile @@ -9,7 +9,7 @@ lib-y := errno.o ctype.o string.o vsprin lib-y += kobject.o kref.o kobject_uevent.o klist.o -obj-y += sort.o parser.o +obj-y += sort.o parser.o iosem.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG Index: linux-2.6.13-rc1/lib/iosem.c =================================================================== --- /dev/null +++ linux-2.6.13-rc1/lib/iosem.c @@ -0,0 +1,177 @@ +/* + * linux/lib/iosem.c + * + * Copyright (C) 2005 Trond Myklebust + * + * A set of primitives for semaphore-like locks that also support notification + * callbacks for waiters. + */ +#include +#include +#include +#include +#include + +/* + * Common function for requesting an exclusive lock on an iosem + * + * Note: should be called while holding the non-irqsafe spinlock + * lk->wait.lock. The spinlock is non-irqsafe as we have no reason (yet) to + * expect anyone to take/release iosems from within an interrupt + * context (and 'cos it is a _bug_ to attempt to wake up the waitqueue + * lk->wait using anything other than iosem_unlock()). 
+ */ +static inline int __iosem_lock(struct iosem *lk, struct iosem_wait *waiter) +{ + int ret; + + if (lk->state != 0) { + /* The lock cannot be immediately granted: queue waiter */ + waiter->lock = lk; + add_wait_queue_exclusive_locked(&lk->wait, &waiter->wait); + ret = -EINPROGRESS; + } else { + lk->state |= 1 << IOSEM_LOCK_EXCLUSIVE; + ret = 0; + } + return ret; +} + +/** + * iosem_unlock - release an exclusive lock + * @iosem - the iosem on which we hold an exclusive lock + */ +void fastcall iosem_unlock(struct iosem *lk) +{ + spin_lock(&lk->wait.lock); + lk->state &= ~(1 << IOSEM_LOCK_EXCLUSIVE); + wake_up_locked(&lk->wait); + spin_unlock(&lk->wait.lock); +} +EXPORT_SYMBOL(iosem_unlock); + +/** + * iosem_lock_wake_function - take an exclusive lock and wake up sleeping task + * @wait: waitqueue entry. Must be part of an initialized struct iosem_wait + * @mode: + * @sync: + * @key: + * + * Standard wait_queue_func_t callback function used by iosem_lock(). When + * called, it will attempt to wake up the sleeping task, and set an + * exclusive lock on the iosem. + * On success, @wait is automatically removed from the iosem's waitqueue, + * and a non-zero value is returned. + * + * This function will in practice *always* be called from within iosem_unlock() + */ +int iosem_lock_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct iosem_wait *waiter = container_of(wait, struct iosem_wait, wait); + unsigned long *lk_state = &waiter->lock->state; + int ret = 0; + + if (*lk_state == 0) { + ret = default_wake_function(wait, mode, sync, key); + if (ret) { + *lk_state |= 1 << IOSEM_LOCK_EXCLUSIVE; + list_del_init(&wait->task_list); + } + } + return ret; +} + +/** + * iosem_lock - synchronously take an exclusive lock + * @iosem - the iosem to take an exclusive lock + * + * If the exclusive lock cannot be immediately granted, put the current task + * to uninterruptible sleep until it can. 
+ */ +void fastcall iosem_lock(struct iosem *lk) +{ + struct iosem_wait waiter; + + might_sleep(); + + iosem_waiter_init(&waiter); + waiter.wait.func = iosem_lock_wake_function; + + spin_lock(&lk->wait.lock); + if (__iosem_lock(lk, &waiter) != 0) { + /* Must wait for lock... */ + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (list_empty(&waiter.wait.task_list)) + break; + spin_unlock(&lk->wait.lock); + schedule(); + spin_lock(&lk->wait.lock); + } + __set_current_state(TASK_RUNNING); + } + spin_unlock(&lk->wait.lock); +} +EXPORT_SYMBOL(iosem_lock); + +/** + * iosem_lock_and_schedule_function - take an exclusive lock and schedule work + * @wait: waitqueue entry. Must be part of an initialized struct iosem_work + * @mode: unused + * @sync: unused + * @key: unused + * + * Standard wait_queue_func_t callback function used by + * iosem_lock_and_schedule_work. When called, it will attempt to queue the + * work function and set the exclusive lock on the iosem. + * On success, @wait is removed from the iosem's waitqueue, and a non-zero + * value is returned. + * + * This function will in practice *always* be called from within iosem_unlock() + */ +int iosem_lock_and_schedule_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct iosem_wait *waiter = container_of(wait, struct iosem_wait, wait); + struct iosem_work *wk = container_of(waiter, struct iosem_work, waiter); + unsigned long *lk_state = &waiter->lock->state; + int ret = 0; + + if (*lk_state == 0) { + ret = schedule_work(&wk->work); + if (ret) { + *lk_state |= 1 << IOSEM_LOCK_EXCLUSIVE; + list_del_init(&wait->task_list); + } + } + return ret; +} + +/** + * iosem_lock_and_schedule_work - request an exclusive lock and schedule work + * @lk: pointer to iosem + * @wk: pointer to iosem_work + * + * Request an exclusive lock on the iosem. 
If the lock cannot be immediately + * granted, place wk->waiter on the iosem's waitqueue, and return, else + * immediately queue the work function wk->work. + * + * Once the exclusive lock has been granted, the work function described by + * wk->work is queued in keventd. It is then the responsibility of that work + * function to release the exclusive lock once it has been granted. + * + * returns -EINPROGRESS if the lock could not be immediately granted. + */ +int fastcall iosem_lock_and_schedule_work(struct iosem *lk, struct iosem_work *wk) +{ + int ret; + + iosem_waiter_init(&wk->waiter); + wk->waiter.wait.func = iosem_lock_and_schedule_function; + spin_lock(&lk->wait.lock); + ret = __iosem_lock(lk, &wk->waiter); + spin_unlock(&lk->wait.lock); + if (ret == 0) + ret = schedule_work(&wk->work); + return ret; +} +EXPORT_SYMBOL(iosem_lock_and_schedule_work); Index: linux-2.6.13-rc1/net/sunrpc/auth_gss/auth_gss.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/auth_gss/auth_gss.c +++ linux-2.6.13-rc1/net/sunrpc/auth_gss/auth_gss.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,18 @@ #include #include +#ifdef CONFIG_RPCSEC_GSS_KEYRING + +#include + +#else + +#define gss_key_lookup_cred(auth) ERR_PTR(-ENOKEY) +#define gss_register_keytype() (0) +#define gss_unregister_keytype() do { } while(0) + +#endif + static struct rpc_authops authgss_ops; static struct rpc_credops gss_credops; @@ -83,6 +96,7 @@ static struct rpc_credops gss_credops; static DEFINE_RWLOCK(gss_ctx_lock); struct gss_auth { + spinlock_t lock; struct rpc_auth rpc_auth; struct gss_api_mech *mech; enum rpc_gss_svc service; @@ -90,7 +104,7 @@ struct gss_auth { struct rpc_clnt *client; struct dentry *dentry; char path[48]; - spinlock_t lock; + char key_name[256]; }; static void gss_destroy_ctx(struct gss_cl_ctx *); @@ -221,17 +235,19 @@ gss_cred_get_ctx(struct rpc_cred *cred) } static struct gss_cl_ctx * 
-gss_alloc_context(void) +gss_alloc_context(const char *principal) { struct gss_cl_ctx *ctx; + size_t len = strlen(principal) + 1; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(*ctx) + len, GFP_KERNEL); if (ctx != NULL) { memset(ctx, 0, sizeof(*ctx)); ctx->gc_proc = RPC_GSS_PROC_DATA; ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ spin_lock_init(&ctx->gc_seq_lock); atomic_set(&ctx->count,1); + memcpy(ctx->gc_principal, principal, len); } return ctx; } @@ -535,6 +551,7 @@ gss_pipe_downcall(struct file *filp, con struct rpc_cred *cred; struct gss_upcall_msg *gss_msg; struct gss_cl_ctx *ctx; + char principal[32]; uid_t uid; int err = -EFBIG; @@ -558,7 +575,8 @@ gss_pipe_downcall(struct file *filp, con } err = -ENOMEM; - ctx = gss_alloc_context(); + snprintf(principal, sizeof(principal), "%u@%s", uid, clnt->cl_server); + ctx = gss_alloc_context(principal); if (ctx == NULL) goto err; err = 0; @@ -692,6 +710,13 @@ gss_create(struct rpc_clnt *clnt, rpc_au if (err) goto err_put_mech; + snprintf(gss_auth->key_name, sizeof(gss_auth->key_name), + "mechanism=\"%s\" service=\"%s%u\" host=\"%s\"", + gss_auth->mech->gm_pfs[gss_auth->service-RPC_GSS_SVC_NONE].name, + clnt->cl_protname, + clnt->cl_vers, + clnt->cl_server); + snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", clnt->cl_pathname, gss_auth->mech->gm_name); @@ -743,6 +768,23 @@ gss_destroy_ctx(struct gss_cl_ctx *ctx) kfree(ctx); } +static inline struct gss_cred * +gss_alloc_cred(struct gss_auth *gss_auth) +{ + struct gss_cred *cred; + + dprintk("RPC: gss_alloc_cred \n"); + + cred = kmalloc(sizeof(*cred), GFP_KERNEL); + if (cred != NULL) { + memset(cred, 0, sizeof(*cred)); + atomic_set(&cred->gc_count, 1); + cred->gc_base.cr_ops = &gss_credops; + cred->gc_service = gss_auth->service; + } + return cred; +} + static void gss_destroy_cred(struct rpc_cred *rc) { @@ -752,16 +794,266 @@ gss_destroy_cred(struct rpc_cred *rc) if (cred->gc_ctx) gss_put_ctx(cred->gc_ctx); + if 
(cred->gc_key) + key_put(cred->gc_key); kfree(cred); } +#ifdef CONFIG_RPCSEC_GSS_KEYRING +static inline const void * +simple_skip_bytes(const void *p, const void *end, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + return q; +} + + +static struct gss_cl_ctx * +gss_key_read_context(const void *p, const void *end) /* parse "mechanism\0principal\0context-data" from [p, end); returns a new context or an ERR_PTR */ +{ + struct gss_cl_ctx *ctx; + struct gss_api_mech *mech; + size_t len, maxlen; + + /* First up should be the name of the mechanism */ + maxlen = end - p; + len = strnlen((const char *)p, maxlen); + if (len == maxlen) + return ERR_PTR(-EFAULT); + /* find the mechanism */ + mech = gss_mech_get_by_name((const char *)p); + if (mech == NULL) + return ERR_PTR(-EINVAL); + p = simple_skip_bytes(p, end, len + 1); + if (IS_ERR(p)) + goto err_put_mech; + /* Next we want the name of the principal */ + maxlen = end - p; + len = strnlen((const char *)p, maxlen); + if (len == maxlen) { + p = ERR_PTR(-EFAULT); + goto err_put_mech; + } + ctx = gss_alloc_context((const char *)p); /* returns NULL if the allocation fails */ + p = ctx ? simple_skip_bytes(p, end, len + 1) : ERR_PTR(-ENOMEM); /* never hand a NULL ctx to gss_fill_context() */ + if (IS_ERR(p)) /* ctx may be NULL here; kfree(NULL) is a no-op */ + goto err_free_ctx; + /* Now read in context */ + p = gss_fill_context(p, end, ctx, mech); + if (IS_ERR(p)) + goto err_free_ctx; + return ctx; /* the mechanism reference taken above is kept on success */ +err_free_ctx: + kfree(ctx); +err_put_mech: + gss_mech_put(mech); + return (struct gss_cl_ctx *)p; +} + +static int +gss_key_instantiate(struct key *key, const void *p, size_t buflen) +{ + const void *end = (const void *)((const char *)p + buflen); + struct gss_cl_ctx *ctx; + + ctx = gss_key_read_context(p, end); + if (IS_ERR(ctx)) + goto err; + write_lock(&key->lock); + key->payload.data = ctx; + key->expiry = get_seconds() + (ctx->gc_expiry - jiffies)/HZ; + write_unlock(&key->lock); + return 0; +err: + return PTR_ERR(ctx); +} + +static int +gss_key_duplicate(struct key *key, const struct key *source) +{ + struct gss_cl_ctx *ctx = (struct gss_cl_ctx *)source->payload.data; + + if (ctx != NULL) { +
gss_mech_get(ctx->gc_gss_ctx->mech_type); + write_lock(&key->lock); + key->payload.data = gss_get_ctx(ctx); + key->expiry = source->expiry; + write_unlock(&key->lock); + } + return 0; +} + +static int +gss_key_update(struct key *key, const void *p, size_t buflen) /* key_type .update op: replace the payload with a context parsed from [p, p + buflen) */ +{ + const void *end = (const void *)((const char *)p + buflen); + struct gss_cl_ctx *ctx, *old; + + ctx = gss_key_read_context(p, end); + if (IS_ERR(ctx)) + goto err; + write_lock(&key->lock); + old = (struct gss_cl_ctx *) key->payload.data; + key->payload.data = ctx; + key->expiry = get_seconds() + (ctx->gc_expiry - jiffies)/HZ; + write_unlock(&key->lock); + if (old) /* release the displaced context; the new one now belongs to the key */ + gss_put_ctx(old); /* NOTE(review): gss_key_destroy() also puts the mech reference - confirm whether that is needed here too */ + return 0; +err: + return PTR_ERR(ctx); +} + +static int +gss_key_match(const struct key *key, const void *description) +{ + return key->description != NULL && + strcmp(key->description, description) == 0; +} + +static void +gss_key_destroy(struct key *key) +{ + struct gss_cl_ctx *ctx = (struct gss_cl_ctx *)key->payload.data; + if (ctx != NULL) { + struct gss_api_mech *mech = ctx->gc_gss_ctx->mech_type; + gss_put_ctx(ctx); + gss_mech_put(mech); + } +} + +static void +gss_key_describe(const struct key *key, struct seq_file *m) +{ + struct gss_cl_ctx *ctx = NULL; + + seq_puts(m, key->description); + + if (key->payload.data) + ctx = gss_get_ctx((struct gss_cl_ctx *)key->payload.data); + if (ctx != NULL) { + seq_printf(m, ": %s", ctx->gc_principal); + gss_put_ctx(ctx); + } else + seq_printf(m, ": "); +} + +static struct key_type key_type_rpcsec_context = { + .name = "rpcsec_gss context", + .def_datalen = sizeof(struct gss_cl_ctx) + sizeof(struct gss_ctx), + .instantiate = gss_key_instantiate, + .duplicate = gss_key_duplicate, + .update = gss_key_update, + .match = gss_key_match, + .destroy = gss_key_destroy, + .describe = gss_key_describe, +}; + +static struct key * +gss_request_key(struct gss_auth *gss_auth) +{ + struct key *key; + struct rpc_clnt *clnt = gss_auth->client; + char args[384]; + + snprintf(args, sizeof(args),
"%s ip=\"%u.%u.%u.%u\" port=\"%u\" proto=\"%s\"", + gss_auth->key_name, + NIPQUAD(clnt->cl_xprt->addr.sin_addr.s_addr), + clnt->cl_port, + clnt->cl_prot == IPPROTO_TCP ? "tcp" : "udp"); + dprintk("%s: requesting key %s with args %s\n", __FUNCTION__, + gss_auth->key_name, args); + + key = request_key(&key_type_rpcsec_context, gss_auth->key_name, args); + if (IS_ERR(key)) + goto out_err; + dprintk("%s: returned success\n", __FUNCTION__); + return key; +out_err: + dprintk("%s: returned error %ld\n", __FUNCTION__, -PTR_ERR(key)); + return key; +} + + +static inline struct gss_cl_ctx * +gss_key_lookup_context(struct key *key) +{ + struct gss_cl_ctx *ctx = ERR_PTR(-ENOKEY); + + read_lock(&key->lock); + if (key->payload.data != NULL) + ctx = gss_get_ctx((struct gss_cl_ctx *)key->payload.data); + read_unlock(&key->lock); + return ctx; +} + +static inline struct rpc_cred * +gss_key_lookup_cred(struct rpc_auth *auth) +{ + struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); + struct gss_cred *gss_cred; + struct gss_cl_ctx *ctx; + struct key *key; + void *err; + + err = key = gss_request_key(gss_auth); + if (IS_ERR(key)) + goto out_no_key; + err = ctx = gss_key_lookup_context(key); + if (IS_ERR(ctx)) + goto out_put_key; + gss_cred = gss_alloc_cred(gss_auth); + if (gss_cred == NULL) + goto out_no_cred; + gss_cred_set_ctx(&gss_cred->gc_base, ctx); + gss_cred->gc_key = key; + return &gss_cred->gc_base; +out_no_cred: + err = ERR_PTR(-ENOMEM); +out_put_key: + key_put(key); +out_no_key: + return (struct rpc_cred *)err; +} + +static inline int +gss_register_keytype(void) +{ + return register_key_type(&key_type_rpcsec_context); +} + +static inline void +gss_unregister_keytype(void) +{ + unregister_key_type(&key_type_rpcsec_context); +} +#endif + /* * Lookup RPCSEC_GSS cred for the current process */ static struct rpc_cred * gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) { - return rpcauth_lookup_credcache(auth, acred, taskflags); 
+ struct rpc_cred *cred; + + /* Try to use the keyring upcall first */ + cred = gss_key_lookup_cred(auth); + if (!IS_ERR(cred)) + goto out; + switch (PTR_ERR(cred)) { + case -EKEYREVOKED: + case -EKEYEXPIRED: + /* Translate into EACCES */ + cred = ERR_PTR(-EACCES); + break; + case -ENOKEY: + cred = rpcauth_lookup_credcache(auth, acred, taskflags); + }; +out: + return cred; } static struct rpc_cred * @@ -774,25 +1066,14 @@ gss_create_cred(struct rpc_auth *auth, s dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", acred->uid, auth->au_flavor); - if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL))) + cred = gss_alloc_cred(gss_auth); + if (cred == NULL) goto out_err; - - memset(cred, 0, sizeof(*cred)); - atomic_set(&cred->gc_count, 1); cred->gc_uid = acred->uid; - /* - * Note: in order to force a call to call_refresh(), we deliberately - * fail to flag the credential as RPCAUTH_CRED_UPTODATE. - */ - cred->gc_flags = 0; - cred->gc_base.cr_ops = &gss_credops; - cred->gc_service = gss_auth->service; err = gss_create_upcall(gss_auth, cred); if (err < 0) goto out_err; - return &cred->gc_base; - out_err: dprintk("RPC: gss_create_cred failed with error %d\n", err); if (cred) gss_destroy_cred(&cred->gc_base); @@ -1138,7 +1419,12 @@ static int __init init_rpcsec_gss(void) err = gss_svc_init(); if (err) goto out_unregister; + err = gss_register_keytype(); + if (err) + goto out_shutdown_svc; return 0; +out_shutdown_svc: + gss_svc_shutdown(); out_unregister: rpcauth_unregister(&authgss_ops); out: @@ -1147,6 +1433,7 @@ out: static void __exit exit_rpcsec_gss(void) { + gss_unregister_keytype(); gss_svc_shutdown(); rpcauth_unregister(&authgss_ops); } Index: linux-2.6.13-rc1/net/sunrpc/clnt.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/clnt.c +++ linux-2.6.13-rc1/net/sunrpc/clnt.c @@ -180,7 +180,7 @@ out_no_path: kfree(clnt->cl_server); kfree(clnt); out_err: - xprt_destroy(xprt); + rpc_put_xprt(xprt); return ERR_PTR(err); 
} @@ -238,7 +238,8 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; - rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); + new->cl_xprt = rpc_client_get_xprt(clnt); + rpc_init_rtt(&new->cl_rtt_default, new->cl_xprt->timeout.to_initval); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); new->cl_pmap = &new->cl_pmap_default; @@ -298,16 +299,16 @@ rpc_destroy_client(struct rpc_clnt *clnt rpcauth_destroy(clnt->cl_auth); clnt->cl_auth = NULL; } + if (clnt->cl_xprt) { + rpc_put_xprt(clnt->cl_xprt); + clnt->cl_xprt = NULL; + } if (clnt->cl_parent != clnt) { rpc_destroy_client(clnt->cl_parent); goto out_free; } if (clnt->cl_pathname[0]) rpc_rmdir(clnt->cl_pathname); - if (clnt->cl_xprt) { - xprt_destroy(clnt->cl_xprt); - clnt->cl_xprt = NULL; - } if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); out_free: @@ -453,6 +454,41 @@ out: return status; } +/** + * rpc_client_get_xprt() - Get reference to the RPC transport struct + * @clnt - pointer to RPC client + */ +struct rpc_xprt *rpc_client_get_xprt(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + + /* Synchronize w.r.t. rpc_client_set_xprt() */ + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + atomic_inc(&xprt->count); + rcu_read_unlock(); + return xprt; +} + +/** + * rpc_client_set_xprt() - Change the transport struct pointer on an in-use RPC client + * @clnt - pointer to RPC client + * @xprt - new transport + * + * This function should be called VERY infrequently, and is designed + * to be called only in case of a failover mount. 
+ */ +void rpc_client_set_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt *old; + + old = xchg(&clnt->cl_xprt, xprt); + /* Wait for all reads of clnt->cl_xprt == old to complete */ + synchronize_rcu(); + rpc_put_xprt(old); +} +EXPORT_SYMBOL(rpc_client_set_xprt); + /* * New rpc_call implementation */ @@ -516,7 +552,7 @@ rpc_call_setup(struct rpc_task *task, st void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { - struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_xprt *xprt = rpc_client_get_xprt(clnt); xprt->sndsize = 0; if (sndsize) @@ -526,6 +562,7 @@ rpc_setbufsize(struct rpc_clnt *clnt, un xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; if (xprt_connected(xprt)) xprt_sock_setbufsize(xprt); + rpc_put_xprt(xprt); } /* @@ -538,7 +575,11 @@ rpc_setbufsize(struct rpc_clnt *clnt, un */ size_t rpc_max_payload(struct rpc_clnt *clnt) { - return clnt->cl_xprt->max_payload; + size_t res; + rcu_read_lock(); + res = rcu_dereference(clnt->cl_xprt)->max_payload; + rcu_read_unlock(); + return res; } EXPORT_SYMBOL(rpc_max_payload); @@ -734,7 +775,7 @@ static void call_bind(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_xprt *xprt = task->tk_xprt; dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, xprt, (xprt_connected(xprt) ? 
"is" : "is not")); @@ -754,12 +795,10 @@ call_bind(struct rpc_task *task) static void call_connect(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; - dprintk("RPC: %4d call_connect status %d\n", task->tk_pid, task->tk_status); - if (xprt_connected(clnt->cl_xprt)) { + if (xprt_connected(task->tk_xprt)) { task->tk_action = call_transmit; return; } @@ -1020,7 +1059,7 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; Index: linux-2.6.13-rc1/net/sunrpc/pmap_clnt.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/pmap_clnt.c +++ linux-2.6.13-rc1/net/sunrpc/pmap_clnt.c @@ -39,7 +39,7 @@ void rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) { struct rpc_portmap *map = clnt->cl_pmap; - struct sockaddr_in *sap = &clnt->cl_xprt->addr; + struct sockaddr_in *sap = &task->tk_xprt->addr; struct rpc_message msg = { .rpc_proc = &pmap_procedures[PMAP_GETPORT], .rpc_argp = map, Index: linux-2.6.13-rc1/net/sunrpc/rpc_pipe.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/rpc_pipe.c +++ linux-2.6.13-rc1/net/sunrpc/rpc_pipe.c @@ -291,14 +291,16 @@ static int rpc_show_info(struct seq_file *m, void *v) { struct rpc_clnt *clnt = m->private; + struct rpc_xprt *xprt = rpc_client_get_xprt(clnt); seq_printf(m, "RPC server: %s\n", clnt->cl_server); seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_protname, clnt->cl_prog, clnt->cl_vers); seq_printf(m, "address: %u.%u.%u.%u\n", - NIPQUAD(clnt->cl_xprt->addr.sin_addr.s_addr)); + NIPQUAD(xprt->addr.sin_addr.s_addr)); seq_printf(m, "protocol: %s\n", - clnt->cl_xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); + xprt->prot == IPPROTO_UDP ? 
"udp" : "tcp"); + rpc_put_xprt(xprt); return 0; } Index: linux-2.6.13-rc1/net/sunrpc/sched.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/sched.c +++ linux-2.6.13-rc1/net/sunrpc/sched.c @@ -780,6 +780,7 @@ void rpc_init_task(struct rpc_task *task task->tk_flags |= RPC_TASK_SOFT; if (!clnt->cl_intr) task->tk_flags |= RPC_TASK_NOINTR; + task->tk_xprt = rpc_client_get_xprt(clnt); } #ifdef RPC_DEBUG @@ -869,6 +870,7 @@ void rpc_release_task(struct rpc_task *t rpcauth_unbindcred(task); rpc_free(task); if (task->tk_client) { + rpc_put_xprt(task->tk_xprt); rpc_release_client(task->tk_client); task->tk_client = NULL; } Index: linux-2.6.13-rc1/net/sunrpc/xdr.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/xdr.c +++ linux-2.6.13-rc1/net/sunrpc/xdr.c @@ -764,8 +764,7 @@ EXPORT_SYMBOL(xdr_inline_decode); * * Moves data beyond the current pointer position from the XDR head[] buffer * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. The current pointer is then - * repositioned at the beginning of the XDR tail. + * bytes is moved into the XDR tail[]. */ void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { @@ -802,6 +801,31 @@ void xdr_read_pages(struct xdr_stream *x } EXPORT_SYMBOL(xdr_read_pages); +/** + * xdr_enter_page - decode data from the XDR page + * @xdr: pointer to xdr_stream struct + * @len: number of bytes of page data + * + * Moves data beyond the current pointer position from the XDR head[] buffer + * into the page list. Any data that lies beyond current position + "len" + * bytes is moved into the XDR tail[]. The current pointer is then + * repositioned at the beginning of the first XDR page. 
+ */ +void xdr_enter_page(struct xdr_stream *xdr, unsigned int len) +{ + char * kaddr = page_address(xdr->buf->pages[0]); + xdr_read_pages(xdr, len); + /* + * Position current pointer at beginning of the first page, and + * set remaining message length (capped to the rest of that page). + */ + if (len > PAGE_CACHE_SIZE - xdr->buf->page_base) + len = PAGE_CACHE_SIZE - xdr->buf->page_base; + xdr->p = (uint32_t *)(kaddr + xdr->buf->page_base); + xdr->end = (uint32_t *)((char *)xdr->p + len); +} +EXPORT_SYMBOL(xdr_enter_page); + static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; void Index: linux-2.6.13-rc1/net/sunrpc/xprt.c =================================================================== --- linux-2.6.13-rc1.orig/net/sunrpc/xprt.c +++ linux-2.6.13-rc1/net/sunrpc/xprt.c @@ -1485,6 +1485,7 @@ xprt_setup(int proto, struct sockaddr_in if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) return ERR_PTR(-ENOMEM); memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ + atomic_set(&xprt->count, 1); xprt->max_reqs = entries; slot_table_size = entries * sizeof(xprt->slot[0]); xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); @@ -1719,3 +1720,14 @@ xprt_destroy(struct rpc_xprt *xprt) return 0; } + +/** + * rpc_put_xprt() - Drop a reference to the RPC transport; the last put calls xprt_destroy() + * @xprt: pointer to RPC transport (NULL is ignored) + */ +void rpc_put_xprt(struct rpc_xprt *xprt) +{ + if (xprt != NULL && atomic_dec_and_test(&xprt->count)) + xprt_destroy(xprt); +} +