fs/Kconfig | 39 +- fs/inode.c | 2 fs/lockd/clntproc.c | 2 fs/lockd/host.c | 4 fs/lockd/mon.c | 16 fs/lockd/svc4proc.c | 21 + fs/lockd/svclock.c | 28 + fs/lockd/svcproc.c | 19 - fs/nfs/dir.c | 177 +++++++--- fs/nfs/direct.c | 5 fs/nfs/file.c | 9 fs/nfs/inode.c | 530 +++++++++++++++--------------- fs/nfs/mount_clnt.c | 14 fs/nfs/nfs2xdr.c | 54 +-- fs/nfs/nfs3proc.c | 97 +---- fs/nfs/nfs3xdr.c | 86 ++-- fs/nfs/nfs4proc.c | 230 +++---------- fs/nfs/nfs4state.c | 26 - fs/nfs/nfs4xdr.c | 245 +++++++------ fs/nfs/pagelist.c | 8 fs/nfs/proc.c | 108 ++---- fs/nfs/read.c | 346 +++++++++++++++---- fs/nfs/unlink.c | 3 fs/nfs/write.c | 723 ++++++++++++++++++++++++++++------------- include/linux/fs.h | 2 include/linux/lockd/debug.h | 2 include/linux/lockd/lockd.h | 1 include/linux/nfs_fs.h | 133 ++++--- include/linux/nfs_page.h | 44 ++ include/linux/nfs_xdr.h | 24 - include/linux/sunrpc/debug.h | 4 include/linux/sunrpc/sched.h | 66 ++- include/linux/sunrpc/timer.h | 11 include/linux/sunrpc/xdr.h | 2 include/linux/sunrpc/xprt.h | 37 +- net/sunrpc/auth_gss/auth_gss.c | 2 net/sunrpc/auth_unix.c | 7 net/sunrpc/clnt.c | 29 - net/sunrpc/pmap_clnt.c | 28 - net/sunrpc/sched.c | 267 ++++++++++++--- net/sunrpc/sunrpc_syms.c | 2 net/sunrpc/sysctl.c | 28 + net/sunrpc/xdr.c | 2 net/sunrpc/xprt.c | 298 +++++++--------- 44 files changed, 2300 insertions(+), 1481 deletions(-) diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/inode.c linux-2.6.4-27-nfs4mount/fs/inode.c --- linux-2.6.4-pre3/fs/inode.c 2004-03-10 19:39:43.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/inode.c 2004-03-10 20:12:03.000000000 -0500 @@ -1178,6 +1178,8 @@ void inode_update_time(struct inode *ino struct timespec now; int sync_it = 0; + if (IS_NOCMTIME(inode)) + return; if (IS_RDONLY(inode)) return; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/Kconfig linux-2.6.4-27-nfs4mount/fs/Kconfig --- linux-2.6.4-pre3/fs/Kconfig 2004-03-10 19:27:12.000000000 -0500 +++ 
linux-2.6.4-27-nfs4mount/fs/Kconfig 2004-03-10 20:12:19.000000000 -0500 @@ -1302,15 +1302,18 @@ config NFS_V3 Say Y here if you want your NFS client to be able to speak the newer version 3 of the NFS protocol. - If unsure, say N. + If unsure, say Y. config NFS_V4 bool "Provide NFSv4 client support (EXPERIMENTAL)" depends on NFS_FS && EXPERIMENTAL + select RPCSEC_GSS_KRB5 help Say Y here if you want your NFS client to be able to speak the newer - version 4 of the NFS protocol. This feature is experimental, and - should only be used if you are interested in helping to test NFSv4. + version 4 of the NFS protocol. + + Note: Requires auxiliary userspace daemons which may be found on + http://www.citi.umich.edu/projects/nfsv4/ If unsure, say N. @@ -1419,28 +1422,24 @@ config SUNRPC tristate config SUNRPC_GSS - tristate "Provide RPCSEC_GSS authentication (EXPERIMENTAL)" + tristate + +config RPCSEC_GSS_KRB5 + tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL - default SUNRPC if NFS_V4=y + select SUNRPC_GSS + select CRYPTO + select CRYPTO_MD5 + select CRYPTO_DES help - Provides cryptographic authentication for NFS rpc requests. To - make this useful, you must also select at least one rpcsec_gss - mechanism. - Note: You should always select this option if you wish to use + Provides for secure RPC calls by means of a gss-api + mechanism based on Kerberos V5. This is required for NFSv4. -config RPCSEC_GSS_KRB5 - tristate "Kerberos V mechanism for RPCSEC_GSS (EXPERIMENTAL)" - depends on SUNRPC_GSS && CRYPTO_DES && CRYPTO_MD5 - default SUNRPC_GSS if NFS_V4=y - help - Provides a gss-api mechanism based on Kerberos V5 (this is - mandatory for RFC3010-compliant NFSv4 implementations). - Requires a userspace daemon; - see http://www.citi.umich.edu/projects/nfsv4/. 
+ Note: Requires an auxiliary userspace daemon which may be found on + http://www.citi.umich.edu/projects/nfsv4/ - Note: If you select this option, please ensure that you also - enable the MD5 and DES crypto ciphers. + If unsure, say N. config SMB_FS tristate "SMB file system support (to mount Windows shares etc.)" diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/clntproc.c linux-2.6.4-27-nfs4mount/fs/lockd/clntproc.c --- linux-2.6.4-pre3/fs/lockd/clntproc.c 2004-03-10 19:12:43.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/clntproc.c 2004-03-10 20:13:27.000000000 -0500 @@ -443,7 +443,7 @@ nlmclnt_lock(struct nlm_rqst *req, struc } if (status < 0) return status; - } while (resp->status == NLM_LCK_BLOCKED); + } while (resp->status == NLM_LCK_BLOCKED && req->a_args.block); if (resp->status == NLM_LCK_GRANTED) { fl->fl_u.nfs_fl.state = host->h_state; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/host.c linux-2.6.4-27-nfs4mount/fs/lockd/host.c --- linux-2.6.4-pre3/fs/lockd/host.c 2004-03-10 19:26:25.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/host.c 2004-03-10 20:13:08.000000000 -0500 @@ -188,14 +188,14 @@ nlm_bind_host(struct nlm_host *host) } } else { xprt = xprt_create_proto(host->h_proto, &host->h_addr, NULL); - if (xprt == NULL) + if (IS_ERR(xprt)) goto forgetit; xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); clnt = rpc_create_client(xprt, host->h_name, &nlm_program, host->h_version, host->h_authflavor); - if (clnt == NULL) { + if (IS_ERR(clnt)) { xprt_destroy(xprt); goto forgetit; } diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/mon.c linux-2.6.4-27-nfs4mount/fs/lockd/mon.c --- linux-2.6.4-pre3/fs/lockd/mon.c 2004-03-10 19:31:36.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/mon.c 2004-03-10 20:13:08.000000000 -0500 @@ -36,10 +36,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 int status; struct nsm_args args; - status = -EACCES; clnt = nsm_create(); - 
if (!clnt) + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); goto out; + } args.addr = host->h_addr.sin_addr.s_addr; args.proto= (host->h_proto<<1) | host->h_server; @@ -104,7 +105,7 @@ static struct rpc_clnt * nsm_create(void) { struct rpc_xprt *xprt; - struct rpc_clnt *clnt = NULL; + struct rpc_clnt *clnt; struct sockaddr_in sin; sin.sin_family = AF_INET; @@ -112,24 +113,23 @@ nsm_create(void) sin.sin_port = 0; xprt = xprt_create_proto(IPPROTO_UDP, &sin, NULL); - if (!xprt) - goto out; + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; clnt = rpc_create_client(xprt, "localhost", &nsm_program, SM_VERSION, RPC_AUTH_NULL); - if (!clnt) + if (IS_ERR(clnt)) goto out_destroy; clnt->cl_softrtry = 1; clnt->cl_chatty = 1; clnt->cl_oneshot = 1; xprt->resvport = 1; /* NSM requires a reserved port */ -out: return clnt; out_destroy: xprt_destroy(xprt); - goto out; + return clnt; } /* diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/svc4proc.c linux-2.6.4-27-nfs4mount/fs/lockd/svc4proc.c --- linux-2.6.4-pre3/fs/lockd/svc4proc.c 2004-03-10 19:21:08.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/svc4proc.c 2004-03-10 20:13:34.000000000 -0500 @@ -453,6 +453,24 @@ nlm4svc_proc_sm_notify(struct svc_rqst * } /* + * client sent a GRANTED_RES, let's remove the associated block + */ +static int +nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + return rpc_success; +} + + + +/* * This is the generic lockd callback for async RPC calls */ static u32 @@ -515,7 +533,6 @@ nlm4svc_callback_exit(struct rpc_task *t #define nlm4svc_proc_lock_res nlm4svc_proc_null #define nlm4svc_proc_cancel_res nlm4svc_proc_null #define nlm4svc_proc_unlock_res nlm4svc_proc_null -#define nlm4svc_proc_granted_res nlm4svc_proc_null struct nlm_void { int dummy; }; @@ -548,7 +565,7 @@ struct 
svc_procedure nlmsvc_procedures4 PROC(lock_res, lockres, norep, res, void, 1), PROC(cancel_res, cancelres, norep, res, void, 1), PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, grantedres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), /* statd callback */ PROC(sm_notify, reboot, void, reboot, void, 1), PROC(none, void, void, void, void, 0), diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/svclock.c linux-2.6.4-27-nfs4mount/fs/lockd/svclock.c --- linux-2.6.4-pre3/fs/lockd/svclock.c 2004-03-10 19:19:09.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/svclock.c 2004-03-10 20:13:40.000000000 -0500 @@ -64,7 +64,7 @@ nlmsvc_insert_block(struct nlm_block *bl if (when != NLM_NEVER) { if ((when += jiffies) == NLM_NEVER) when ++; - while ((b = *bp) && time_before_eq(b->b_when,when)) + while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER) bp = &b->b_next; } else while ((b = *bp)) @@ -143,14 +143,15 @@ static inline int nlm_cookie_match(struc * Find a block with a given NLM cookie. 
*/ static inline struct nlm_block * -nlmsvc_find_block(struct nlm_cookie *cookie) +nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) { struct nlm_block *block; for (block = nlm_blocked; block; block = block->b_next) { dprintk("cookie: head of blocked queue %p, block %p\n", nlm_blocked, block); - if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie)) + if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie) + && nlm_cmp_addr(sin, &block->b_host->h_addr)) break; } @@ -566,12 +567,16 @@ nlmsvc_grant_callback(struct rpc_task *t struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; struct nlm_block *block; unsigned long timeout; + struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); dprintk("lockd: GRANT_MSG RPC callback\n"); - dprintk("callback: looking for cookie %x \n", - *(unsigned int *)(call->a_args.cookie.data)); - if (!(block = nlmsvc_find_block(&call->a_args.cookie))) { - dprintk("lockd: no block for cookie %x\n", *(u32 *)(call->a_args.cookie.data)); + dprintk("callback: looking for cookie %x, host (%08x)\n", + *(unsigned int *)(call->a_args.cookie.data), + ntohl(peer_addr->sin_addr.s_addr)); + if (!(block = nlmsvc_find_block(&call->a_args.cookie, peer_addr))) { + dprintk("lockd: no block for cookie %x, host (%08x)\n", + *(u32 *)(call->a_args.cookie.data), + ntohl(peer_addr->sin_addr.s_addr)); return; } @@ -600,18 +605,21 @@ nlmsvc_grant_callback(struct rpc_task *t * block. 
*/ void -nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status) +nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status) { struct nlm_block *block; struct nlm_file *file; - if (!(block = nlmsvc_find_block(cookie))) + dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", + *(unsigned int *)(cookie->data), + ntohl(rqstp->rq_addr.sin_addr.s_addr), status); + if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr))) return; file = block->b_file; file->f_count++; down(&file->f_sema); - if ((block = nlmsvc_find_block(cookie)) != NULL) { + if ((block = nlmsvc_find_block(cookie,&rqstp->rq_addr)) != NULL) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/lockd/svcproc.c linux-2.6.4-27-nfs4mount/fs/lockd/svcproc.c --- linux-2.6.4-pre3/fs/lockd/svcproc.c 2004-03-10 19:30:13.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/lockd/svcproc.c 2004-03-10 20:13:34.000000000 -0500 @@ -479,6 +479,22 @@ nlmsvc_proc_sm_notify(struct svc_rqst *r } /* + * client sent a GRANTED_RES, let's remove the associated block + */ +static int +nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + return rpc_success; +} + +/* * This is the generic lockd callback for async RPC calls */ static u32 @@ -541,7 +557,6 @@ nlmsvc_callback_exit(struct rpc_task *ta #define nlmsvc_proc_lock_res nlmsvc_proc_null #define nlmsvc_proc_cancel_res nlmsvc_proc_null #define nlmsvc_proc_unlock_res nlmsvc_proc_null -#define nlmsvc_proc_granted_res nlmsvc_proc_null struct nlm_void { int dummy; }; @@ -576,7 +591,7 @@ struct svc_procedure nlmsvc_procedures[ PROC(lock_res, lockres, norep, res, void, 1), PROC(cancel_res, cancelres, norep, res, void, 1), 
PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, grantedres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), /* statd callback */ PROC(sm_notify, reboot, void, reboot, void, 1), PROC(none, void, void, void, void, 1), diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/dir.c linux-2.6.4-27-nfs4mount/fs/nfs/dir.c --- linux-2.6.4-pre3/fs/nfs/dir.c 2004-03-10 19:24:26.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/dir.c 2004-03-10 20:12:42.000000000 -0500 @@ -139,11 +139,13 @@ int nfs_readdir_filler(nfs_readdir_descr struct file *file = desc->file; struct inode *inode = file->f_dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); + unsigned long timestamp; int error; dfprintk(VFS, "NFS: nfs_readdir_filler() reading cookie %Lu into page %lu.\n", (long long)desc->entry->cookie, page->index); again: + timestamp = jiffies; error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -157,18 +159,21 @@ int nfs_readdir_filler(nfs_readdir_descr goto error; } SetPageUptodate(page); + NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; /* Ensure consistent page alignment of the data. * Note: assumes we have exclusive access to this mapping either * throught inode->i_sem or some other mechanism. 
*/ - if (page->index == 0) + if (page->index == 0) { invalidate_inode_pages(inode->i_mapping); + NFS_I(inode)->readdir_timestamp = timestamp; + } unlock_page(page); return 0; error: SetPageError(page); unlock_page(page); - invalidate_inode_pages(inode->i_mapping); + nfs_zap_caches(inode); desc->error = error; return -EIO; } @@ -381,6 +386,7 @@ int uncached_readdir(nfs_readdir_descrip page, NFS_SERVER(inode)->dtsize, desc->plus); + NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; desc->page = page; desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (desc->error >= 0) { @@ -459,7 +465,15 @@ static int nfs_readdir(struct file *filp } res = 0; break; - } else if (res < 0) + } + if (res == -ETOOSMALL && desc->plus) { + NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + nfs_zap_caches(inode); + desc->plus = 0; + desc->entry->eof = 0; + continue; + } + if (res < 0) break; res = nfs_do_filldir(desc, dirent, filldir); @@ -481,14 +495,19 @@ static int nfs_readdir(struct file *filp * In the case it has, we assume that the dentries are untrustworthy * and may need to be looked up again. 
*/ -static inline -int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +static inline int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; - if (nfs_revalidate_inode(NFS_SERVER(dir), dir)) + if ((NFS_FLAGS(dir) & NFS_INO_INVALID_ATTR) != 0 + || nfs_attribute_timeout(dir)) return 0; - return time_after(dentry->d_time, NFS_MTIME_UPDATE(dir)); + return nfs_verify_change_attribute(dir, (unsigned long)dentry->d_fsdata); +} + +static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) +{ + dentry->d_fsdata = (void *)verf; } /* @@ -528,9 +547,7 @@ int nfs_neg_need_reval(struct inode *dir /* Don't revalidate a negative dentry if we're creating a new file */ if ((ndflags & LOOKUP_CREATE) && !(ndflags & LOOKUP_CONTINUE)) return 0; - if (!nfs_check_verifier(dir, dentry)) - return 1; - return time_after(jiffies, dentry->d_time + NFS_ATTRTIMEO(dir)); + return !nfs_check_verifier(dir, dentry); } /* @@ -552,6 +569,7 @@ static int nfs_lookup_revalidate(struct int error; struct nfs_fh fhandle; struct nfs_fattr fattr; + unsigned long verifier; int isopen = 0; parent = dget_parent(dentry); @@ -574,6 +592,9 @@ static int nfs_lookup_revalidate(struct goto out_bad; } + /* Revalidate parent directory attribute cache */ + nfs_revalidate_inode(NFS_SERVER(dir), dir); + /* Force a full look up iff the parent directory has changed */ if (nfs_check_verifier(dir, dentry)) { if (nfs_lookup_verify_inode(inode, isopen)) @@ -581,6 +602,12 @@ static int nfs_lookup_revalidate(struct goto out_valid; } + /* + * Note: we're not holding inode->i_sem and so may be racing with + * operations that change the directory. We therefore save the + * change attribute *before* we do the RPC call. 
+ */ + verifier = nfs_save_change_attribute(dir); error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); if (!error) { if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) @@ -603,6 +630,7 @@ static int nfs_lookup_revalidate(struct out_valid_renew: nfs_renew_times(dentry); + nfs_set_verifier(dentry, verifier); out_valid: unlock_kernel(); dput(parent); @@ -638,6 +666,11 @@ static int nfs_dentry_delete(struct dent /* Unhash it, so that ->d_iput() would be called */ return 1; } + if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + /* Unhash it, so that ancestors of killed async unlink + * files will be cleaned up during umount */ + return 1; + } return 0; } @@ -693,6 +726,8 @@ static struct dentry *nfs_lookup(struct dentry->d_op = NFS_PROTO(dir)->dentry_ops; lock_kernel(); + /* Revalidate parent directory attribute cache */ + nfs_revalidate_inode(NFS_SERVER(dir), dir); /* If we're doing an exclusive create, optimize away the lookup */ if (nfs_is_exclusive_create(dir, nd)) @@ -715,6 +750,7 @@ no_entry: error = 0; d_add(dentry, inode); nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unlock: unlock_kernel(); out: @@ -768,7 +804,15 @@ static struct dentry *nfs_atomic_lookup( /* Open the file on the server */ lock_kernel(); - inode = nfs4_atomic_open(dir, dentry, nd); + /* Revalidate parent directory attribute cache */ + nfs_revalidate_inode(NFS_SERVER(dir), dir); + + if (nd->intent.open.flags & O_CREAT) { + nfs_begin_data_update(dir); + inode = nfs4_atomic_open(dir, dentry, nd); + nfs_end_data_update(dir); + } else + inode = nfs4_atomic_open(dir, dentry, nd); unlock_kernel(); if (IS_ERR(inode)) { error = PTR_ERR(inode); @@ -790,6 +834,7 @@ static struct dentry *nfs_atomic_lookup( no_entry: d_add(dentry, inode); nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out: BUG_ON(error > 0); return ERR_PTR(error); @@ -801,13 +846,16 @@ static int nfs_open_revalidate(struct de { struct dentry 
*parent = NULL; struct inode *inode = dentry->d_inode; + struct inode *dir; + unsigned long verifier; int openflags, ret = 0; /* NFS only supports OPEN for regular files */ if (inode && !S_ISREG(inode->i_mode)) goto no_open; parent = dget_parent(dentry); - if (!is_atomic_open(parent->d_inode, nd)) + dir = parent->d_inode; + if (!is_atomic_open(dir, nd)) goto no_open; openflags = nd->intent.open.flags; if (openflags & O_CREAT) { @@ -821,8 +869,16 @@ static int nfs_open_revalidate(struct de /* We can't create new files, or truncate existing ones here */ openflags &= ~(O_CREAT|O_TRUNC); + /* + * Note: we're not holding inode->i_sem and so may be racing with + * operations that change the directory. We therefore save the + * change attribute *before* we do the RPC call. + */ lock_kernel(); - ret = nfs4_open_revalidate(parent->d_inode, dentry, openflags); + verifier = nfs_save_change_attribute(dir); + ret = nfs4_open_revalidate(dir, dentry, openflags); + if (!ret) + nfs_set_verifier(dentry, verifier); unlock_kernel(); out: dput(parent); @@ -869,15 +925,20 @@ int nfs_cached_lookup(struct inode *dir, struct nfs_server *server; struct nfs_entry entry; struct page *page; - unsigned long timestamp = NFS_MTIME_UPDATE(dir); + unsigned long timestamp; int res; if (!NFS_USE_READDIRPLUS(dir)) return -ENOENT; server = NFS_SERVER(dir); - if (server->flags & NFS_MOUNT_NOAC) + /* Don't use readdirplus unless the cache is stable */ + if ((server->flags & NFS_MOUNT_NOAC) != 0 + || nfs_caches_unstable(dir) + || nfs_attribute_timeout(dir)) return -ENOENT; - nfs_revalidate_inode(server, dir); + if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0) + return -ENOENT; + timestamp = NFS_I(dir)->readdir_timestamp; entry.fh = fh; entry.fattr = fattr; @@ -931,6 +992,7 @@ static int nfs_instantiate(struct dentry if (inode) { d_instantiate(dentry, inode); nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dentry->d_parent->d_inode)); error = 0; } 
return error; @@ -969,11 +1031,13 @@ static int nfs_create(struct inode *dir, * does not pass the create flags. */ lock_kernel(); - nfs_zap_caches(dir); + nfs_begin_data_update(dir); inode = NFS_PROTO(dir)->create(dir, &dentry->d_name, &attr, open_flags); + nfs_end_data_update(dir); if (!IS_ERR(inode)) { d_instantiate(dentry, inode); nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); error = 0; } else { error = PTR_ERR(inode); @@ -1004,9 +1068,10 @@ nfs_mknod(struct inode *dir, struct dent attr.ia_valid = ATTR_MODE; lock_kernel(); - nfs_zap_caches(dir); + nfs_begin_data_update(dir); error = NFS_PROTO(dir)->mknod(dir, &dentry->d_name, &attr, rdev, &fhandle, &fattr); + nfs_end_data_update(dir); if (!error) error = nfs_instantiate(dentry, &fhandle, &fattr); else @@ -1041,9 +1106,10 @@ static int nfs_mkdir(struct inode *dir, */ d_drop(dentry); #endif - nfs_zap_caches(dir); + nfs_begin_data_update(dir); error = NFS_PROTO(dir)->mkdir(dir, &dentry->d_name, &attr, &fhandle, &fattr); + nfs_end_data_update(dir); if (!error) error = nfs_instantiate(dentry, &fhandle, &fattr); else @@ -1060,10 +1126,12 @@ static int nfs_rmdir(struct inode *dir, dir->i_ino, dentry->d_name.name); lock_kernel(); - nfs_zap_caches(dir); + nfs_begin_data_update(dir); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); - if (!error) + /* Ensure the VFS deletes this inode */ + if (error == 0 && dentry->d_inode != NULL) dentry->d_inode->i_nlink = 0; + nfs_end_data_update(dir); unlock_kernel(); return error; @@ -1119,12 +1187,21 @@ dentry->d_parent->d_name.name, dentry->d goto out; } while(sdentry->d_inode != NULL); /* need negative lookup */ - nfs_zap_caches(dir); qsilly.name = silly; qsilly.len = strlen(silly); - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, dir, &qsilly); + nfs_begin_data_update(dir); + if (dentry->d_inode) { + nfs_begin_data_update(dentry->d_inode); + error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, + dir, &qsilly); + 
nfs_end_data_update(dentry->d_inode); + } else + error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, + dir, &qsilly); + nfs_end_data_update(dir); if (!error) { nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); d_move(dentry, sdentry); error = nfs_async_unlink(dentry); /* If we return 0 we don't unlink */ @@ -1156,14 +1233,17 @@ static int nfs_safe_remove(struct dentry goto out; } - nfs_zap_caches(dir); - if (inode) - NFS_CACHEINV(inode); - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); - if (error < 0) - goto out; - if (inode) - inode->i_nlink--; + nfs_begin_data_update(dir); + if (inode != NULL) { + nfs_begin_data_update(inode); + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + /* The VFS may want to delete this inode */ + if (error == 0) + inode->i_nlink--; + nfs_end_data_update(inode); + } else + error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + nfs_end_data_update(dir); out: return error; } @@ -1198,9 +1278,10 @@ static int nfs_unlink(struct inode *dir, spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); - if (!error) + if (!error) { nfs_renew_times(dentry); - else if (need_rehash) + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } else if (need_rehash) d_rehash(dentry); unlock_kernel(); return error; @@ -1247,9 +1328,10 @@ dentry->d_parent->d_name.name, dentry->d qsymname.len = strlen(symname); lock_kernel(); - nfs_zap_caches(dir); + nfs_begin_data_update(dir); error = NFS_PROTO(dir)->symlink(dir, &dentry->d_name, &qsymname, &attr, &sym_fh, &sym_attr); + nfs_end_data_update(dir); if (!error) { error = nfs_instantiate(dentry, &sym_fh, &sym_attr); } else { @@ -1281,9 +1363,12 @@ nfs_link(struct dentry *old_dentry, stru */ lock_kernel(); d_drop(dentry); - nfs_zap_caches(dir); - NFS_CACHEINV(inode); + + nfs_begin_data_update(dir); + nfs_begin_data_update(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + nfs_end_data_update(inode); + 
nfs_end_data_update(dir); unlock_kernel(); return error; } @@ -1388,16 +1473,23 @@ go_ahead: if (new_inode) d_delete(new_dentry); - nfs_zap_caches(new_dir); - nfs_zap_caches(old_dir); + nfs_begin_data_update(old_dir); + nfs_begin_data_update(new_dir); + nfs_begin_data_update(old_inode); error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); + nfs_end_data_update(old_inode); + nfs_end_data_update(new_dir); + nfs_end_data_update(old_dir); out: if (rehash) d_rehash(rehash); - if (!error && !S_ISDIR(old_inode->i_mode)) - d_move(old_dentry, new_dentry); - nfs_renew_times(new_dentry); + if (!error) { + if (!S_ISDIR(old_inode->i_mode)) + d_move(old_dentry, new_dentry); + nfs_renew_times(new_dentry); + nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); + } /* new dentry created? */ if (dentry) @@ -1451,7 +1543,8 @@ nfs_permission(struct inode *inode, int cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); if (cache->cred == cred - && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) { + && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) + && !(NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) { if (!(res = cache->err)) { /* Is the mask a subset of an accepted mask? 
*/ if ((cache->mask & mask) == mask) diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/direct.c linux-2.6.4-27-nfs4mount/fs/nfs/direct.c --- linux-2.6.4-pre3/fs/nfs/direct.c 2004-03-10 19:36:02.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/direct.c 2004-03-10 20:14:30.000000000 -0500 @@ -128,6 +128,7 @@ nfs_direct_read_seg(struct inode *inode, .inode = inode, .args = { .fh = NFS_FH(inode), + .lockowner = current->files, }, .res = { .fattr = &rdata.fattr, @@ -258,6 +259,7 @@ nfs_direct_write_seg(struct inode *inode .inode = inode, .args = { .fh = NFS_FH(inode), + .lockowner = current->files, }, .res = { .fattr = &wdata.fattr, @@ -269,6 +271,7 @@ nfs_direct_write_seg(struct inode *inode if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize) wdata.args.stable = NFS_FILE_SYNC; + nfs_begin_data_update(inode); retry: need_commit = 0; tot_bytes = 0; @@ -334,6 +337,8 @@ retry: VERF_SIZE) != 0) goto sync_retry; } + nfs_end_data_update(inode); + NFS_FLAGS(inode) |= NFS_INO_INVALID_DATA; return tot_bytes; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/file.c linux-2.6.4-27-nfs4mount/fs/nfs/file.c --- linux-2.6.4-pre3/fs/nfs/file.c 2004-03-10 19:29:44.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/file.c 2004-03-10 20:12:34.000000000 -0500 @@ -104,11 +104,16 @@ nfs_file_flush(struct file *file) dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; lock_kernel(); - status = nfs_wb_file(inode, file); + /* Ensure that data+attribute caches are up to date after close() */ + status = nfs_wb_all(inode); if (!status) { status = file->f_error; file->f_error = 0; + if (!status) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); } unlock_kernel(); return status; @@ -179,7 +184,7 @@ nfs_fsync(struct file *file, struct dent dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); - status = nfs_wb_file(inode, 
file); + status = nfs_wb_all(inode); if (!status) { status = file->f_error; file->f_error = 0; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/inode.c linux-2.6.4-27-nfs4mount/fs/nfs/inode.c --- linux-2.6.4-pre3/fs/nfs/inode.c 2004-03-10 19:33:18.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/inode.c 2004-03-10 20:15:18.000000000 -0500 @@ -47,14 +47,11 @@ * their needs. People that do NFS over a slow network, might for * instance want to reduce it to something closer to 1 for improved * interactive response. - * - * For the moment, though, we instead set it to RPC_MAXREQS, which - * is the maximum number of simultaneous RPC requests on the wire. */ -#define NFS_MAX_READAHEAD RPC_MAXREQS +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) -void nfs_zap_caches(struct inode *); static void nfs_invalidate_inode(struct inode *); +static int nfs_update_inode(struct inode *, struct nfs_fattr *, unsigned long); static struct inode *nfs_alloc_inode(struct super_block *sb); static void nfs_destroy_inode(struct inode *); @@ -118,7 +115,7 @@ nfs_write_inode(struct inode *inode, int { int flags = sync ? FLUSH_WAIT : 0; - nfs_commit_file(inode, NULL, 0, 0, flags); + nfs_commit_inode(inode, 0, 0, flags); } static void @@ -151,6 +148,7 @@ nfs_clear_inode(struct inode *inode) cred = nfsi->cache_access.cred; if (cred) put_rpccred(cred); + BUG_ON(atomic_read(&nfsi->data_updates) != 0); } void @@ -230,50 +228,23 @@ nfs_block_size(unsigned long bsize, unsi /* * Obtain the root inode of the file system. 
*/ -static int -nfs_get_root(struct inode **rooti, rpc_authflavor_t authflavor, struct super_block *sb, struct nfs_fh *rootfh) +static struct inode * +nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo) { struct nfs_server *server = NFS_SB(sb); - struct nfs_fattr fattr = { }; + struct inode *rooti; int error; - error = server->rpc_ops->getroot(server, rootfh, &fattr); - if (error == -EACCES && authflavor > RPC_AUTH_MAXFLAVOR) { - /* - * Some authentication types (gss/krb5, most notably) - * are such that root won't be able to present a - * credential for GETATTR (ie, getroot()). - * - * We still want the mount to succeed. - * - * So we fake the attr values and mark the inode as such. - * On the first succesful traversal, we fix everything. - * The auth type test isn't quite correct, but whatever. - */ - dfprintk(VFS, "NFS: faking root inode\n"); - - fattr.fileid = 1; - fattr.nlink = 2; /* minimum for a dir */ - fattr.type = NFDIR; - fattr.mode = S_IFDIR|S_IRUGO|S_IXUGO; - fattr.size = 4096; - fattr.du.nfs3.used = 1; - fattr.valid = NFS_ATTR_FATTR|NFS_ATTR_FATTR_V3; - } else if (error < 0) { + error = server->rpc_ops->getroot(server, rootfh, fsinfo); + if (error < 0) { printk(KERN_NOTICE "nfs_get_root: getattr error = %d\n", -error); - *rooti = NULL; /* superfluous ... 
but safe */ - return error; + return ERR_PTR(error); } - *rooti = nfs_fhget(sb, rootfh, &fattr); - if (error == -EACCES && authflavor > RPC_AUTH_MAXFLAVOR) { - if (*rooti) { - NFS_FLAGS(*rooti) |= NFS_INO_FAKE_ROOT; - NFS_CACHEINV((*rooti)); - error = 0; - } - } - return error; + rooti = nfs_fhget(sb, rootfh, fsinfo->fattr); + if (!rooti) + return ERR_PTR(-ENOMEM); + return rooti; } /* @@ -283,7 +254,7 @@ static int nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) { struct nfs_server *server; - struct inode *root_inode = NULL; + struct inode *root_inode; struct nfs_fattr fattr; struct nfs_fsinfo fsinfo = { .fattr = &fattr, @@ -299,8 +270,9 @@ nfs_sb_init(struct super_block *sb, rpc_ sb->s_magic = NFS_SUPER_MAGIC; + root_inode = nfs_get_root(sb, &server->fh, &fsinfo); /* Did getting the root inode fail? */ - if (nfs_get_root(&root_inode, authflavor, sb, &server->fh) < 0) + if (IS_ERR(root_inode)) goto out_no_root; sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) @@ -309,10 +281,6 @@ nfs_sb_init(struct super_block *sb, rpc_ sb->s_root->d_op = server->rpc_ops->dentry_ops; /* Get some general file system info */ - if (server->rpc_ops->fsinfo(server, &server->fh, &fsinfo) < 0) { - printk(KERN_NOTICE "NFS: cannot retrieve file system info.\n"); - goto out_no_root; - } if (server->namelen == 0 && server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; @@ -368,13 +336,11 @@ nfs_sb_init(struct super_block *sb, rpc_ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); return 0; /* Yargs. It didn't work out. */ -out_free_all: - if (root_inode) - iput(root_inode); - return -EINVAL; out_no_root: printk("nfs_read_super: get root inode failed\n"); - goto out_free_all; + if (!IS_ERR(root_inode)) + iput(root_inode); + return -EINVAL; } /* @@ -402,13 +368,13 @@ nfs_create_client(struct nfs_server *ser /* create transport and client */ xprt = xprt_create_proto(tcp ? 
IPPROTO_TCP : IPPROTO_UDP, &server->addr, &timeparms); - if (xprt == NULL) { + if (IS_ERR(xprt)) { printk(KERN_WARNING "NFS: cannot create RPC transport.\n"); - goto out_fail; + return (struct rpc_clnt *)xprt; } clnt = rpc_create_client(xprt, server->hostname, &nfs_program, server->rpc_ops->version, data->pseudoflavor); - if (clnt == NULL) { + if (IS_ERR(clnt)) { printk(KERN_WARNING "NFS: cannot create RPC client.\n"); goto out_fail; } @@ -421,9 +387,8 @@ nfs_create_client(struct nfs_server *ser return clnt; out_fail: - if (xprt) - xprt_destroy(xprt); - return NULL; + xprt_destroy(xprt); + return clnt; } /* @@ -627,13 +592,17 @@ static int nfs_show_options(struct seq_f void nfs_zap_caches(struct inode *inode) { + struct nfs_inode *nfsi = NFS_I(inode); + int mode = inode->i_mode; + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_remote_inode(inode); - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - NFS_CACHEINV(inode); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + else + nfsi->flags |= NFS_INO_INVALID_ATTR; } /* @@ -673,9 +642,6 @@ nfs_find_actor(struct inode *inode, void return 0; if (is_bad_inode(inode)) return 0; - /* Force an attribute cache update if inode->i_count == 0 */ - if (!atomic_read(&inode->i_count)) - NFS_CACHEINV(inode); return 1; } @@ -729,7 +695,7 @@ nfs_fhget(struct super_block *sb, struct inode->i_ino = hash; /* We can't support update_atime(), since the server will reset it */ - inode->i_flags |= S_NOATIME; + inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. 
@@ -754,10 +720,6 @@ nfs_fhget(struct super_block *sb, struct inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; - nfsi->read_cache_ctime = fattr->ctime; - nfsi->read_cache_mtime = fattr->mtime; - nfsi->cache_mtime_jiffies = fattr->timestamp; - nfsi->read_cache_isize = fattr->size; if (fattr->valid & NFS_ATTR_FATTR_V4) nfsi->change_attr = fattr->change_attr; inode->i_size = nfs_size_to_loff_t(fattr->size); @@ -804,70 +766,50 @@ nfs_setattr(struct dentry *dentry, struc struct nfs_fattr fattr; int error; + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; if (attr->ia_valid == 0) return 0; lock_kernel(); - - /* - * Make sure the inode is up-to-date. - */ - error = nfs_revalidate_inode(NFS_SERVER(inode),inode); - if (error) { -#ifdef NFS_PARANOIA -printk("nfs_setattr: revalidate failed, error=%d\n", error); -#endif - goto out; - } - - if (!S_ISREG(inode->i_mode)) { - attr->ia_valid &= ~ATTR_SIZE; - if (attr->ia_valid == 0) - goto out; - } else { - filemap_fdatawrite(inode->i_mapping); - error = nfs_wb_all(inode); - filemap_fdatawait(inode->i_mapping); - if (error) - goto out; - /* Optimize away unnecessary truncates */ - if ((attr->ia_valid & ATTR_SIZE) && i_size_read(inode) == attr->ia_size) - attr->ia_valid &= ~ATTR_SIZE; + nfs_begin_data_update(inode); + /* Write all dirty data if we're changing file permissions or size */ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) { + if (filemap_fdatawrite(inode->i_mapping) == 0) + filemap_fdatawait(inode->i_mapping); + nfs_wb_all(inode); } - if (!attr->ia_valid) - goto out; - error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); - if (error) - goto out; - /* - * If we changed the size or mtime, update the inode - * now to avoid invalidating the page cache. 
- */ - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != fattr.size) - printk("nfs_setattr: attr=%Ld, fattr=%Ld??\n", - (long long) attr->ia_size, (long long)fattr.size); - vmtruncate(inode, attr->ia_size); + if (error == 0) { + nfs_refresh_inode(inode, &fattr); + if ((attr->ia_valid & ATTR_MODE) != 0) { + int mode; + mode = inode->i_mode & ~S_IALLUGO; + mode |= attr->ia_mode & S_IALLUGO; + inode->i_mode = mode; + } + if ((attr->ia_valid & ATTR_UID) != 0) + inode->i_uid = attr->ia_uid; + if ((attr->ia_valid & ATTR_GID) != 0) + inode->i_gid = attr->ia_gid; + if ((attr->ia_valid & ATTR_SIZE) != 0) { + i_size_write(inode, attr->ia_size); + vmtruncate(inode, attr->ia_size); + } } - - /* - * If we changed the size or mtime, update the inode - * now to avoid invalidating the page cache. - */ - if (!(fattr.valid & NFS_ATTR_WCC)) { - struct nfs_inode *nfsi = NFS_I(inode); - fattr.pre_size = nfsi->read_cache_isize; - fattr.pre_mtime = nfsi->read_cache_mtime; - fattr.pre_ctime = nfsi->read_cache_ctime; - fattr.valid |= NFS_ATTR_WCC; - } - /* Force an attribute cache update */ - NFS_CACHEINV(inode); - error = nfs_refresh_inode(inode, &fattr); -out: + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; + if (*cred) { + put_rpccred(*cred); + *cred = NULL; + } + } + nfs_end_data_update(inode); unlock_kernel(); return error; } @@ -895,7 +837,19 @@ nfs_wait_on_inode(struct inode *inode, i int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - int err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + struct nfs_inode *nfsi = NFS_I(inode); + int need_atime = nfsi->flags & NFS_INO_INVALID_ATIME; + int err; + + if (__IS_FLG(inode, MS_NOATIME)) + need_atime = 0; + else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + need_atime = 0; + /* We may force a getattr if the user cares about atime */ + if (need_atime) + err = 
__nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + err = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (!err) generic_fillattr(inode, stat); return err; @@ -930,8 +884,10 @@ int nfs_open(struct inode *inode, struct auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); filp->private_data = cred; - if (filp->f_mode & FMODE_WRITE) + if ((filp->f_mode & FMODE_WRITE) != 0) { nfs_set_mmcred(inode, cred); + nfs_begin_data_update(inode); + } return 0; } @@ -940,6 +896,8 @@ int nfs_release(struct inode *inode, str struct rpc_cred *cred; lock_kernel(); + if ((filp->f_mode & FMODE_WRITE) != 0) + nfs_end_data_update(inode); cred = nfs_file_cred(filp); if (cred) put_rpccred(cred); @@ -956,6 +914,9 @@ __nfs_revalidate_inode(struct nfs_server { int status = -ESTALE; struct nfs_fattr fattr; + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long verifier; + unsigned int flags; dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); @@ -965,23 +926,22 @@ __nfs_revalidate_inode(struct nfs_server goto out_nowait; if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) goto out_nowait; - if (NFS_FAKE_ROOT(inode)) { - dfprintk(VFS, "NFS: not revalidating fake root\n"); - status = 0; - goto out_nowait; - } while (NFS_REVALIDATING(inode)) { status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING); if (status < 0) goto out_nowait; - if (time_before(jiffies,NFS_READTIME(inode)+NFS_ATTRTIMEO(inode))) { - status = NFS_STALE(inode) ? -ESTALE : 0; - goto out_nowait; - } + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOAC) + continue; + if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) + continue; + status = NFS_STALE(inode) ? 
-ESTALE : 0; + goto out_nowait; } NFS_FLAGS(inode) |= NFS_INO_REVALIDATING; + /* Protect against RPC races by saving the change attribute */ + verifier = nfs_save_change_attribute(inode); status = NFS_PROTO(inode)->getattr(inode, &fattr); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -995,13 +955,36 @@ __nfs_revalidate_inode(struct nfs_server goto out; } - status = nfs_refresh_inode(inode, &fattr); + status = nfs_update_inode(inode, &fattr, verifier); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } + flags = nfsi->flags; + /* + * We may need to keep the attributes marked as invalid if + * we raced with nfs_end_data_update(). + */ + if (verifier == nfsi->cache_change_attribute) + nfsi->flags &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + /* Do the page cache invalidation */ + if (flags & NFS_INO_INVALID_DATA) { + if (S_ISREG(inode->i_mode)) { + if (filemap_fdatawrite(inode->i_mapping) == 0) + filemap_fdatawait(inode->i_mapping); + nfs_wb_all(inode); + } + nfsi->flags &= ~NFS_INO_INVALID_DATA; + invalidate_inode_pages2(inode->i_mapping); + memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + /* This ensures we revalidate dentries */ + nfsi->cache_change_attribute++; + } dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); @@ -1009,41 +992,107 @@ __nfs_revalidate_inode(struct nfs_server NFS_FLAGS(inode) &= ~NFS_INO_STALE; out: NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; - wake_up(&NFS_I(inode)->nfs_i_wait); + wake_up(&nfsi->nfs_i_wait); out_nowait: unlock_kernel(); return status; } -/* - * nfs_fattr_obsolete - Test if attribute data is newer than cached data - * @inode: inode - * @fattr: attributes to test +/**
+ * nfs_begin_data_update + * @inode: pointer to inode + * Declare that a set of operations will update file data on the server + */ +void nfs_begin_data_update(struct inode *inode) +{ + atomic_inc(&NFS_I(inode)->data_updates); +} + +/** + * nfs_end_data_update + * @inode: pointer to inode + * Declare end of the operations that will update file data + */ +void nfs_end_data_update(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + /* Mark the attribute cache for revalidation */ + nfsi->flags |= NFS_INO_INVALID_ATTR; + /* Directories and symlinks: invalidate page cache too */ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + nfsi->flags |= NFS_INO_INVALID_DATA; + nfsi->cache_change_attribute ++; + atomic_dec(&nfsi->data_updates); +} + +/** + * nfs_refresh_inode - verify consistency of the inode attribute cache + * @inode: pointer to inode + * @fattr: updated attributes * - * Avoid stuffing the attribute cache with obsolete information. - * We always accept updates if the attribute cache timed out, or if - * fattr->ctime is newer than our cached value. - * If fattr->ctime matches the cached value, we still accept the update - * if it increases the file size. + * Verifies the attribute cache. If we have just changed the attributes, + * so that fattr carries weak cache consistency data, then it may + * also update the ctime/mtime/change_attribute. */ -static inline -int nfs_fattr_obsolete(struct inode *inode, struct nfs_fattr *fattr) +int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - long cdif; + loff_t cur_size, new_isize; + int data_unstable; + + /* Are we in the process of updating data on the server?
*/ + data_unstable = nfs_caches_unstable(inode); - if (time_after(jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo)) - goto out_valid; - cdif = fattr->ctime.tv_sec - nfsi->read_cache_ctime.tv_sec; - if (cdif == 0) - cdif = fattr->ctime.tv_nsec - nfsi->read_cache_ctime.tv_nsec; - if (cdif > 0) - goto out_valid; - /* Ugh... */ - if (cdif == 0 && fattr->size > nfsi->read_cache_isize) - goto out_valid; - return -1; - out_valid: + if (fattr->valid & NFS_ATTR_FATTR_V4) { + if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 + && nfsi->change_attr == fattr->pre_change_attr) + nfsi->change_attr = fattr->change_attr; + if (!data_unstable && nfsi->change_attr != fattr->change_attr) + nfsi->flags |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + + /* Has the inode gone and changed behind our back? */ + if (nfsi->fileid != fattr->fileid + || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + return -EIO; + + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_WCC) != 0) { + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } + + /* Verify a few of the more important attributes */ + if (!data_unstable) { + if (!timespec_equal(&inode->i_mtime, &fattr->mtime) + || cur_size != new_isize) + nfsi->flags |= NFS_INO_INVALID_ATTR; + } else if (S_ISREG(inode->i_mode) && new_isize > cur_size) + nfsi->flags |= NFS_INO_INVALID_ATTR; + + /* Have any file permissions changed? */ + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) + || inode->i_uid != fattr->uid + || inode->i_gid != fattr->gid) + nfsi->flags |= NFS_INO_INVALID_ATTR; + + /* Has the link count changed? 
*/ + if (inode->i_nlink != fattr->nlink) + nfsi->flags |= NFS_INO_INVALID_ATTR; + + if (!timespec_equal(&inode->i_atime, &fattr->atime)) + nfsi->flags |= NFS_INO_INVALID_ATIME; + + nfsi->read_cache_jiffies = fattr->timestamp; return 0; } @@ -1059,65 +1108,66 @@ int nfs_fattr_obsolete(struct inode *ino * * A very similar scenario holds for the dir cache. */ -int -__nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier) { struct nfs_inode *nfsi = NFS_I(inode); __u64 new_size; loff_t new_isize; - int invalid = 0; - int mtime_update = 0; + unsigned int invalid = 0; loff_t cur_isize; + int data_unstable; - dfprintk(VFS, "NFS: refresh_inode(%s/%ld ct=%d info=0x%x)\n", - inode->i_sb->s_id, inode->i_ino, + dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", + __FUNCTION__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - /* First successful call after mount, fill real data. */ - if (NFS_FAKE_ROOT(inode)) { - dfprintk(VFS, "NFS: updating fake root\n"); - nfsi->fileid = fattr->fileid; - NFS_FLAGS(inode) &= ~NFS_INO_FAKE_ROOT; - } + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; if (nfsi->fileid != fattr->fileid) { - printk(KERN_ERR "nfs_refresh_inode: inode number mismatch\n" + printk(KERN_ERR "%s: inode number mismatch\n" "expected (%s/0x%Lx), got (%s/0x%Lx)\n", + __FUNCTION__, inode->i_sb->s_id, (long long)nfsi->fileid, inode->i_sb->s_id, (long long)fattr->fileid); goto out_err; } - /* Throw out obsolete READDIRPLUS attributes */ - if (time_before(fattr->timestamp, NFS_READTIME(inode))) - return 0; /* * Make sure the inode's type hasn't changed. 
*/ if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; - new_size = fattr->size; - new_isize = nfs_size_to_loff_t(fattr->size); - - /* Avoid races */ - if (nfs_fattr_obsolete(inode, fattr)) - goto out_nochange; - /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->timestamp; - /* - * Note: NFS_CACHE_ISIZE(inode) reflects the state of the cache. - * NOT inode->i_size!!! - */ - if (nfsi->read_cache_isize != new_size) { + /* Are we racing with known updates of the metadata on the server? */ + data_unstable = ! nfs_verify_change_attribute(inode, verifier); + + /* Check if the file size agrees */ + new_size = fattr->size; + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (cur_isize != new_size) { #ifdef NFS_DEBUG_VERBOSE printk(KERN_DEBUG "NFS: isize change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino); #endif - invalid = 1; + /* + * If we have pending writebacks, things can get + * messy. + */ + if (S_ISREG(inode->i_mode) && data_unstable) { + if (new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } + } else { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + } } /* @@ -1125,12 +1175,13 @@ __nfs_refresh_inode(struct inode *inode, * can change this value in VFS without requiring a * cache revalidation. 
*/ - if (!timespec_equal(&nfsi->read_cache_mtime, &fattr->mtime)) { + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); #ifdef NFS_DEBUG_VERBOSE printk(KERN_DEBUG "NFS: mtime change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino); #endif - invalid = 1; - mtime_update = 1; + if (!data_unstable) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } if ((fattr->valid & NFS_ATTR_FATTR_V4) @@ -1139,47 +1190,15 @@ __nfs_refresh_inode(struct inode *inode, printk(KERN_DEBUG "NFS: change_attr change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino); #endif - invalid = 1; - } - - /* Check Weak Cache Consistency data. - * If size and mtime match the pre-operation values, we can - * assume that any attribute changes were caused by our NFS - * operation, so there's no need to invalidate the caches. - */ - if ((fattr->valid & NFS_ATTR_PRE_CHANGE) - && nfsi->change_attr == fattr->pre_change_attr) { - invalid = 0; - } - else if ((fattr->valid & NFS_ATTR_WCC) - && nfsi->read_cache_isize == fattr->pre_size - && timespec_equal(&nfsi->read_cache_mtime, &fattr->pre_mtime)) { - invalid = 0; - } - - /* - * If we have pending writebacks, things can get - * messy. 
- */ - cur_isize = i_size_read(inode); - if (nfs_have_writebacks(inode) && new_isize < cur_isize) - new_isize = cur_isize; - - nfsi->read_cache_ctime = fattr->ctime; - inode->i_ctime = fattr->ctime; - inode->i_atime = fattr->atime; - - if (mtime_update) { - if (invalid) - nfsi->cache_mtime_jiffies = fattr->timestamp; - nfsi->read_cache_mtime = fattr->mtime; - inode->i_mtime = fattr->mtime; + nfsi->change_attr = fattr->change_attr; + if (!data_unstable) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } - nfsi->read_cache_isize = new_size; - i_size_write(inode, new_isize); + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - if (inode->i_mode != fattr->mode || + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || inode->i_uid != fattr->uid || inode->i_gid != fattr->gid) { struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; @@ -1187,11 +1206,9 @@ __nfs_refresh_inode(struct inode *inode, put_rpccred(*cred); *cred = NULL; } + invalid |= NFS_INO_INVALID_ATTR; } - if (fattr->valid & NFS_ATTR_FATTR_V4) - nfsi->change_attr = fattr->change_attr; - inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; @@ -1207,31 +1224,30 @@ __nfs_refresh_inode(struct inode *inode, inode->i_blocks = fattr->du.nfs2.blocks; inode->i_blksize = fattr->du.nfs2.blocksize; } - - /* Update attrtimeo value */ - if (invalid) { + + /* Update attrtimeo value if we're out of the unstable period */ + if (invalid & NFS_INO_INVALID_ATTR) { nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - invalidate_remote_inode(inode); - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); } else if (time_after(jiffies, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; } + /* Don't invalidate the data if we were 
to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; + nfsi->flags |= invalid; return 0; - out_nochange: - if (!timespec_equal(&fattr->atime, &inode->i_atime)) - inode->i_atime = fattr->atime; - return 0; out_changed: /* * Big trouble! The inode has become a different object. */ #ifdef NFS_PARANOIA - printk(KERN_DEBUG "nfs_refresh_inode: inode %ld mode changed, %07o to %07o\n", - inode->i_ino, inode->i_mode, fattr->mode); + printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", + __FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode); #endif /* * No need to worry about unhashing the dentry, as the @@ -1391,8 +1407,8 @@ static void nfs4_clear_inode(struct inod inode->i_sb->s_id, (long long)NFS_FILEID(inode), state); - list_del(&state->inode_states); - nfs4_put_open_state(state); + BUG_ON(atomic_read(&state->count) != 1); + nfs4_close_state(state, state->state); } /* Now call standard NFS clear_inode() code */ nfs_clear_inode(inode); @@ -1472,17 +1488,19 @@ static int nfs4_fill_super(struct super_ down_write(&clp->cl_sem); if (clp->cl_rpcclient == NULL) { xprt = xprt_create_proto(proto, &server->addr, &timeparms); - if (xprt == NULL) { + if (IS_ERR(xprt)) { up_write(&clp->cl_sem); printk(KERN_WARNING "NFS: cannot create RPC transport.\n"); + err = PTR_ERR(xprt); goto out_fail; } clnt = rpc_create_client(xprt, server->hostname, &nfs_program, server->rpc_ops->version, authflavour); - if (clnt == NULL) { + if (IS_ERR(clnt)) { up_write(&clp->cl_sem); printk(KERN_WARNING "NFS: cannot create RPC client.\n"); xprt_destroy(xprt); + err = PTR_ERR(clnt); goto out_fail; } clnt->cl_chatty = 1; @@ -1495,14 +1513,17 @@ static int nfs4_fill_super(struct super_ clear_bit(NFS4CLNT_OK, &clp->cl_state); list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); clnt = rpc_clone_client(clp->cl_rpcclient); - server->nfs4_state = clp; + if (!IS_ERR(clnt)) + server->nfs4_state = clp; 
up_write(&clp->cl_sem); clp = NULL; - if (clnt == NULL) { + if (IS_ERR(clnt)) { printk(KERN_WARNING "NFS: cannot create RPC client.\n"); + err = PTR_ERR(clnt); goto out_remove_list; } + err = -ENOMEM; if (server->nfs4_state->cl_idmap == NULL) { printk(KERN_WARNING "NFS: failed to create idmapper.\n"); goto out_shutdown; @@ -1601,7 +1622,7 @@ static struct super_block *nfs4_get_sb(s if (data->version != NFS4_MOUNT_VERSION) { printk("nfs warning: mount version %s than kernel\n", - data->version < NFS_MOUNT_VERSION ? "older" : "newer"); + data->version < NFS4_MOUNT_VERSION ? "older" : "newer"); } p = nfs_copy_user_string(NULL, &data->hostname, 256); @@ -1718,6 +1739,7 @@ static void init_once(void * foo, kmem_c INIT_LIST_HEAD(&nfsi->dirty); INIT_LIST_HEAD(&nfsi->commit); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + atomic_set(&nfsi->data_updates, 0); nfsi->ndirty = 0; nfsi->ncommit = 0; nfsi->npages = 0; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/mount_clnt.c linux-2.6.4-27-nfs4mount/fs/nfs/mount_clnt.c --- linux-2.6.4-pre3/fs/nfs/mount_clnt.c 2004-03-10 19:13:21.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/mount_clnt.c 2004-03-10 20:13:08.000000000 -0500 @@ -57,8 +57,9 @@ nfsroot_mount(struct sockaddr_in *addr, (unsigned)ntohl(addr->sin_addr.s_addr), path); sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr->sin_addr.s_addr)); - if (!(mnt_clnt = mnt_create(hostname, addr, version, protocol))) - return -EACCES; + mnt_clnt = mnt_create(hostname, addr, version, protocol); + if (IS_ERR(mnt_clnt)) + return PTR_ERR(mnt_clnt); call = (version == NFS_MNT3_VERSION) ? 
MOUNTPROC3_MNT : MNTPROC_MNT; status = rpc_call(mnt_clnt, call, path, &result, 0); @@ -72,13 +73,14 @@ mnt_create(char *hostname, struct sockad struct rpc_xprt *xprt; struct rpc_clnt *clnt; - if (!(xprt = xprt_create_proto(protocol, srvaddr, NULL))) - return NULL; + xprt = xprt_create_proto(protocol, srvaddr, NULL); + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; clnt = rpc_create_client(xprt, hostname, &mnt_program, version, - RPC_AUTH_NULL); - if (!clnt) { + RPC_AUTH_UNIX); + if (IS_ERR(clnt)) { xprt_destroy(xprt); } else { clnt->cl_softrtry = 1; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs2xdr.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs2xdr.c --- linux-2.6.4-pre3/fs/nfs/nfs2xdr.c 2004-03-10 19:21:16.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs2xdr.c 2004-03-10 20:13:14.000000000 -0500 @@ -36,33 +36,33 @@ extern int nfs_stat_to_errno(int stat) * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ -#define NFS_fhandle_sz 8 -#define NFS_sattr_sz 8 -#define NFS_filename_sz 1+(NFS2_MAXNAMLEN>>2) -#define NFS_path_sz 1+(NFS2_MAXPATHLEN>>2) -#define NFS_fattr_sz 17 -#define NFS_info_sz 5 -#define NFS_entry_sz NFS_filename_sz+3 - -#define NFS_diropargs_sz NFS_fhandle_sz+NFS_filename_sz -#define NFS_sattrargs_sz NFS_fhandle_sz+NFS_sattr_sz -#define NFS_readlinkargs_sz NFS_fhandle_sz -#define NFS_readargs_sz NFS_fhandle_sz+3 -#define NFS_writeargs_sz NFS_fhandle_sz+4 -#define NFS_createargs_sz NFS_diropargs_sz+NFS_sattr_sz -#define NFS_renameargs_sz NFS_diropargs_sz+NFS_diropargs_sz -#define NFS_linkargs_sz NFS_fhandle_sz+NFS_diropargs_sz -#define NFS_symlinkargs_sz NFS_diropargs_sz+NFS_path_sz+NFS_sattr_sz -#define NFS_readdirargs_sz NFS_fhandle_sz+2 - -#define NFS_attrstat_sz 1+NFS_fattr_sz -#define NFS_diropres_sz 1+NFS_fhandle_sz+NFS_fattr_sz -#define NFS_readlinkres_sz 1 -#define NFS_readres_sz 1+NFS_fattr_sz+1 -#define NFS_writeres_sz NFS_attrstat_sz -#define NFS_stat_sz 1 -#define 
NFS_readdirres_sz 1 -#define NFS_statfsres_sz 1+NFS_info_sz +#define NFS_fhandle_sz (8) +#define NFS_sattr_sz (8) +#define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) +#define NFS_path_sz (1+(NFS2_MAXPATHLEN>>2)) +#define NFS_fattr_sz (17) +#define NFS_info_sz (5) +#define NFS_entry_sz (NFS_filename_sz+3) + +#define NFS_diropargs_sz (NFS_fhandle_sz+NFS_filename_sz) +#define NFS_sattrargs_sz (NFS_fhandle_sz+NFS_sattr_sz) +#define NFS_readlinkargs_sz (NFS_fhandle_sz) +#define NFS_readargs_sz (NFS_fhandle_sz+3) +#define NFS_writeargs_sz (NFS_fhandle_sz+4) +#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) +#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) +#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+NFS_path_sz+NFS_sattr_sz) +#define NFS_readdirargs_sz (NFS_fhandle_sz+2) + +#define NFS_attrstat_sz (1+NFS_fattr_sz) +#define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) +#define NFS_readlinkres_sz (1) +#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_writeres_sz (NFS_attrstat_sz) +#define NFS_stat_sz (1) +#define NFS_readdirres_sz (1) +#define NFS_statfsres_sz (1+NFS_info_sz) /* * Common NFS XDR functions as inlines diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs3proc.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs3proc.c --- linux-2.6.4-pre3/fs/nfs/nfs3proc.c 2004-03-10 19:24:32.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs3proc.c 2004-03-10 20:14:30.000000000 -0500 @@ -68,20 +68,6 @@ nfs3_async_handle_jukebox(struct rpc_tas return 1; } -static void -nfs3_write_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) -{ - if (fattr->valid & NFS_ATTR_FATTR) { - if (!(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - nfs_refresh_inode(inode, fattr); - } -} - static struct rpc_cred * nfs_cred(struct inode 
*inode, struct file *filp) { @@ -99,14 +85,18 @@ nfs_cred(struct inode *inode, struct fil */ static int nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fsinfo *info) { int status; - dprintk("NFS call getroot\n"); - fattr->valid = 0; - status = rpc_call(server->client, NFS3PROC_GETATTR, fhandle, fattr, 0); - dprintk("NFS reply getroot\n"); + dprintk("%s: call fsinfo\n", __FUNCTION__); + info->fattr->valid = 0; + status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); + dprintk("%s: reply fsinfo %d\n", __FUNCTION__, status); + if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); + dprintk("%s: reply getattr %d\n", __FUNCTION__, status); + } return status; } @@ -280,7 +270,7 @@ nfs3_proc_write(struct nfs_write_data *w msg.rpc_cred = nfs_cred(inode, filp); status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags); if (status >= 0) - nfs3_write_refresh_inode(inode, fattr); + nfs_refresh_inode(inode, fattr); dprintk("NFS reply write: %d\n", status); return status < 0? 
status : wdata->res.count; } @@ -303,7 +293,7 @@ nfs3_proc_commit(struct nfs_write_data * msg.rpc_cred = nfs_cred(inode, filp); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status >= 0) - nfs3_write_refresh_inode(inode, fattr); + nfs_refresh_inode(inode, fattr); dprintk("NFS reply commit: %d\n", status); return status; } @@ -739,11 +729,10 @@ nfs3_read_done(struct rpc_task *task) } static void -nfs3_proc_read_setup(struct nfs_read_data *data, unsigned int count) +nfs3_proc_read_setup(struct nfs_read_data *data) { struct rpc_task *task = &data->task; struct inode *inode = data->inode; - struct nfs_page *req; int flags; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READ], @@ -751,47 +740,33 @@ nfs3_proc_read_setup(struct nfs_read_dat .rpc_resp = &data->res, .rpc_cred = data->cred, }; - - req = nfs_list_entry(data->pages.next); - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.pages = data->pagevec; - data->args.count = count; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - + /* N.B. Do we need to test? Never called for swapfile inode */ flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. 
*/ rpc_init_task(task, NFS_CLIENT(inode), nfs3_read_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_readdata_release; - - rpc_call_setup(&data->task, &msg, 0); + rpc_call_setup(task, &msg, 0); } static void nfs3_write_done(struct rpc_task *task) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data; if (nfs3_async_handle_jukebox(task)) return; + data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) - nfs3_write_refresh_inode(data->inode, data->res.fattr); + nfs_refresh_inode(data->inode, data->res.fattr); nfs_writeback_done(task); } static void -nfs3_proc_write_setup(struct nfs_write_data *data, unsigned int count, int how) +nfs3_proc_write_setup(struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct inode *inode = data->inode; - struct nfs_page *req; int stable; int flags; struct rpc_message msg = { @@ -808,44 +783,31 @@ nfs3_proc_write_setup(struct nfs_write_d stable = NFS_DATA_SYNC; } else stable = NFS_UNSTABLE; - - req = nfs_list_entry(data->pages.next); - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.count = count; data->args.stable = stable; - data->args.pages = data->pagevec; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; /* Set the initial flags for the task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ rpc_init_task(task, NFS_CLIENT(inode), nfs3_write_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_writedata_release; - - rpc_call_setup(&data->task, &msg, 0); + rpc_call_setup(task, &msg, 0); } static void nfs3_commit_done(struct rpc_task *task) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data; if (nfs3_async_handle_jukebox(task)) return; + data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) - nfs3_write_refresh_inode(data->inode, data->res.fattr); + nfs_refresh_inode(data->inode, data->res.fattr); nfs_commit_done(task); } static void -nfs3_proc_commit_setup(struct nfs_write_data *data, u64 start, u32 len, int how) +nfs3_proc_commit_setup(struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct inode *inode = data->inode; @@ -857,23 +819,12 @@ nfs3_proc_commit_setup(struct nfs_write_ .rpc_cred = data->cred, }; - data->args.fh = NFS_FH(data->inode); - data->args.offset = start; - data->args.count = len; - data->res.count = len; - data->res.fattr = &data->fattr; - data->res.verf = &data->verf; - /* Set the initial flags for the task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ rpc_init_task(task, NFS_CLIENT(inode), nfs3_commit_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_commit_release; - - rpc_call_setup(&data->task, &msg, 0); + rpc_call_setup(task, &msg, 0); } /* diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs3xdr.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs3xdr.c --- linux-2.6.4-pre3/fs/nfs/nfs3xdr.c 2004-03-10 19:40:57.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs3xdr.c 2004-03-10 20:13:48.000000000 -0500 @@ -33,51 +33,51 @@ extern int nfs_stat_to_errno(int); * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ -#define NFS3_fhandle_sz 1+16 -#define NFS3_fh_sz NFS3_fhandle_sz /* shorthand */ -#define NFS3_sattr_sz 15 -#define NFS3_filename_sz 1+(NFS3_MAXNAMLEN>>2) -#define NFS3_path_sz 1+(NFS3_MAXPATHLEN>>2) -#define NFS3_fattr_sz 21 -#define NFS3_wcc_attr_sz 6 -#define NFS3_pre_op_attr_sz 1+NFS3_wcc_attr_sz -#define NFS3_post_op_attr_sz 1+NFS3_fattr_sz -#define NFS3_wcc_data_sz NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz +#define NFS3_fhandle_sz (1+16) +#define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_sattr_sz (15) +#define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) +#define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) +#define NFS3_fattr_sz (21) +#define NFS3_wcc_attr_sz (6) +#define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) +#define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) #define NFS3_fsstat_sz #define NFS3_fsinfo_sz #define NFS3_pathconf_sz -#define NFS3_entry_sz NFS3_filename_sz+3 +#define NFS3_entry_sz (NFS3_filename_sz+3) -#define NFS3_sattrargs_sz NFS3_fh_sz+NFS3_sattr_sz+3 -#define NFS3_diropargs_sz NFS3_fh_sz+NFS3_filename_sz -#define NFS3_accessargs_sz NFS3_fh_sz+1 -#define NFS3_readlinkargs_sz NFS3_fh_sz -#define NFS3_readargs_sz NFS3_fh_sz+3 -#define NFS3_writeargs_sz NFS3_fh_sz+5 -#define NFS3_createargs_sz 
NFS3_diropargs_sz+NFS3_sattr_sz -#define NFS3_mkdirargs_sz NFS3_diropargs_sz+NFS3_sattr_sz -#define NFS3_symlinkargs_sz NFS3_diropargs_sz+NFS3_path_sz+NFS3_sattr_sz -#define NFS3_mknodargs_sz NFS3_diropargs_sz+2+NFS3_sattr_sz -#define NFS3_renameargs_sz NFS3_diropargs_sz+NFS3_diropargs_sz -#define NFS3_linkargs_sz NFS3_fh_sz+NFS3_diropargs_sz -#define NFS3_readdirargs_sz NFS3_fh_sz+2 -#define NFS3_commitargs_sz NFS3_fh_sz+3 - -#define NFS3_attrstat_sz 1+NFS3_fattr_sz -#define NFS3_wccstat_sz 1+NFS3_wcc_data_sz -#define NFS3_lookupres_sz 1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz) -#define NFS3_accessres_sz 1+NFS3_post_op_attr_sz+1 -#define NFS3_readlinkres_sz 1+NFS3_post_op_attr_sz -#define NFS3_readres_sz 1+NFS3_post_op_attr_sz+3 -#define NFS3_writeres_sz 1+NFS3_wcc_data_sz+4 -#define NFS3_createres_sz 1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz -#define NFS3_renameres_sz 1+(2 * NFS3_wcc_data_sz) -#define NFS3_linkres_sz 1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz -#define NFS3_readdirres_sz 1+NFS3_post_op_attr_sz+2 -#define NFS3_fsstatres_sz 1+NFS3_post_op_attr_sz+13 -#define NFS3_fsinfores_sz 1+NFS3_post_op_attr_sz+12 -#define NFS3_pathconfres_sz 1+NFS3_post_op_attr_sz+6 -#define NFS3_commitres_sz 1+NFS3_wcc_data_sz+2 +#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) +#define NFS3_accessargs_sz (NFS3_fh_sz+1) +#define NFS3_readlinkargs_sz (NFS3_fh_sz) +#define NFS3_readargs_sz (NFS3_fh_sz+3) +#define NFS3_writeargs_sz (NFS3_fh_sz+5) +#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) +#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+NFS3_path_sz+NFS3_sattr_sz) +#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) +#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) +#define NFS3_readdirargs_sz (NFS3_fh_sz+2) +#define NFS3_commitargs_sz (NFS3_fh_sz+3) + 
+#define NFS3_attrstat_sz (1+NFS3_fattr_sz) +#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) +#define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) +#define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) +#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) +#define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) +#define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) +#define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) +#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) /* * Map file type to S_IFMT bits @@ -103,9 +103,7 @@ static struct { static inline u32 * xdr_encode_fhandle(u32 *p, struct nfs_fh *fh) { - *p++ = htonl(fh->size); - memcpy(p, fh->data, fh->size); - return p + XDR_QUADLEN(fh->size); + return xdr_encode_array(p, fh->data, fh->size); } static inline u32 * diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs4proc.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs4proc.c --- linux-2.6.4-pre3/fs/nfs/nfs4proc.c 2004-03-10 19:32:31.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs4proc.c 2004-03-10 20:14:30.000000000 -0500 @@ -54,12 +54,24 @@ #define GET_OP(cp,name) &cp->ops[cp->req_nops].u.name #define OPNUM(cp) cp->ops[cp->req_nops].opnum +static int nfs4_proc_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *); extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; extern nfs4_stateid zero_stateid; +/* Prevent leaks of NFSv4 errors into userland */ +static inline int nfs4_map_errors(int err) +{ + if (err < -1000) { + 
printk(KERN_WARNING "%s could not handle NFSv4 error %d\n", + __FUNCTION__, -err); + return -EIO; + } + return err; +} + static void nfs4_setup_compound(struct nfs4_compound *cp, struct nfs4_op *ops, struct nfs_server *server, char *tag) @@ -505,6 +517,8 @@ nfs4_open_reclaim(struct nfs4_state_owne status = rpc_call_sync(server->client, &msg, 0); nfs4_increment_seqid(status, sp); + if (status == 0) + memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); /* Update the inode attributes */ nfs_refresh_inode(inode, &fattr); return status; @@ -689,12 +703,12 @@ nfs4_do_setattr(struct nfs_server *serve retry: fattr->valid = 0; - if (state) + if (sattr->ia_valid & ATTR_SIZE) nfs4_copy_stateid(&arg.stateid, state, 0); - else + else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - status = rpc_call_sync(server->client, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); if (status) { status = nfs4_handle_error(server, status); if (!status) @@ -822,10 +836,11 @@ nfs4_open_revalidate(struct inode *dir, static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fsinfo *info) { struct nfs4_compound compound; struct nfs4_op ops[4]; + struct nfs_fattr * fattr = info->fattr; unsigned char * p; struct qstr q; int status; @@ -869,7 +884,9 @@ nfs4_proc_get_root(struct nfs_server *se break; } out: - return status; + if (status) + return nfs4_map_errors(status); + return nfs4_proc_fsinfo(server, fhandle, info); } static int @@ -883,7 +900,7 @@ nfs4_proc_getattr(struct inode *inode, s nfs4_setup_compound(&compound, ops, NFS_SERVER(inode), "getattr"); nfs4_setup_putfh(&compound, NFS_FH(inode)); nfs4_setup_getattr(&compound, fattr); - return nfs4_call_compound(&compound, NULL, 0); + return nfs4_map_errors(nfs4_call_compound(&compound, NULL, 0)); } /* @@ -969,7 +986,7 @@ nfs4_proc_lookup(struct inode *dir, stru if (status >= 0) status = nfs_refresh_inode(dir, &dir_attr); - return status; + return 
nfs4_map_errors(status); } static int @@ -1016,7 +1033,7 @@ nfs4_proc_access(struct inode *inode, st else if (req_access != resp_access) status = -EACCES; } - return status; + return nfs4_map_errors(status); } /* @@ -1052,7 +1069,7 @@ nfs4_proc_readlink(struct inode *inode, nfs4_setup_compound(&compound, ops, NFS_SERVER(inode), "readlink"); nfs4_setup_putfh(&compound, NFS_FH(inode)); nfs4_setup_readlink(&compound, PAGE_CACHE_SIZE, &page); - return nfs4_call_compound(&compound, NULL, 0); + return nfs4_map_errors(nfs4_call_compound(&compound, NULL, 0)); } static int @@ -1079,23 +1096,19 @@ nfs4_proc_read(struct nfs_read_data *rda if (filp) { struct nfs4_state *state; state = (struct nfs4_state *)filp->private_data; - nfs4_copy_stateid(&rdata->args.stateid, state, rdata->lockowner); + rdata->args.state = state; msg.rpc_cred = state->owner->so_cred; } else { - memcpy(&rdata->args.stateid, &zero_stateid, sizeof(rdata->args.stateid)); + rdata->args.state = NULL; msg.rpc_cred = NFS_I(inode)->mm_cred; } fattr->valid = 0; status = rpc_call_sync(server->client, &msg, flags); - if (!status) { + if (!status) renew_lease(server, timestamp); - /* Check cache consistency */ - if (fattr->change_attr != NFS_CHANGE_ATTR(inode)) - nfs_zap_caches(inode); - } dprintk("NFS reply read: %d\n", status); - return status; + return nfs4_map_errors(status); } static int @@ -1121,18 +1134,17 @@ nfs4_proc_write(struct nfs_write_data *w if (filp) { struct nfs4_state *state; state = (struct nfs4_state *)filp->private_data; - nfs4_copy_stateid(&wdata->args.stateid, state, wdata->lockowner); + wdata->args.state = state; msg.rpc_cred = state->owner->so_cred; } else { - memcpy(&wdata->args.stateid, &zero_stateid, sizeof(wdata->args.stateid)); + wdata->args.state = NULL; msg.rpc_cred = NFS_I(inode)->mm_cred; } fattr->valid = 0; status = rpc_call_sync(server->client, &msg, rpcflags); - NFS_CACHEINV(inode); dprintk("NFS reply write: %d\n", status); - return status; + return nfs4_map_errors(status); } 
static int @@ -1154,20 +1166,15 @@ nfs4_proc_commit(struct nfs_write_data * /* * Try first to use O_WRONLY, then O_RDWR stateid. */ - if (filp) { - struct nfs4_state *state; - state = (struct nfs4_state *)filp->private_data; - nfs4_copy_stateid(&cdata->args.stateid, state, cdata->lockowner); - msg.rpc_cred = state->owner->so_cred; - } else { - memcpy(&cdata->args.stateid, &zero_stateid, sizeof(cdata->args.stateid)); + if (filp) + msg.rpc_cred = ((struct nfs4_state *)filp->private_data)->owner->so_cred; + else msg.rpc_cred = NFS_I(inode)->mm_cred; - } fattr->valid = 0; status = rpc_call_sync(server->client, &msg, 0); dprintk("NFS reply commit: %d\n", status); - return status; + return nfs4_map_errors(status); } /* @@ -1234,7 +1241,7 @@ nfs4_proc_remove(struct inode *dir, stru process_cinfo(&dir_cinfo, &dir_attr); nfs_refresh_inode(dir, &dir_attr); } - return status; + return nfs4_map_errors(status); } struct unlink_desc { @@ -1312,7 +1319,7 @@ nfs4_proc_rename(struct inode *old_dir, nfs_refresh_inode(old_dir, &old_dir_attr); nfs_refresh_inode(new_dir, &new_dir_attr); } - return status; + return nfs4_map_errors(status); } static int @@ -1342,7 +1349,7 @@ nfs4_proc_link(struct inode *inode, stru nfs_refresh_inode(dir, &dir_attr); nfs_refresh_inode(inode, &fattr); } - return status; + return nfs4_map_errors(status); } static int @@ -1373,7 +1380,7 @@ nfs4_proc_symlink(struct inode *dir, str process_cinfo(&dir_cinfo, &dir_attr); nfs_refresh_inode(dir, &dir_attr); } - return status; + return nfs4_map_errors(status); } static int @@ -1403,7 +1410,7 @@ nfs4_proc_mkdir(struct inode *dir, struc process_cinfo(&dir_cinfo, &dir_attr); nfs_refresh_inode(dir, &dir_attr); } - return status; + return nfs4_map_errors(status); } static int @@ -1421,9 +1428,11 @@ nfs4_proc_readdir(struct dentry *dentry, nfs4_setup_putfh(&compound, NFS_FH(dir)); nfs4_setup_readdir(&compound, cookie, NFS_COOKIEVERF(dir), &page, count, dentry); status = nfs4_call_compound(&compound, cred, 0); + if 
(status == 0) + memcpy(NFS_COOKIEVERF(dir), ops[1].u.readdir.rd_resp_verifier.data, NFS4_VERIFIER_SIZE); unlock_kernel(); - return status; + return nfs4_map_errors(status); } static int @@ -1453,7 +1462,7 @@ nfs4_proc_mknod(struct inode *dir, struc process_cinfo(&dir_cinfo, &dir_attr); nfs_refresh_inode(dir, &dir_attr); } - return status; + return nfs4_map_errors(status); } static int @@ -1463,11 +1472,10 @@ nfs4_proc_statfs(struct nfs_server *serv struct nfs4_compound compound; struct nfs4_op ops[2]; - memset(fsstat, 0, sizeof(*fsstat)); nfs4_setup_compound(&compound, ops, server, "statfs"); nfs4_setup_putfh(&compound, fhandle); nfs4_setup_statfs(&compound, fsstat); - return nfs4_call_compound(&compound, NULL, 0); + return nfs4_map_errors(nfs4_call_compound(&compound, NULL, 0)); } static int @@ -1480,8 +1488,7 @@ nfs4_proc_fsinfo(struct nfs_server *serv .rpc_resp = fsinfo, }; - memset(fsinfo, 0, sizeof(*fsinfo)); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0)); } static int @@ -1491,25 +1498,10 @@ nfs4_proc_pathconf(struct nfs_server *se struct nfs4_compound compound; struct nfs4_op ops[2]; - memset(pathconf, 0, sizeof(*pathconf)); nfs4_setup_compound(&compound, ops, server, "statfs"); nfs4_setup_putfh(&compound, fhandle); nfs4_setup_pathconf(&compound, pathconf); - return nfs4_call_compound(&compound, NULL, 0); -} - -static void -nfs4_restart_read(struct rpc_task *task) -{ - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; - struct nfs_page *req; - - rpc_restart_call(task); - req = nfs_list_entry(data->pages.next); - if (req->wb_state) - nfs4_copy_stateid(&data->args.stateid, req->wb_state, req->wb_lockowner); - else - memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); + return nfs4_map_errors(nfs4_call_compound(&compound, NULL, 0)); } static void @@ -1517,25 +1509,19 @@ nfs4_read_done(struct rpc_task *task) { struct nfs_read_data *data = (struct 
nfs_read_data *) task->tk_calldata; struct inode *inode = data->inode; - struct nfs_fattr *fattr = data->res.fattr; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { - task->tk_action = nfs4_restart_read; + rpc_restart_call(task); return; } if (task->tk_status > 0) renew_lease(NFS_SERVER(inode), data->timestamp); - /* Check cache consistency */ - if (fattr->change_attr != NFS_CHANGE_ATTR(inode)) - nfs_zap_caches(inode); - if (fattr->bitmap[1] & FATTR4_WORD1_TIME_ACCESS) - inode->i_atime = fattr->atime; /* Call back common NFS readpage processing */ nfs_readpage_result(task); } static void -nfs4_proc_read_setup(struct nfs_read_data *data, unsigned int count) +nfs4_proc_read_setup(struct nfs_read_data *data) { struct rpc_task *task = &data->task; struct rpc_message msg = { @@ -1545,85 +1531,36 @@ nfs4_proc_read_setup(struct nfs_read_dat .rpc_cred = data->cred, }; struct inode *inode = data->inode; - struct nfs_page *req = nfs_list_entry(data->pages.next); int flags; - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.pages = data->pagevec; - data->args.count = count; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; data->timestamp = jiffies; - data->lockowner = req->wb_lockowner; - if (req->wb_state) - nfs4_copy_stateid(&data->args.stateid, req->wb_state, req->wb_lockowner); - else - memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); - /* N.B. Do we need to test? Never called for swapfile inode */ flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. 
*/ rpc_init_task(task, NFS_CLIENT(inode), nfs4_read_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_readdata_release; - rpc_call_setup(task, &msg, 0); } static void -nfs4_write_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) -{ - /* Check cache consistency */ - if (fattr->pre_change_attr != NFS_CHANGE_ATTR(inode)) - nfs_zap_caches(inode); - NFS_CHANGE_ATTR(inode) = fattr->change_attr; - if (fattr->bitmap[1] & FATTR4_WORD1_SPACE_USED) - inode->i_blocks = (fattr->du.nfs3.used + 511) >> 9; - if (fattr->bitmap[1] & FATTR4_WORD1_TIME_METADATA) - inode->i_ctime = fattr->ctime; - if (fattr->bitmap[1] & FATTR4_WORD1_TIME_MODIFY) - inode->i_mtime = fattr->mtime; -} - -static void -nfs4_restart_write(struct rpc_task *task) -{ - struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; - struct nfs_page *req; - - rpc_restart_call(task); - req = nfs_list_entry(data->pages.next); - if (req->wb_state) - nfs4_copy_stateid(&data->args.stateid, req->wb_state, req->wb_lockowner); - else - memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); -} - -static void nfs4_write_done(struct rpc_task *task) { struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { - task->tk_action = nfs4_restart_write; + rpc_restart_call(task); return; } if (task->tk_status >= 0) renew_lease(NFS_SERVER(inode), data->timestamp); - nfs4_write_refresh_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ nfs_writeback_done(task); } static void -nfs4_proc_write_setup(struct nfs_write_data *data, unsigned int count, int how) +nfs4_proc_write_setup(struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct rpc_message msg = { @@ -1633,7 +1570,6 @@ nfs4_proc_write_setup(struct nfs_write_d .rpc_cred = data->cred, }; struct inode *inode = data->inode; - struct 
nfs_page *req = nfs_list_entry(data->pages.next); int stable; int flags; @@ -1644,33 +1580,15 @@ nfs4_proc_write_setup(struct nfs_write_d stable = NFS_DATA_SYNC; } else stable = NFS_UNSTABLE; - - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.count = count; data->args.stable = stable; - data->args.pages = data->pagevec; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; - data->timestamp = jiffies; - data->lockowner = req->wb_lockowner; - if (req->wb_state) - nfs4_copy_stateid(&data->args.stateid, req->wb_state, req->wb_lockowner); - else - memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); + data->timestamp = jiffies; /* Set the initial flags for the task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ rpc_init_task(task, NFS_CLIENT(inode), nfs4_write_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_writedata_release; - rpc_call_setup(task, &msg, 0); } @@ -1681,16 +1599,15 @@ nfs4_commit_done(struct rpc_task *task) struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { - task->tk_action = nfs4_restart_write; + rpc_restart_call(task); return; } - nfs4_write_refresh_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ nfs_commit_done(task); } static void -nfs4_proc_commit_setup(struct nfs_write_data *data, u64 start, u32 len, int how) +nfs4_proc_commit_setup(struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct rpc_message msg = { @@ -1702,22 +1619,11 @@ nfs4_proc_commit_setup(struct nfs_write_ struct inode *inode = data->inode; int flags; - data->args.fh = NFS_FH(data->inode); - data->args.offset = start; - data->args.count = len; - data->res.count = len; - data->res.fattr = &data->fattr; - data->res.verf = &data->verf; - /* Set the initial flags for the 
task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ rpc_init_task(task, NFS_CLIENT(inode), nfs4_commit_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_commit_release; - rpc_call_setup(task, &msg, 0); } @@ -1807,6 +1713,7 @@ nfs4_proc_file_open(struct inode *inode, if (filp->f_mode & FMODE_WRITE) { lock_kernel(); nfs_set_mmcred(inode, state->owner->so_cred); + nfs_begin_data_update(inode); unlock_kernel(); } filp->private_data = state; @@ -1823,6 +1730,11 @@ nfs4_proc_file_release(struct inode *ino if (state) nfs4_close_state(state, filp->f_mode); + if (filp->f_mode & FMODE_WRITE) { + lock_kernel(); + nfs_end_data_update(inode); + unlock_kernel(); + } return 0; } @@ -1850,7 +1762,7 @@ nfs4_async_handle_error(struct rpc_task { struct nfs4_client *clp = server->nfs4_state; - if (!clp) + if (!clp || task->tk_status >= 0) return 0; switch(task->tk_status) { case -NFS4ERR_STALE_CLIENTID: @@ -1869,6 +1781,7 @@ nfs4_async_handle_error(struct rpc_task task->tk_status = 0; return -EAGAIN; } + task->tk_status = nfs4_map_errors(task->tk_status); return 0; } @@ -1946,16 +1859,9 @@ nfs4_handle_error(struct nfs_server *ser break; case -NFS4ERR_OLD_STATEID: ret = 0; - break; - default: - if (errorcode <= -1000) { - printk(KERN_WARNING "%s could not handle NFSv4 error %d\n", - __FUNCTION__, -errorcode); - ret = -EIO; - } } /* We failed to handle the error */ - return ret; + return nfs4_map_errors(ret); } @@ -2130,7 +2036,7 @@ nfs4_proc_getlk(struct nfs4_state *state if (lsp) nfs4_put_lock_state(lsp); up(&state->lock_sema); - return status; + return nfs4_map_errors(status); } int @@ -2175,7 +2081,7 @@ nfs4_proc_unlck(struct nfs4_state *state nfs4_put_lock_state(lsp); out: up(&state->lock_sema); - return status; + return nfs4_map_errors(status); } static int @@ -2251,7 +2157,7 @@ nfs4_proc_setlk(struct nfs4_state *state nfs4_put_lock_state(lsp); out: up(&state->lock_sema); - return status; + return 
nfs4_map_errors(status); } static int diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs4state.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs4state.c --- linux-2.6.4-pre3/fs/nfs/nfs4state.c 2004-03-10 19:12:14.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs4state.c 2004-03-10 20:15:20.000000000 -0500 @@ -105,7 +105,7 @@ nfs4_alloc_client(struct in_addr *addr) INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); INIT_LIST_HEAD(&clp->cl_superblocks); init_waitqueue_head(&clp->cl_waitq); - INIT_RPC_WAITQ(&clp->cl_rpcwaitq, "NFS4 client"); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); clp->cl_state = 1 << NFS4CLNT_NEW; } return clp; @@ -411,18 +411,20 @@ out: return state; } -void -nfs4_put_open_state(struct nfs4_state *state) +static void +__nfs4_put_open_state(struct nfs4_state *state) { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; int status = 0; - if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) { + up(&owner->so_sema); return; - list_del(&state->inode_states); + } + if (!list_empty(&state->inode_states)) + list_del(&state->inode_states); spin_unlock(&inode->i_lock); - down(&owner->so_sema); list_del(&state->open_states); if (state->state != 0) { do { @@ -440,6 +442,13 @@ nfs4_put_open_state(struct nfs4_state *s } void +nfs4_put_open_state(struct nfs4_state *state) +{ + down(&state->owner->so_sema); + __nfs4_put_open_state(state); +} + +void nfs4_close_state(struct nfs4_state *state, mode_t mode) { struct inode *inode = state->inode; @@ -479,8 +488,7 @@ nfs4_close_state(struct nfs4_state *stat status = nfs4_handle_error(NFS_SERVER(inode), status); down(&owner->so_sema); } while (!status); - up(&owner->so_sema); - nfs4_put_open_state(state); + __nfs4_put_open_state(state); } /* @@ -790,7 +798,7 @@ reclaimer(void *ptr) restart_loop: spin_lock(&clp->cl_lock); list_for_each_entry(sp, &clp->cl_state_owners, so_list) { - if 
(sp->so_generation - generation <= 0) + if (sp->so_generation - generation >= 0) continue; atomic_inc(&sp->so_count); spin_unlock(&clp->cl_lock); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/nfs4xdr.c linux-2.6.4-27-nfs4mount/fs/nfs/nfs4xdr.c --- linux-2.6.4-pre3/fs/nfs/nfs4xdr.c 2004-03-10 19:40:07.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/nfs4xdr.c 2004-03-10 20:14:30.000000000 -0500 @@ -69,84 +69,84 @@ static int nfs_stat_to_errno(int); /* lock,open owner id: * we currently use size 1 (u32) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define owner_id_maxsz 1 + 1 -#define compound_encode_hdr_maxsz 3 + (NFS4_MAXTAGLEN >> 2) -#define compound_decode_hdr_maxsz 2 + (NFS4_MAXTAGLEN >> 2) -#define op_encode_hdr_maxsz 1 -#define op_decode_hdr_maxsz 2 -#define encode_putfh_maxsz op_encode_hdr_maxsz + 1 + \ - (NFS4_FHSIZE >> 2) -#define decode_putfh_maxsz op_decode_hdr_maxsz -#define encode_putrootfh_maxsz op_encode_hdr_maxsz -#define decode_putrootfh_maxsz op_decode_hdr_maxsz -#define encode_getfh_maxsz op_encode_hdr_maxsz -#define decode_getfh_maxsz op_decode_hdr_maxsz + 1 + \ - (NFS4_FHSIZE >> 2) -#define encode_getattr_maxsz op_encode_hdr_maxsz + 3 -#define nfs4_fattr_bitmap_maxsz 26 + 2 * ((NFS4_MAXNAMLEN +1) >> 2) -#define decode_getattr_maxsz op_decode_hdr_maxsz + 3 + \ - nfs4_fattr_bitmap_maxsz -#define encode_savefh_maxsz op_encode_hdr_maxsz -#define decode_savefh_maxsz op_decode_hdr_maxsz -#define encode_restorefh_maxsz op_encode_hdr_maxsz -#define decode_restorefh_maxsz op_decode_hdr_maxsz -#define encode_read_getattr_maxsz op_encode_hdr_maxsz + 2 -#define decode_read_getattr_maxsz op_decode_hdr_maxsz + 8 -#define encode_pre_write_getattr_maxsz op_encode_hdr_maxsz + 2 -#define decode_pre_write_getattr_maxsz op_decode_hdr_maxsz + 5 -#define encode_post_write_getattr_maxsz op_encode_hdr_maxsz + 2 -#define decode_post_write_getattr_maxsz op_decode_hdr_maxsz + 13 -#define encode_fsinfo_maxsz op_encode_hdr_maxsz + 2 -#define 
decode_fsinfo_maxsz op_decode_hdr_maxsz + 11 -#define encode_renew_maxsz op_encode_hdr_maxsz + 3 -#define decode_renew_maxsz op_decode_hdr_maxsz +#define owner_id_maxsz (1 + 1) +#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) +#define op_encode_hdr_maxsz (1) +#define op_decode_hdr_maxsz (2) +#define encode_putfh_maxsz (op_encode_hdr_maxsz + 1 + \ + (NFS4_FHSIZE >> 2)) +#define decode_putfh_maxsz (op_decode_hdr_maxsz) +#define encode_putrootfh_maxsz (op_encode_hdr_maxsz) +#define decode_putrootfh_maxsz (op_decode_hdr_maxsz) +#define encode_getfh_maxsz (op_encode_hdr_maxsz) +#define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ + (NFS4_FHSIZE >> 2)) +#define encode_getattr_maxsz (op_encode_hdr_maxsz + 3) +#define nfs4_fattr_bitmap_maxsz (26 + 2 * ((NFS4_MAXNAMLEN +1) >> 2)) +#define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \ + nfs4_fattr_bitmap_maxsz) +#define encode_savefh_maxsz (op_encode_hdr_maxsz) +#define decode_savefh_maxsz (op_decode_hdr_maxsz) +#define encode_restorefh_maxsz (op_encode_hdr_maxsz) +#define decode_restorefh_maxsz (op_decode_hdr_maxsz) +#define encode_read_getattr_maxsz (op_encode_hdr_maxsz + 2) +#define decode_read_getattr_maxsz (op_decode_hdr_maxsz + 8) +#define encode_pre_write_getattr_maxsz (op_encode_hdr_maxsz + 2) +#define decode_pre_write_getattr_maxsz (op_decode_hdr_maxsz + 5) +#define encode_post_write_getattr_maxsz (op_encode_hdr_maxsz + 2) +#define decode_post_write_getattr_maxsz (op_decode_hdr_maxsz + 13) +#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) +#define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ - op_encode_hdr_maxsz + \ + (op_encode_hdr_maxsz + \ 4 /*server->ip_addr*/ + \ 1 /*Netid*/ + \ 6 /*uaddr*/ + \ - 6 + (NFS4_VERIFIER_SIZE >> 2) + 6 + (NFS4_VERIFIER_SIZE >> 2)) #define 
decode_setclientid_maxsz \ - op_decode_hdr_maxsz + \ + (op_decode_hdr_maxsz + \ 2 + \ - 1024 /* large value for CLID_INUSE */ + 1024) /* large value for CLID_INUSE */ #define encode_setclientid_confirm_maxsz \ - op_encode_hdr_maxsz + \ - 3 + (NFS4_VERIFIER_SIZE >> 2) + (op_encode_hdr_maxsz + \ + 3 + (NFS4_VERIFIER_SIZE >> 2)) #define decode_setclientid_confirm_maxsz \ - op_decode_hdr_maxsz + (op_decode_hdr_maxsz) -#define NFS4_enc_compound_sz 1024 /* XXX: large enough? */ -#define NFS4_dec_compound_sz 1024 /* XXX: large enough? */ -#define NFS4_enc_read_sz compound_encode_hdr_maxsz + \ +#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ +#define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_read_getattr_maxsz + \ - op_encode_hdr_maxsz + 7 -#define NFS4_dec_read_sz compound_decode_hdr_maxsz + \ + op_encode_hdr_maxsz + 7) +#define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_read_getattr_maxsz + \ - op_decode_hdr_maxsz + 2 -#define NFS4_enc_write_sz compound_encode_hdr_maxsz + \ + op_decode_hdr_maxsz + 2) +#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_pre_write_getattr_maxsz + \ op_encode_hdr_maxsz + 8 + \ - encode_post_write_getattr_maxsz -#define NFS4_dec_write_sz compound_decode_hdr_maxsz + \ + encode_post_write_getattr_maxsz) +#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_pre_write_getattr_maxsz + \ op_decode_hdr_maxsz + 4 + \ - decode_post_write_getattr_maxsz -#define NFS4_enc_commit_sz compound_encode_hdr_maxsz + \ + decode_post_write_getattr_maxsz) +#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_pre_write_getattr_maxsz + \ op_encode_hdr_maxsz + 3 + \ - encode_post_write_getattr_maxsz -#define NFS4_dec_commit_sz compound_decode_hdr_maxsz + \ + encode_post_write_getattr_maxsz) +#define NFS4_dec_commit_sz 
(compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_pre_write_getattr_maxsz + \ op_decode_hdr_maxsz + 2 + \ - decode_post_write_getattr_maxsz -#define NFS4_enc_open_sz compound_encode_hdr_maxsz + \ + decode_post_write_getattr_maxsz) +#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ op_encode_hdr_maxsz + \ @@ -154,107 +154,107 @@ static int nfs_stat_to_errno(int); encode_getattr_maxsz + \ encode_getfh_maxsz + \ encode_restorefh_maxsz + \ - encode_getattr_maxsz -#define NFS4_dec_open_sz compound_decode_hdr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ op_decode_hdr_maxsz + 4 + 5 + 2 + 3 + \ decode_getattr_maxsz + \ decode_getfh_maxsz + \ decode_restorefh_maxsz + \ - decode_getattr_maxsz + decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ - compound_encode_hdr_maxsz + \ + (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 5 -#define NFS4_dec_open_confirm_sz compound_decode_hdr_maxsz + \ + op_encode_hdr_maxsz + 5) +#define NFS4_dec_open_confirm_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4 -#define NFS4_enc_open_reclaim_sz compound_encode_hdr_maxsz + \ + op_decode_hdr_maxsz + 4) +#define NFS4_enc_open_reclaim_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + \ 11 + \ - encode_getattr_maxsz -#define NFS4_dec_open_reclaim_sz compound_decode_hdr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_open_reclaim_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ op_decode_hdr_maxsz + \ 4 + 5 + 2 + 3 + \ - decode_getattr_maxsz + decode_getattr_maxsz) #define NFS4_enc_open_downgrade_sz \ - compound_encode_hdr_maxsz + \ + (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 7 + op_encode_hdr_maxsz + 7) #define NFS4_dec_open_downgrade_sz \ - compound_decode_hdr_maxsz + \ + 
(compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4 -#define NFS4_enc_close_sz compound_encode_hdr_maxsz + \ + op_decode_hdr_maxsz + 4) +#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - op_encode_hdr_maxsz + 5 -#define NFS4_dec_close_sz compound_decode_hdr_maxsz + \ + op_encode_hdr_maxsz + 5) +#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 4 -#define NFS4_enc_setattr_sz compound_encode_hdr_maxsz + \ + op_decode_hdr_maxsz + 4) +#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + 4 + \ nfs4_fattr_bitmap_maxsz + \ - encode_getattr_maxsz -#define NFS4_dec_setattr_sz compound_decode_hdr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 3 -#define NFS4_enc_fsinfo_sz compound_encode_hdr_maxsz + \ + op_decode_hdr_maxsz + 3) +#define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_fsinfo_maxsz -#define NFS4_dec_fsinfo_sz compound_decode_hdr_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_fsinfo_maxsz -#define NFS4_enc_renew_sz compound_encode_hdr_maxsz + \ - encode_renew_maxsz -#define NFS4_dec_renew_sz compound_decode_hdr_maxsz + \ - decode_renew_maxsz -#define NFS4_enc_setclientid_sz compound_encode_hdr_maxsz + \ - encode_setclientid_maxsz -#define NFS4_dec_setclientid_sz compound_decode_hdr_maxsz + \ - decode_setclientid_maxsz + decode_fsinfo_maxsz) +#define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_renew_sz (compound_decode_hdr_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_setclientid_sz (compound_encode_hdr_maxsz + \ + encode_setclientid_maxsz) +#define NFS4_dec_setclientid_sz (compound_decode_hdr_maxsz + \ + decode_setclientid_maxsz) #define 
NFS4_enc_setclientid_confirm_sz \ - compound_encode_hdr_maxsz + \ + (compound_encode_hdr_maxsz + \ encode_setclientid_confirm_maxsz + \ encode_putrootfh_maxsz + \ - encode_fsinfo_maxsz + encode_fsinfo_maxsz) #define NFS4_dec_setclientid_confirm_sz \ - compound_decode_hdr_maxsz + \ + (compound_decode_hdr_maxsz + \ decode_setclientid_confirm_maxsz + \ decode_putrootfh_maxsz + \ - decode_fsinfo_maxsz -#define NFS4_enc_lock_sz compound_encode_hdr_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz + \ op_encode_hdr_maxsz + \ 1 + 1 + 2 + 2 + \ 1 + 4 + 1 + 2 + \ - owner_id_maxsz -#define NFS4_dec_lock_sz compound_decode_hdr_maxsz + \ + owner_id_maxsz) +#define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_getattr_maxsz + \ op_decode_hdr_maxsz + \ 2 + 2 + 1 + 2 + \ - owner_id_maxsz -#define NFS4_enc_lockt_sz compound_encode_hdr_maxsz + \ + owner_id_maxsz) +#define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz + \ op_encode_hdr_maxsz + \ 1 + 2 + 2 + 2 + \ - owner_id_maxsz -#define NFS4_dec_lockt_sz NFS4_dec_lock_sz -#define NFS4_enc_locku_sz compound_encode_hdr_maxsz + \ + owner_id_maxsz) +#define NFS4_dec_lockt_sz (NFS4_dec_lock_sz) +#define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz + \ op_encode_hdr_maxsz + \ - 1 + 1 + 4 + 2 + 2 -#define NFS4_dec_locku_sz compound_decode_hdr_maxsz + \ + 1 + 1 + 4 + 2 + 2) +#define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_getattr_maxsz + \ - op_decode_hdr_maxsz + 4 + op_decode_hdr_maxsz + 4) @@ -324,7 +324,7 @@ encode_compound_hdr(struct xdr_stream *x dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+XDR_QUADLEN(hdr->taglen)); + RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); WRITE32(hdr->taglen); WRITEMEM(hdr->tag, 
hdr->taglen); WRITE32(NFS4_MINOR_VERSION); @@ -868,14 +868,32 @@ encode_putrootfh(struct xdr_stream *xdr) return 0; } +static void +encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, fl_owner_t lockowner) +{ + extern nfs4_stateid zero_stateid; + nfs4_stateid stateid; + uint32_t *p; + + RESERVE_SPACE(16); + if (state != NULL) { + nfs4_copy_stateid(&stateid, state, lockowner); + WRITEMEM(stateid.data, sizeof(stateid.data)); + } else + WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); +} + static int encode_read(struct xdr_stream *xdr, struct nfs_readargs *args) { uint32_t *p; - RESERVE_SPACE(32); + RESERVE_SPACE(4); WRITE32(OP_READ); - WRITEMEM(args->stateid.data, sizeof(args->stateid.data)); + + encode_stateid(xdr, args->state, args->lockowner); + + RESERVE_SPACE(12); WRITE64(args->offset); WRITE32(args->count); @@ -1057,9 +1075,12 @@ encode_write(struct xdr_stream *xdr, str { uint32_t *p; - RESERVE_SPACE(36); + RESERVE_SPACE(4); WRITE32(OP_WRITE); - WRITEMEM(args->stateid.data, sizeof(args->stateid.data)); + + encode_stateid(xdr, args->state, args->lockowner); + + RESERVE_SPACE(16); WRITE64(args->offset); WRITE32(args->stable); WRITE32(args->count); @@ -3165,6 +3186,10 @@ static struct { { NFS4ERR_SYMLINK, ELOOP }, { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, { NFS4ERR_DEADLOCK, EDEADLK }, + { NFS4ERR_WRONGSEC, EPERM }, /* FIXME: this needs + * to be handled by a + * middle-layer. + */ { -1, EIO } }; @@ -3180,6 +3205,10 @@ nfs_stat_to_errno(int stat) if (nfs_errtbl[i].stat == stat) return nfs_errtbl[i].errno; } + if (stat < 0) { + /* The server is looney tunes. */ + return ESERVERFAULT; + } /* If we cannot translate the error, the recovery routines should * handle it. 
* Note: remaining NFSv4 error codes have values > 10000, so should diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/pagelist.c linux-2.6.4-27-nfs4mount/fs/nfs/pagelist.c --- linux-2.6.4-pre3/fs/nfs/pagelist.c 2004-03-10 19:36:50.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/pagelist.c 2004-03-10 20:14:37.000000000 -0500 @@ -32,7 +32,7 @@ static inline struct nfs_page * nfs_page_alloc(void) { struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, SLAB_NOFS); + p = kmem_cache_alloc(nfs_page_cachep, SLAB_KERNEL); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->wb_list); @@ -88,6 +88,7 @@ nfs_create_request(struct file *file, st * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; + atomic_set(&req->wb_complete, 0); req->wb_index = page->index; page_cache_get(page); req->wb_offset = offset; @@ -246,7 +247,6 @@ nfs_coalesce_requests(struct list_head * * nfs_scan_list - Scan a list for matching requests * @head: One of the NFS inode request lists * @dst: Destination list - * @file: if set, ensure we match requests from this file * @idx_start: lower bound of page->index to scan * @npages: idx_start + npages sets the upper bound to scan. 
* @@ -258,7 +258,6 @@ nfs_coalesce_requests(struct list_head * */ int nfs_scan_list(struct list_head *head, struct list_head *dst, - struct file *file, unsigned long idx_start, unsigned int npages) { struct list_head *pos, *tmp; @@ -276,9 +275,6 @@ nfs_scan_list(struct list_head *head, st req = nfs_list_entry(pos); - if (file && req->wb_file != file) - continue; - if (req->wb_index < idx_start) continue; if (req->wb_index > idx_end) diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/proc.c linux-2.6.4-27-nfs4mount/fs/nfs/proc.c --- linux-2.6.4-pre3/fs/nfs/proc.c 2004-03-10 19:37:22.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/proc.c 2004-03-10 20:14:30.000000000 -0500 @@ -49,18 +49,6 @@ extern struct rpc_procinfo nfs_procedures[]; -static void -nfs_write_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - nfs_refresh_inode(inode, fattr); -} - static struct rpc_cred * nfs_cred(struct inode *inode, struct file *filp) { @@ -78,15 +66,33 @@ nfs_cred(struct inode *inode, struct fil */ static int nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fsinfo *info) { - int status; + struct nfs_fattr *fattr = info->fattr; + struct nfs2_fsstat fsinfo; + int status; - dprintk("NFS call getroot\n"); + dprintk("%s: call getattr\n", __FUNCTION__); fattr->valid = 0; - status = rpc_call(server->client, NFSPROC_GETATTR, fhandle, fattr, 0); - dprintk("NFS reply getroot\n"); - return status; + status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); + dprintk("%s: reply getattr %d\n", __FUNCTION__, status); + if (status) + return status; + dprintk("%s: call statfs\n", __FUNCTION__); + status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0); + 
dprintk("%s: reply statfs %d\n", __FUNCTION__, status); + if (status) + return status; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->lease_time = 0; + return 0; } /* @@ -180,8 +186,14 @@ nfs_proc_read(struct nfs_read_data *rdat msg.rpc_cred = nfs_cred(inode, filp); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); - if (status >= 0) + if (status >= 0) { nfs_refresh_inode(inode, fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 + * as it is guaranteed to always return the file attributes + */ + if (rdata->args.offset + rdata->args.count >= fattr->size) + rdata->res.eof = 1; + } dprintk("NFS reply read: %d\n", status); return status; } @@ -205,7 +217,7 @@ nfs_proc_write(struct nfs_write_data *wd msg.rpc_cred = nfs_cred(inode, filp); status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); if (status >= 0) { - nfs_write_refresh_inode(inode, fattr); + nfs_refresh_inode(inode, fattr); wdata->res.count = wdata->args.count; wdata->verf.committed = NFS_FILE_SYNC; } @@ -331,10 +343,8 @@ nfs_proc_unlink_done(struct dentry *dir, { struct rpc_message *msg = &task->tk_msg; - if (msg->rpc_argp) { - NFS_CACHEINV(dir->d_inode); + if (msg->rpc_argp) kfree(msg->rpc_argp); - } return 0; } @@ -537,17 +547,22 @@ nfs_read_done(struct rpc_task *task) { struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 + * as it is guaranteed to always return the file attributes + */ + if (data->args.offset + data->args.count >= data->res.fattr->size) + data->res.eof = 1; + } nfs_readpage_result(task); } static void -nfs_proc_read_setup(struct nfs_read_data *data, unsigned int count) 
+nfs_proc_read_setup(struct nfs_read_data *data) { struct rpc_task *task = &data->task; struct inode *inode = data->inode; - struct nfs_page *req; int flags; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READ], @@ -555,27 +570,13 @@ nfs_proc_read_setup(struct nfs_read_data .rpc_resp = &data->res, .rpc_cred = data->cred, }; - - req = nfs_list_entry(data->pages.next); - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.pages = data->pagevec; - data->args.count = count; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - + /* N.B. Do we need to test? Never called for swapfile inode */ flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. */ rpc_init_task(task, NFS_CLIENT(inode), nfs_read_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_readdata_release; - - rpc_call_setup(&data->task, &msg, 0); + rpc_call_setup(task, &msg, 0); } static void @@ -584,16 +585,15 @@ nfs_write_done(struct rpc_task *task) struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; if (task->tk_status >= 0) - nfs_write_refresh_inode(data->inode, data->res.fattr); + nfs_refresh_inode(data->inode, data->res.fattr); nfs_writeback_done(task); } static void -nfs_proc_write_setup(struct nfs_write_data *data, unsigned int count, int how) +nfs_proc_write_setup(struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct inode *inode = data->inode; - struct nfs_page *req; int flags; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_WRITE], @@ -603,32 +603,18 @@ nfs_proc_write_setup(struct nfs_write_da }; /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ - - req = nfs_list_entry(data->pages.next); - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req); - data->args.pgbase = req->wb_pgbase; - data->args.count = count; 
data->args.stable = NFS_FILE_SYNC; - data->args.pages = data->pagevec; - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; /* Set the initial flags for the task. */ flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ rpc_init_task(task, NFS_CLIENT(inode), nfs_write_done, flags); - task->tk_calldata = data; - /* Release requests */ - task->tk_release = nfs_writedata_release; - - rpc_call_setup(&data->task, &msg, 0); + rpc_call_setup(task, &msg, 0); } static void -nfs_proc_commit_setup(struct nfs_write_data *data, u64 start, u32 len, int how) +nfs_proc_commit_setup(struct nfs_write_data *data, int how) { BUG(); } diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/read.c linux-2.6.4-27-nfs4mount/fs/nfs/read.c --- linux-2.6.4-pre3/fs/nfs/read.c 2004-03-10 19:11:07.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/read.c 2004-03-10 20:15:10.000000000 -0500 @@ -35,6 +35,8 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE static int nfs_pagein_one(struct list_head *, struct inode *); +static void nfs_readpage_result_partial(struct nfs_read_data *, int); +static void nfs_readpage_result_full(struct nfs_read_data *, int); static kmem_cache_t *nfs_rdata_cachep; static mempool_t *nfs_rdata_mempool; @@ -57,12 +59,37 @@ static __inline__ void nfs_readdata_free mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(struct rpc_task *task) +static void nfs_readdata_release(struct rpc_task *task) { struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; nfs_readdata_free(data); } +static +unsigned int nfs_page_length(struct inode *inode, struct page *page) +{ + loff_t i_size = i_size_read(inode); + unsigned long idx; + + if (i_size <= 0) + return 0; + idx = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (page->index > idx) + return 0; + if (page->index != idx) + return PAGE_CACHE_SIZE; + return 1 + ((i_size - 1) & (PAGE_CACHE_SIZE - 1)); +} + +static +int nfs_return_empty_page(struct 
page *page) +{ + memclear_highpage_flush(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + return 0; +} + /* * Read a page synchronously. */ @@ -78,6 +105,7 @@ nfs_readpage_sync(struct file *file, str .inode = inode, .args = { .fh = NFS_FH(inode), + .lockowner = current->files, .pages = &page, .pgbase = 0UL, .count = rsize, @@ -121,9 +149,13 @@ nfs_readpage_sync(struct file *file, str } count -= result; rdata.args.pgbase += result; - if (result < rdata.args.count) /* NFSv2ism */ + /* Note: result == 0 should only happen if we're caching + * a write that extends the file and punches a hole. + */ + if (rdata.res.eof != 0 || result == 0) break; } while (count); + NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; if (count) memclear_highpage_flush(page, rdata.args.pgbase, count); @@ -142,89 +174,209 @@ nfs_readpage_async(struct file *file, st { LIST_HEAD(one_request); struct nfs_page *new; + unsigned int len; - new = nfs_create_request(file, inode, page, 0, PAGE_CACHE_SIZE); + len = nfs_page_length(inode, page); + if (len == 0) + return nfs_return_empty_page(page); + new = nfs_create_request(file, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); } + if (len < PAGE_CACHE_SIZE) + memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); + nfs_lock_request(new); nfs_list_add_request(new, &one_request); nfs_pagein_one(&one_request, inode); return 0; } +static void nfs_readpage_release(struct nfs_page *req) +{ + unlock_page(req->wb_page); + + nfs_clear_request(req); + nfs_release_request(req); + nfs_unlock_request(req); + + dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + req->wb_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_inode), + req->wb_bytes, + (long long)req_offset(req)); +} + /* * Set up the NFS read request struct */ -static void -nfs_read_rpcsetup(struct list_head *head, struct nfs_read_data *data) +static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + unsigned int count, unsigned 
int offset) { struct inode *inode; - struct nfs_page *req; - struct page **pages; - unsigned int count; - pages = data->pagevec; - count = 0; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); - *pages++ = req->wb_page; - count += req->wb_bytes; - } - req = nfs_list_entry(data->pages.next); + data->req = req; data->inode = inode = req->wb_inode; data->cred = req->wb_cred; - NFS_PROTO(inode)->read_setup(data, count); + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; + data->args.lockowner = req->wb_lockowner; + data->args.state = req->wb_state; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + + NFS_PROTO(inode)->read_setup(data); + + data->task.tk_cookie = (unsigned long)inode; + data->task.tk_calldata = data; + /* Release requests */ + data->task.tk_release = nfs_readdata_release; - dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu.\n", + dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), count, - (unsigned long long)req_offset(req)); + data->args.offset); } static void nfs_async_read_error(struct list_head *head) { struct nfs_page *req; - struct page *page; while (!list_empty(head)) { req = nfs_list_entry(head->next); - page = req->wb_page; nfs_list_remove_request(req); - SetPageError(page); - unlock_page(page); - nfs_clear_request(req); - nfs_release_request(req); - nfs_unlock_request(req); + SetPageError(req->wb_page); + nfs_readpage_release(req); } } -static int -nfs_pagein_one(struct list_head *head, struct inode *inode) +/* + * Start an async read operation + */ +static void nfs_execute_read(struct nfs_read_data *data) { - struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct 
rpc_clnt *clnt = NFS_CLIENT(data->inode); + sigset_t oldset; + + rpc_clnt_sigmask(clnt, &oldset); + lock_kernel(); + rpc_execute(&data->task); + unlock_kernel(); + rpc_clnt_sigunmask(clnt, &oldset); +} + +/* + * Generate multiple requests to fill a single page. + * + * We optimize to reduce the number of read operations on the wire. If we + * detect that we're reading a page, or an area of a page, that is past the + * end of file, we do not generate NFS read operations but just clear the + * parts of the page that would have come back zero from the server anyway. + * + * We rely on the cached value of i_size to make this determination; another + * client can fill pages on the server past our cached end-of-file, but we + * won't see the new data until our attribute cache is updated. This is more + * or less conventional NFS client behavior. + */ +static int nfs_pagein_multi(struct list_head *head, struct inode *inode) +{ + struct nfs_page *req = nfs_list_entry(head->next); + struct page *page = req->wb_page; + struct nfs_read_data *data; + unsigned int rsize = NFS_SERVER(inode)->rsize; + unsigned int nbytes, offset; + int requests = 0; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + nbytes = req->wb_bytes; + for(;;) { + data = nfs_readdata_alloc(); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + if (nbytes <= rsize) + break; + nbytes -= rsize; + } + atomic_set(&req->wb_complete, requests); + + ClearPageError(page); + offset = 0; + nbytes = req->wb_bytes; + do { + data = list_entry(list.next, struct nfs_read_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + data->complete = nfs_readpage_result_partial; + + if (nbytes > rsize) { + nfs_read_rpcsetup(req, data, rsize, offset); + offset += rsize; + nbytes -= rsize; + } else { + nfs_read_rpcsetup(req, data, nbytes, offset); + nbytes = 0; + } + nfs_execute_read(data); + } while (nbytes != 0); + + return 0; + +out_bad: + while (!list_empty(&list)) { + data = 
list_entry(list.next, struct nfs_read_data, pages); + list_del(&data->pages); + nfs_readdata_free(data); + } + SetPageError(page); + nfs_readpage_release(req); + return -ENOMEM; +} + +static int nfs_pagein_one(struct list_head *head, struct inode *inode) +{ + struct nfs_page *req; + struct page **pages; struct nfs_read_data *data; - sigset_t oldset; + unsigned int count; + + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) + return nfs_pagein_multi(head, inode); data = nfs_readdata_alloc(); if (!data) goto out_bad; - nfs_read_rpcsetup(head, data); + pages = data->pagevec; + count = 0; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + *pages++ = req->wb_page; + count += req->wb_bytes; + } + req = nfs_list_entry(data->pages.next); - /* Start the async call */ - rpc_clnt_sigmask(clnt, &oldset); - lock_kernel(); - rpc_execute(&data->task); - unlock_kernel(); - rpc_clnt_sigunmask(clnt, &oldset); + data->complete = nfs_readpage_result_full; + nfs_read_rpcsetup(req, data, count, 0); + + nfs_execute_read(data); return 0; out_bad: nfs_async_read_error(head); @@ -254,54 +406,95 @@ nfs_pagein_list(struct list_head *head, } /* + * Handle a read reply that fills part of a page. + */ +static void nfs_readpage_result_partial(struct nfs_read_data *data, int status) +{ + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + + if (status >= 0) { + unsigned int request = data->args.count; + unsigned int result = data->res.count; + + if (result < request) { + memclear_highpage_flush(page, + data->args.pgbase + result, + request - result); + } + } else + SetPageError(page); + + if (atomic_dec_and_test(&req->wb_complete)) { + if (!PageError(page)) + SetPageUptodate(page); + nfs_readpage_release(req); + } +} + +/* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -void -nfs_readpage_result(struct rpc_task *task) +static void nfs_readpage_result_full(struct nfs_read_data *data, int status) { - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; unsigned int count = data->res.count; - dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", - task->tk_pid, task->tk_status); - while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); struct page *page = req->wb_page; nfs_list_remove_request(req); - if (task->tk_status >= 0) { + if (status >= 0) { if (count < PAGE_CACHE_SIZE) { - memclear_highpage_flush(page, + if (count < req->wb_bytes) + memclear_highpage_flush(page, req->wb_pgbase + count, req->wb_bytes - count); - count = 0; } else count -= PAGE_CACHE_SIZE; SetPageUptodate(page); } else SetPageError(page); - unlock_page(page); + nfs_readpage_release(req); + } +} - dprintk("NFS: read (%s/%Ld %d@%Ld)\n", - req->wb_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_inode), - req->wb_bytes, - (long long)req_offset(req)); - nfs_clear_request(req); - nfs_release_request(req); - nfs_unlock_request(req); +/* + * This is the callback from RPC telling us whether a reply was + * received or some error occurred (timeout or socket shutdown). + */ +void nfs_readpage_result(struct rpc_task *task) +{ + struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; + int status = task->tk_status; + + dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", + task->tk_pid, status); + + /* Is this a short read? */ + if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { + /* Has the server at least made some progress? 
*/ + if (resp->count != 0) { + /* Yes, so retry the read at the end of the data */ + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + rpc_restart_call(task); + return; + } + task->tk_status = -EIO; } + NFS_FLAGS(data->inode) |= NFS_INO_INVALID_ATIME; + data->complete(data, status); } /* * Read a page over NFS. - * We read the page synchronously in the following cases: - * - The NFS rsize is smaller than PAGE_CACHE_SIZE. We could kludge our way - * around this by creating several consecutive read requests, but - * that's hardly worth it. + * We read the page synchronously in the following case: * - The error flag is set for this page. This happens only when a * previous async read operation failed. */ @@ -324,7 +517,7 @@ nfs_readpage(struct file *file, struct p if (error) goto out_error; - if (!PageError(page) && NFS_SERVER(inode)->rsize >= PAGE_CACHE_SIZE) { + if (!IS_SYNC(inode)) { error = nfs_readpage_async(file, inode, page); goto out; } @@ -346,26 +539,25 @@ struct nfs_readdesc { }; static int -readpage_sync_filler(void *data, struct page *page) -{ - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - return nfs_readpage_sync(desc->filp, page->mapping->host, page); -} - -static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; struct inode *inode = page->mapping->host; struct nfs_page *new; + unsigned int len; nfs_wb_page(inode, page); - new = nfs_create_request(desc->filp, inode, page, 0, PAGE_CACHE_SIZE); + len = nfs_page_length(inode, page); + if (len == 0) + return nfs_return_empty_page(page); + new = nfs_create_request(desc->filp, inode, page, 0, len); if (IS_ERR(new)) { SetPageError(page); unlock_page(page); return PTR_ERR(new); } + if (len < PAGE_CACHE_SIZE) + memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); nfs_lock_request(new); nfs_list_add_request(new, desc->head); return 0; @@ -380,14 +572,16 @@ nfs_readpages(struct file 
*filp, struct .filp = filp, .head = &head, }; - struct nfs_server *server = NFS_SERVER(mapping->host); - int is_sync = server->rsize < PAGE_CACHE_SIZE; + struct inode *inode = mapping->host; + struct nfs_server *server = NFS_SERVER(inode); int ret; - ret = read_cache_pages(mapping, pages, - is_sync ? readpage_sync_filler : - readpage_async_filler, - &desc); + dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + nr_pages); + + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); if (!list_empty(&head)) { int err = nfs_pagein_list(&head, server->rpages); if (!ret) diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/unlink.c linux-2.6.4-27-nfs4mount/fs/nfs/unlink.c --- linux-2.6.4-pre3/fs/nfs/unlink.c 2004-03-10 19:37:45.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/unlink.c 2004-03-10 20:12:03.000000000 -0500 @@ -104,6 +104,7 @@ nfs_async_unlink_init(struct rpc_task *t status = NFS_PROTO(dir->d_inode)->unlink_setup(&msg, dir, &data->name); if (status < 0) goto out_err; + nfs_begin_data_update(dir->d_inode); rpc_call_setup(task, &msg, 0); return; out_err: @@ -126,7 +127,7 @@ nfs_async_unlink_done(struct rpc_task *t if (!dir) return; dir_i = dir->d_inode; - nfs_zap_caches(dir_i); + nfs_end_data_update(dir_i); if (NFS_PROTO(dir_i)->unlink_done(dir, task)) return; put_rpccred(data->cred); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/fs/nfs/write.c linux-2.6.4-27-nfs4mount/fs/nfs/write.c --- linux-2.6.4-pre3/fs/nfs/write.c 2004-03-10 19:32:17.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/fs/nfs/write.c 2004-03-10 20:15:10.000000000 -0500 @@ -74,12 +74,17 @@ static struct nfs_page * nfs_update_request(struct file*, struct inode *, struct page *, unsigned int, unsigned int); -static void nfs_strategy(struct inode *inode); +static void nfs_writeback_done_partial(struct nfs_write_data *, int); +static void nfs_writeback_done_full(struct nfs_write_data *, int); +static 
int nfs_wait_on_write_congestion(struct address_space *, int); +static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); static kmem_cache_t *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; static mempool_t *nfs_commit_mempool; +static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); + static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) { struct nfs_write_data *p; @@ -96,7 +101,7 @@ static __inline__ void nfs_writedata_fre mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(struct rpc_task *task) +static void nfs_writedata_release(struct rpc_task *task) { struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_writedata_free(wdata); @@ -118,29 +123,69 @@ static __inline__ void nfs_commit_free(s mempool_free(p, nfs_commit_mempool); } -void nfs_commit_release(struct rpc_task *task) +/* Adjust the file length if we're writing beyond the end */ +static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { - struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; - nfs_commit_free(wdata); + struct inode *inode = page->mapping->host; + loff_t end, i_size = i_size_read(inode); + unsigned long end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + + if (i_size > 0 && page->index < end_index) + return; + end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + if (i_size >= end) + return; + i_size_write(inode, end); +} + +/* We can set the PG_uptodate flag if we see that a write request + * covers the full page. + */ +static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +{ + loff_t end_offs; + + if (PageUptodate(page)) + return; + if (base != 0) + return; + if (count == PAGE_CACHE_SIZE) { + SetPageUptodate(page); + return; + } + + end_offs = i_size_read(page->mapping->host) - 1; + if (end_offs < 0) + return; + /* Is this the last page? 
*/ + if (page->index != (unsigned long)(end_offs >> PAGE_CACHE_SHIFT)) + return; + /* This is the last page: set PG_uptodate if we cover the entire + * extent of the data, then zero the rest of the page. + */ + if (count == (unsigned int)(end_offs & (PAGE_CACHE_SIZE - 1)) + 1) { + memclear_highpage_flush(page, count, PAGE_CACHE_SIZE - count); + SetPageUptodate(page); + } } /* * Write a page synchronously. * Offset is the data offset within the page. */ -static int -nfs_writepage_sync(struct file *file, struct inode *inode, struct page *page, - unsigned int offset, unsigned int count) +static int nfs_writepage_sync(struct file *file, struct inode *inode, + struct page *page, unsigned int offset, unsigned int count, + int how) { unsigned int wsize = NFS_SERVER(inode)->wsize; int result, written = 0; - int swapfile = IS_SWAPFILE(inode); struct nfs_write_data wdata = { - .flags = swapfile ? NFS_RPC_SWAPFLAGS : 0, + .flags = how, .cred = NULL, .inode = inode, .args = { .fh = NFS_FH(inode), + .lockowner = current->files, .pages = &page, .stable = NFS_FILE_SYNC, .pgbase = offset, @@ -157,8 +202,9 @@ nfs_writepage_sync(struct file *file, st (long long)NFS_FILEID(inode), count, (long long)(page_offset(page) + offset)); + nfs_begin_data_update(inode); do { - if (count < wsize && !swapfile) + if (count < wsize) wdata.args.count = count; wdata.args.offset = page_offset(page) + wdata.args.pgbase; @@ -177,58 +223,62 @@ nfs_writepage_sync(struct file *file, st wdata.args.pgbase += result; written += result; count -= result; - - /* - * If we've extended the file, update the inode - * now so we don't invalidate the cache. - */ - if (wdata.args.offset > i_size_read(inode)) - i_size_write(inode, wdata.args.offset); } while (count); + /* Update file length */ + nfs_grow_file(page, offset, written); + /* Set the PG_uptodate flag? 
*/ + nfs_mark_uptodate(page, offset, written); if (PageError(page)) ClearPageError(page); io_error: + nfs_end_data_update(inode); if (wdata.cred) put_rpccred(wdata.cred); return written ? written : result; } -static int -nfs_writepage_async(struct file *file, struct inode *inode, struct page *page, - unsigned int offset, unsigned int count) +static int nfs_writepage_async(struct file *file, struct inode *inode, + struct page *page, unsigned int offset, unsigned int count) { struct nfs_page *req; - loff_t end; int status; req = nfs_update_request(file, inode, page, offset, count); status = (IS_ERR(req)) ? PTR_ERR(req) : 0; if (status < 0) goto out; + /* Update file length */ + nfs_grow_file(page, offset, count); + /* Set the PG_uptodate flag? */ + nfs_mark_uptodate(page, offset, count); nfs_unlock_request(req); - nfs_strategy(inode); - end = ((loff_t)page->index<for_reclaim) + return FLUSH_HIGHPRI; + if (wbc->for_kupdate) + return FLUSH_LOWPRI; + return 0; +} + /* * Write an mmapped page to the server. 
*/ -int -nfs_writepage(struct page *page, struct writeback_control *wbc) +int nfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; loff_t i_size = i_size_read(inode); int inode_referenced = 0; + int priority = wb_priority(wbc); int err; /* @@ -244,7 +294,7 @@ nfs_writepage(struct page *page, struct end_index = i_size >> PAGE_CACHE_SHIFT; /* Ensure we've flushed out any previous writes */ - nfs_wb_page(inode,page); + nfs_wb_page_priority(inode, page, priority); /* easy case */ if (page->index < end_index) @@ -258,44 +308,60 @@ nfs_writepage(struct page *page, struct goto out; do_it: lock_kernel(); - if (NFS_SERVER(inode)->wsize >= PAGE_CACHE_SIZE && !IS_SYNC(inode) && - inode_referenced) { + if (!IS_SYNC(inode) && inode_referenced) { err = nfs_writepage_async(NULL, inode, page, 0, offset); - if (err >= 0) + if (err >= 0) { err = 0; + if (wbc->for_reclaim) + err = WRITEPAGE_ACTIVATE; + } } else { - err = nfs_writepage_sync(NULL, inode, page, 0, offset); + err = nfs_writepage_sync(NULL, inode, page, 0, offset, priority); if (err == offset) err = 0; } unlock_kernel(); out: - unlock_page(page); + if (err != WRITEPAGE_ACTIVATE) + unlock_page(page); if (inode_referenced) iput(inode); return err; } -int -nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) +/* + * Note: causes nfs_update_request() to block on the assumption + * that the writeback is generated due to memory pressure. 
+ */ +int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct backing_dev_info *bdi = mapping->backing_dev_info; struct inode *inode = mapping->host; - int is_sync = !wbc->nonblocking; int err; err = generic_writepages(mapping, wbc); if (err) - goto out; - err = nfs_flush_file(inode, NULL, 0, 0, 0); + return err; + while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) { + if (wbc->nonblocking) + return 0; + nfs_wait_on_write_congestion(mapping, 0); + } + err = nfs_flush_inode(inode, 0, 0, wb_priority(wbc)); if (err < 0) goto out; - if (wbc->sync_mode == WB_SYNC_HOLD) - goto out; - if (is_sync && wbc->sync_mode == WB_SYNC_ALL) { - err = nfs_wb_all(inode); - } else - nfs_commit_file(inode, NULL, 0, 0, 0); + wbc->nr_to_write -= err; + if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) { + err = nfs_wait_on_requests(inode, 0, 0); + if (err < 0) + goto out; + } + err = nfs_commit_inode(inode, 0, 0, wb_priority(wbc)); + if (err > 0) + wbc->nr_to_write -= err; out: + clear_bit(BDI_write_congested, &bdi->state); + wake_up_all(&nfs_write_congestion); return err; } @@ -312,8 +378,10 @@ nfs_inode_add_request(struct inode *inod BUG_ON(error == -EEXIST); if (error) return error; - if (!nfsi->npages) + if (!nfsi->npages) { igrab(inode); + nfs_begin_data_update(inode); + } nfsi->npages++; req->wb_count++; return 0; @@ -322,7 +390,7 @@ nfs_inode_add_request(struct inode *inod /* * Insert a write request into an inode */ -static inline void +static void nfs_inode_remove_request(struct nfs_page *req) { struct nfs_inode *nfsi; @@ -336,6 +404,7 @@ nfs_inode_remove_request(struct nfs_page nfsi->npages--; if (!nfsi->npages) { spin_unlock(&nfs_wreq_lock); + nfs_end_data_update(inode); iput(inode); } else spin_unlock(&nfs_wreq_lock); @@ -372,7 +441,7 @@ nfs_find_request(struct inode *inode, un /* * Add a request to the inode's dirty list. 
*/ -static inline void +static void nfs_mark_request_dirty(struct nfs_page *req) { struct inode *inode = req->wb_inode; @@ -400,7 +469,7 @@ nfs_dirty_request(struct nfs_page *req) /* * Add a request to the inode's commit list. */ -static inline void +static void nfs_mark_request_commit(struct nfs_page *req) { struct inode *inode = req->wb_inode; @@ -421,7 +490,7 @@ nfs_mark_request_commit(struct nfs_page * Interruptible by signals only if mounted with intr flag. */ static int -nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages) +nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req; @@ -441,8 +510,6 @@ nfs_wait_on_requests(struct inode *inode break; next = req->wb_index + 1; - if (file && req->wb_file != file) - continue; if (!NFS_WBACK_BUSY(req)) continue; @@ -453,7 +520,6 @@ nfs_wait_on_requests(struct inode *inode if (error < 0) return error; spin_lock(&nfs_wreq_lock); - next = idx_start; res++; } spin_unlock(&nfs_wreq_lock); @@ -464,7 +530,6 @@ nfs_wait_on_requests(struct inode *inode * nfs_scan_dirty - Scan an inode for dirty requests * @inode: NFS inode to scan * @dst: destination list - * @file: if set, ensure we match requests from this file * @idx_start: lower bound of page->index to scan. * @npages: idx_start + npages sets the upper bound to scan. * @@ -472,11 +537,11 @@ nfs_wait_on_requests(struct inode *inode * The requests are *not* checked to ensure that they form a contiguous set. 
*/ static int -nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages) +nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); int res; - res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages); + res = nfs_scan_list(&nfsi->dirty, dst, idx_start, npages); nfsi->ndirty -= res; sub_page_state(nr_dirty,res); if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) @@ -489,7 +554,6 @@ nfs_scan_dirty(struct inode *inode, stru * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan * @dst: destination list - * @file: if set, ensure we collect requests from this file only. * @idx_start: lower bound of page->index to scan. * @npages: idx_start + npages sets the upper bound to scan. * @@ -497,11 +561,11 @@ nfs_scan_dirty(struct inode *inode, stru * The requests are *not* checked to ensure that they form a contiguous set. 
*/ static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages) +nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); int res; - res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages); + res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); nfsi->ncommit -= res; if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); @@ -509,6 +573,38 @@ nfs_scan_commit(struct inode *inode, str } #endif +static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + DEFINE_WAIT(wait); + int ret = 0; + + might_sleep(); + + if (!bdi_write_congested(bdi)) + return 0; + if (intr) { + struct rpc_clnt *clnt = NFS_CLIENT(mapping->host); + sigset_t oldset; + + rpc_clnt_sigmask(clnt, &oldset); + prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE); + if (bdi_write_congested(bdi)) { + if (signalled()) + ret = -ERESTARTSYS; + else + schedule(); + } + rpc_clnt_sigunmask(clnt, &oldset); + } else { + prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE); + if (bdi_write_congested(bdi)) + schedule(); + } + finish_wait(&nfs_write_congestion, &wait); + return ret; +} + /* * Try to update any existing write request, or create one if there is none. 
@@ -521,11 +617,14 @@ static struct nfs_page * nfs_update_request(struct file* file, struct inode *inode, struct page *page, unsigned int offset, unsigned int bytes) { + struct nfs_server *server = NFS_SERVER(inode); struct nfs_page *req, *new = NULL; unsigned long rqend, end; end = offset + bytes; + if (nfs_wait_on_write_congestion(page->mapping, server->flags & NFS_MOUNT_INTR)) + return ERR_PTR(-ERESTARTSYS); for (;;) { /* Loop over all inode entries and see if we find * A request for the page we wish to update @@ -600,46 +699,6 @@ nfs_update_request(struct file* file, st return req; } -/* - * This is the strategy routine for NFS. - * It is called by nfs_updatepage whenever the user wrote up to the end - * of a page. - * - * We always try to submit a set of requests in parallel so that the - * server's write code can gather writes. This is mainly for the benefit - * of NFSv2. - * - * We never submit more requests than we think the remote can handle. - * For UDP sockets, we make sure we don't exceed the congestion window; - * for TCP, we limit the number of requests to 8. - * - * NFS_STRATEGY_PAGES gives the minimum number of requests for NFSv2 that - * should be sent out in one go. This is for the benefit of NFSv2 servers - * that perform write gathering. - * - * FIXME: Different servers may have different sweet spots. - * Record the average congestion window in server struct? 
- */ -#define NFS_STRATEGY_PAGES 8 -static void -nfs_strategy(struct inode *inode) -{ - unsigned int dirty, wpages; - - dirty = NFS_I(inode)->ndirty; - wpages = NFS_SERVER(inode)->wpages; -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (NFS_PROTO(inode)->version == 2) { - if (dirty >= NFS_STRATEGY_PAGES * wpages) - nfs_flush_file(inode, NULL, 0, 0, 0); - } else if (dirty >= wpages) - nfs_flush_file(inode, NULL, 0, 0, 0); -#else - if (dirty >= NFS_STRATEGY_PAGES * wpages) - nfs_flush_file(inode, NULL, 0, 0, 0); -#endif -} - int nfs_flush_incompatible(struct file *file, struct page *page) { @@ -669,25 +728,20 @@ nfs_flush_incompatible(struct file *file * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad * things with a page scheduled for an RPC call (e.g. invalidate it). */ -int -nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) +int nfs_updatepage(struct file *file, struct page *page, + unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; struct inode *inode = page->mapping->host; struct nfs_page *req; - loff_t end; int status = 0; dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, count, (long long)(page_offset(page) +offset)); - /* - * If wsize is smaller than page size, update and write - * page synchronously. - */ - if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE || IS_SYNC(inode)) { - status = nfs_writepage_sync(file, inode, page, offset, count); + if (IS_SYNC(inode)) { + status = nfs_writepage_sync(file, inode, page, offset, count, 0); if (status > 0) { if (offset == 0 && status == PAGE_CACHE_SIZE) SetPageUptodate(page); @@ -696,6 +750,27 @@ nfs_updatepage(struct file *file, struct return status; } + /* If we're not using byte range locks, and we know the page + * is entirely in cache, it may be more efficient to avoid + * fragmenting write requests. 
+ */ + if (PageUptodate(page) && inode->i_flock == NULL) { + loff_t end_offs = i_size_read(inode) - 1; + unsigned long end_index = end_offs >> PAGE_CACHE_SHIFT; + + count += offset; + offset = 0; + if (unlikely(end_offs < 0)) { + /* Do nothing */ + } else if (page->index == end_index) { + unsigned int pglen; + pglen = (unsigned int)(end_offs & (PAGE_CACHE_SIZE-1)) + 1; + if (count < pglen) + count = pglen; + } else if (page->index < end_index) + count = PAGE_CACHE_SIZE; + } + /* * Try to find an NFS request corresponding to this page * and update it. @@ -714,20 +789,12 @@ nfs_updatepage(struct file *file, struct goto done; status = 0; - end = ((loff_t)page->index<wb_pgbase == 0 && req->wb_bytes == PAGE_CACHE_SIZE) { - SetPageUptodate(page); - nfs_unlock_request(req); - nfs_strategy(inode); - } else - nfs_unlock_request(req); + + /* Update file length */ + nfs_grow_file(page, offset, count); + /* Set the PG_uptodate flag? */ + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_unlock_request(req); done: dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); @@ -736,43 +803,159 @@ done: return status; } +static void nfs_writepage_release(struct nfs_page *req) +{ + end_page_writeback(req->wb_page); + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (!PageError(req->wb_page)) { + if (NFS_NEED_RESCHED(req)) { + nfs_mark_request_dirty(req); + goto out; + } else if (NFS_NEED_COMMIT(req)) { + nfs_mark_request_commit(req); + goto out; + } + } + nfs_inode_remove_request(req); + +out: + nfs_clear_commit(req); + nfs_clear_reschedule(req); +#else + nfs_inode_remove_request(req); +#endif + nfs_unlock_request(req); +} + +static inline int flush_task_priority(int how) +{ + switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { + case FLUSH_HIGHPRI: + return RPC_PRIORITY_HIGH; + case FLUSH_LOWPRI: + return RPC_PRIORITY_LOW; + } + return RPC_PRIORITY_NORMAL; +} + /* * Set up the argument/result storage required for the RPC call. 
*/ -static void -nfs_write_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how) +static void nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + unsigned int count, unsigned int offset, + int how) { struct rpc_task *task = &data->task; struct inode *inode; - struct nfs_page *req; - struct page **pages; - unsigned int count; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ - pages = data->pagevec; - count = 0; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); - SetPageWriteback(req->wb_page); - *pages++ = req->wb_page; - count += req->wb_bytes; - } - req = nfs_list_entry(data->pages.next); + data->req = req; data->inode = inode = req->wb_inode; data->cred = req->wb_cred; - NFS_PROTO(inode)->write_setup(data, count, how); + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.count = count; + data->args.pages = data->pagevec; + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.verf = &data->verf; + + NFS_PROTO(inode)->write_setup(data, how); + + data->task.tk_priority = flush_task_priority(how); + data->task.tk_cookie = (unsigned long)inode; + data->task.tk_calldata = data; + /* Release requests */ + data->task.tk_release = nfs_writedata_release; dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", task->tk_pid, inode->i_sb->s_id, (long long)NFS_FILEID(inode), count, - (unsigned long long)req_offset(req)); + data->args.offset); +} + +static void nfs_execute_write(struct nfs_write_data *data) +{ + struct rpc_clnt *clnt = NFS_CLIENT(data->inode); + sigset_t oldset; + + rpc_clnt_sigmask(clnt, &oldset); + lock_kernel(); + rpc_execute(&data->task); + unlock_kernel(); + rpc_clnt_sigunmask(clnt, &oldset); +} + +/* + * Generate multiple small requests to 
write out a single + * contiguous dirty area on one page. + */ +static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how) +{ + struct nfs_page *req = nfs_list_entry(head->next); + struct page *page = req->wb_page; + struct nfs_write_data *data; + unsigned int wsize = NFS_SERVER(inode)->wsize; + unsigned int nbytes, offset; + int requests = 0; + LIST_HEAD(list); + + nfs_list_remove_request(req); + + nbytes = req->wb_bytes; + for (;;) { + data = nfs_writedata_alloc(); + if (!data) + goto out_bad; + list_add(&data->pages, &list); + requests++; + if (nbytes <= wsize) + break; + nbytes -= wsize; + } + atomic_set(&req->wb_complete, requests); + + ClearPageError(page); + SetPageWriteback(page); + offset = 0; + nbytes = req->wb_bytes; + do { + data = list_entry(list.next, struct nfs_write_data, pages); + list_del_init(&data->pages); + + data->pagevec[0] = page; + data->complete = nfs_writeback_done_partial; + + if (nbytes > wsize) { + nfs_write_rpcsetup(req, data, wsize, offset, how); + offset += wsize; + nbytes -= wsize; + } else { + nfs_write_rpcsetup(req, data, nbytes, offset, how); + nbytes = 0; + } + nfs_execute_write(data); + } while (nbytes != 0); + + return 0; + +out_bad: + while (!list_empty(&list)) { + data = list_entry(list.next, struct nfs_write_data, pages); + list_del(&data->pages); + nfs_writedata_free(data); + } + nfs_mark_request_dirty(req); + nfs_unlock_request(req); + return -ENOMEM; } /* @@ -783,25 +966,38 @@ nfs_write_rpcsetup(struct list_head *hea * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. 
*/ -static int -nfs_flush_one(struct list_head *head, struct inode *inode, int how) +static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) { - struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_page *req; + struct page **pages; struct nfs_write_data *data; - sigset_t oldset; + unsigned int count; + + if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE) + return nfs_flush_multi(head, inode, how); data = nfs_writedata_alloc(); if (!data) goto out_bad; + pages = data->pagevec; + count = 0; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &data->pages); + ClearPageError(req->wb_page); + SetPageWriteback(req->wb_page); + *pages++ = req->wb_page; + count += req->wb_bytes; + } + req = nfs_list_entry(data->pages.next); + + data->complete = nfs_writeback_done_full; /* Set up the argument struct */ - nfs_write_rpcsetup(head, data, how); + nfs_write_rpcsetup(req, data, count, 0, how); - rpc_clnt_sigmask(clnt, &oldset); - lock_kernel(); - rpc_execute(&data->task); - unlock_kernel(); - rpc_clnt_sigunmask(clnt, &oldset); + nfs_execute_write(data); return 0; out_bad: while (!list_empty(head)) { @@ -840,62 +1036,59 @@ nfs_flush_list(struct list_head *head, i return error; } - /* - * This function is called when the WRITE call is complete. + * Handle a write reply that flushed part of a page. 
*/ -void -nfs_writeback_done(struct rpc_task *task) +static void nfs_writeback_done_partial(struct nfs_write_data *data, int status) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; - struct nfs_page *req; - struct page *page; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; - dprintk("NFS: %4d nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); + dprintk("NFS: write (%s/%Ld %d@%Ld)", + req->wb_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_inode), + req->wb_bytes, + (long long)req_offset(req)); - /* We can't handle that yet but we check for it nevertheless */ - if (resp->count < argp->count && task->tk_status >= 0) { - static unsigned long complain; - if (time_before(complain, jiffies)) { - printk(KERN_WARNING - "NFS: Server wrote less than requested.\n"); - complain = jiffies + 300 * HZ; - } - /* Can't do anything about it right now except throw - * an error. */ - task->tk_status = -EIO; - } + if (status < 0) { + ClearPageUptodate(page); + SetPageError(page); + if (req->wb_file) + req->wb_file->f_error = status; + dprintk(", error = %d\n", status); + } else { #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (data->verf.committed < argp->stable && task->tk_status >= 0) { - /* We tried a write call, but the server did not - * commit data to stable storage even though we - * requested it. - * Note: There is a known bug in Tru64 < 5.0 in which - * the server reports NFS_DATA_SYNC, but performs - * NFS_FILE_SYNC. We therefore implement this checking - * as a dprintk() in order to avoid filling syslog. 
- */ - static unsigned long complain; - - if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" - " (committed = %d) != (stable = %d)\n", - NFS_SERVER(data->inode)->hostname, - data->verf.committed, argp->stable); - complain = jiffies + 300 * HZ; - } - } + if (data->verf.committed < NFS_FILE_SYNC) { + if (!NFS_NEED_COMMIT(req)) { + nfs_defer_commit(req); + memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); + dprintk(" defer commit\n"); + } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { + nfs_defer_reschedule(req); + dprintk(" server reboot detected\n"); + } + } else #endif + dprintk(" OK\n"); + } - /* - * Update attributes as result of writeback. - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. - */ + if (atomic_dec_and_test(&req->wb_complete)) + nfs_writepage_release(req); +} + +/* + * Handle a write reply that flushes a whole page. + * + * FIXME: There is an inherent race with invalidate_inode_pages and + * writebacks since the page->count is kept > 1 for as long + * as the page has a write request pending. + */ +static void nfs_writeback_done_full(struct nfs_write_data *data, int status) +{ + struct nfs_page *req; + struct page *page; + + /* Update attributes as result of writeback. 
*/ while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); @@ -907,20 +1100,20 @@ nfs_writeback_done(struct rpc_task *task req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { + if (status < 0) { ClearPageUptodate(page); SetPageError(page); if (req->wb_file) - req->wb_file->f_error = task->tk_status; + req->wb_file->f_error = status; end_page_writeback(page); nfs_inode_remove_request(req); - dprintk(", error = %d\n", task->tk_status); + dprintk(", error = %d\n", status); goto next; } end_page_writeback(page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (argp->stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) { + if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) { nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; @@ -936,13 +1129,88 @@ nfs_writeback_done(struct rpc_task *task } } +/* + * This function is called when the WRITE call is complete. + */ +void nfs_writeback_done(struct rpc_task *task) +{ + struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + + dprintk("NFS: %4d nfs_writeback_done (status %d)\n", + task->tk_pid, task->tk_status); + +#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. 
+ */ + static unsigned long complain; + + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(data->inode)->hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } + } +#endif + /* Is this a short write? */ + if (task->tk_status >= 0 && resp->count < argp->count) { + static unsigned long complain; + + /* Has the server at least made some progress? */ + if (resp->count != 0) { + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. + */ + argp->stable = NFS_FILE_SYNC; + } + rpc_restart_call(task); + return; + } + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote less than requested.\n"); + complain = jiffies + 300 * HZ; + } + /* Can't do anything about it except throw an error. */ + task->tk_status = -EIO; + } + + /* + * Process the nfs_page list + */ + data->complete(data, task->tk_status); +} + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static void nfs_commit_release(struct rpc_task *task) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; + nfs_commit_free(wdata); +} + /* * Set up the argument/result storage required for the RPC call. 
*/ -static void -nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how) +static void nfs_commit_rpcsetup(struct list_head *head, + struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; struct nfs_page *first, *last; @@ -971,7 +1239,20 @@ nfs_commit_rpcsetup(struct list_head *he data->inode = inode; data->cred = first->wb_cred; - NFS_PROTO(inode)->commit_setup(data, start, len, how); + data->args.fh = NFS_FH(data->inode); + data->args.offset = start; + data->args.count = len; + data->res.count = len; + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + + NFS_PROTO(inode)->commit_setup(data, how); + + data->task.tk_priority = flush_task_priority(how); + data->task.tk_cookie = (unsigned long)inode; + data->task.tk_calldata = data; + /* Release requests */ + data->task.tk_release = nfs_commit_release; dprintk("NFS: %4d initiated commit call\n", task->tk_pid); } @@ -982,10 +1263,8 @@ nfs_commit_rpcsetup(struct list_head *he int nfs_commit_list(struct list_head *head, int how) { - struct rpc_clnt *clnt; struct nfs_write_data *data; struct nfs_page *req; - sigset_t oldset; data = nfs_commit_alloc(); @@ -994,13 +1273,8 @@ nfs_commit_list(struct list_head *head, /* Set up the argument struct */ nfs_commit_rpcsetup(head, data, how); - clnt = NFS_CLIENT(data->inode); - rpc_clnt_sigmask(clnt, &oldset); - lock_kernel(); - rpc_execute(&data->task); - unlock_kernel(); - rpc_clnt_sigunmask(clnt, &oldset); + nfs_execute_write(data); return 0; out_bad: while (!list_empty(head)) { @@ -1061,7 +1335,7 @@ nfs_commit_done(struct rpc_task *task) } #endif -int nfs_flush_file(struct inode *inode, struct file *file, unsigned long idx_start, +int nfs_flush_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { LIST_HEAD(head); @@ -1069,7 +1343,7 @@ int nfs_flush_file(struct inode *inode, error = 0; spin_lock(&nfs_wreq_lock); - res = nfs_scan_dirty(inode, &head, file, idx_start, npages); + res = 
nfs_scan_dirty(inode, &head, idx_start, npages); spin_unlock(&nfs_wreq_lock); if (res) error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how); @@ -1079,7 +1353,7 @@ int nfs_flush_file(struct inode *inode, } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -int nfs_commit_file(struct inode *inode, struct file *file, unsigned long idx_start, +int nfs_commit_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { LIST_HEAD(head); @@ -1087,9 +1361,9 @@ int nfs_commit_file(struct inode *inode, error = 0; spin_lock(&nfs_wreq_lock); - res = nfs_scan_commit(inode, &head, file, idx_start, npages); + res = nfs_scan_commit(inode, &head, idx_start, npages); if (res) { - res += nfs_scan_commit(inode, &head, NULL, 0, 0); + res += nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&nfs_wreq_lock); error = nfs_commit_list(&head, how); } else @@ -1100,7 +1374,7 @@ int nfs_commit_file(struct inode *inode, } #endif -int nfs_sync_file(struct inode *inode, struct file *file, unsigned long idx_start, +int nfs_sync_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { int error, @@ -1109,18 +1383,15 @@ int nfs_sync_file(struct inode *inode, s wait = how & FLUSH_WAIT; how &= ~FLUSH_WAIT; - if (!inode && file) - inode = file->f_dentry->d_inode; - do { error = 0; if (wait) - error = nfs_wait_on_requests(inode, file, idx_start, npages); + error = nfs_wait_on_requests(inode, idx_start, npages); if (error == 0) - error = nfs_flush_file(inode, file, idx_start, npages, how); + error = nfs_flush_inode(inode, idx_start, npages, how); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (error == 0) - error = nfs_commit_file(inode, file, idx_start, npages, how); + error = nfs_commit_inode(inode, idx_start, npages, how); #endif } while (error > 0); return error; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/fs.h linux-2.6.4-27-nfs4mount/include/linux/fs.h --- linux-2.6.4-pre3/include/linux/fs.h 
2004-03-10 19:18:14.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/fs.h 2004-03-10 20:12:03.000000000 -0500 @@ -137,6 +137,7 @@ extern int leases_enable, dir_notify_ena #define S_DEAD 32 /* removed, but still open directory */ #define S_NOQUOTA 64 /* Inode is not counted to quota */ #define S_DIRSYNC 128 /* Directory modifications are synchronous */ +#define S_NOCMTIME 256 /* Do not update file c/mtime */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -170,6 +171,7 @@ extern int leases_enable, dir_notify_ena #define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) +#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. */ diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/lockd/debug.h linux-2.6.4-27-nfs4mount/include/linux/lockd/debug.h --- linux-2.6.4-pre3/include/linux/lockd/debug.h 2004-03-10 19:34:06.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/lockd/debug.h 2004-03-10 20:13:14.000000000 -0500 @@ -23,7 +23,7 @@ #undef ifdebug #if defined(RPC_DEBUG) && defined(LOCKD_DEBUG) -# define ifdebug(flag) if (nlm_debug & NLMDBG_##flag) +# define ifdebug(flag) if (unlikely(nlm_debug & NLMDBG_##flag)) #else # define ifdebug(flag) if (0) #endif diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/lockd/lockd.h linux-2.6.4-27-nfs4mount/include/linux/lockd/lockd.h --- linux-2.6.4-pre3/include/linux/lockd/lockd.h 2004-03-10 19:16:00.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/lockd/lockd.h 2004-03-10 20:13:34.000000000 -0500 @@ -165,6 +165,7 @@ u32 nlmsvc_cancel_blocked(struct nlm_ unsigned long nlmsvc_retry_blocked(void); int nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, int action); +void nlmsvc_grant_reply(struct svc_rqst *, 
struct nlm_cookie *, u32); /* * File handling for the server personality diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/nfs_fs.h linux-2.6.4-27-nfs4mount/include/linux/nfs_fs.h --- linux-2.6.4-pre3/include/linux/nfs_fs.h 2004-03-10 19:20:59.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/nfs_fs.h 2004-03-10 20:14:52.000000000 -0500 @@ -69,6 +69,8 @@ #define FLUSH_SYNC 1 /* file being synced, or contention */ #define FLUSH_WAIT 2 /* wait for completion */ #define FLUSH_STABLE 4 /* commit to stable storage */ +#define FLUSH_LOWPRI 8 /* low priority background flush */ +#define FLUSH_HIGHPRI 16 /* high priority memory reclaim flush */ #ifdef __KERNEL__ @@ -99,7 +101,7 @@ struct nfs_inode { /* * Various flags */ - unsigned short flags; + unsigned int flags; /* * read_cache_jiffies is when we started read-caching this inode, @@ -118,19 +120,22 @@ struct nfs_inode { * * mtime != read_cache_mtime */ + unsigned long readdir_timestamp; unsigned long read_cache_jiffies; - struct timespec read_cache_ctime; - struct timespec read_cache_mtime; - __u64 read_cache_isize; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; __u64 change_attr; /* v4 only */ + /* "Generation counter" for the attribute cache. This is + * bumped whenever we update the metadata on the + * server. + */ + unsigned long cache_change_attribute; /* - * Timestamp that dates the change made to read_cache_mtime. - * This is of use for dentry revalidation + * Counter indicating the number of outstanding requests that + * will cause a file data update. 
*/ - unsigned long cache_mtime_jiffies; + atomic_t data_updates; struct nfs_access_cache cache_access; @@ -170,8 +175,9 @@ struct nfs_inode { #define NFS_INO_STALE 0x0001 /* possible stale inode */ #define NFS_INO_ADVISE_RDPLUS 0x0002 /* advise readdirplus */ #define NFS_INO_REVALIDATING 0x0004 /* revalidating attrs */ -#define NFS_INO_FLUSH 0x0008 /* inode is due for flushing */ -#define NFS_INO_FAKE_ROOT 0x0080 /* root inode placeholder */ +#define NFS_INO_INVALID_ATTR 0x0008 /* cached attrs are invalid */ +#define NFS_INO_INVALID_DATA 0x0010 /* cached data is invalid */ +#define NFS_INO_INVALID_ATIME 0x0020 /* cached atime is invalid */ static inline struct nfs_inode *NFS_I(struct inode *inode) { @@ -186,15 +192,7 @@ static inline struct nfs_inode *NFS_I(st #define NFS_ADDR(inode) (RPC_PEERADDR(NFS_CLIENT(inode))) #define NFS_COOKIEVERF(inode) (NFS_I(inode)->cookieverf) #define NFS_READTIME(inode) (NFS_I(inode)->read_cache_jiffies) -#define NFS_MTIME_UPDATE(inode) (NFS_I(inode)->cache_mtime_jiffies) -#define NFS_CACHE_CTIME(inode) (NFS_I(inode)->read_cache_ctime) -#define NFS_CACHE_MTIME(inode) (NFS_I(inode)->read_cache_mtime) -#define NFS_CACHE_ISIZE(inode) (NFS_I(inode)->read_cache_isize) #define NFS_CHANGE_ATTR(inode) (NFS_I(inode)->change_attr) -#define NFS_CACHEINV(inode) \ -do { \ - NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; \ -} while (0) #define NFS_ATTRTIMEO(inode) (NFS_I(inode)->attrtimeo) #define NFS_MINATTRTIMEO(inode) \ (S_ISDIR(inode->i_mode)? 
NFS_SERVER(inode)->acdirmin \ @@ -207,10 +205,20 @@ do { \ #define NFS_FLAGS(inode) (NFS_I(inode)->flags) #define NFS_REVALIDATING(inode) (NFS_FLAGS(inode) & NFS_INO_REVALIDATING) #define NFS_STALE(inode) (NFS_FLAGS(inode) & NFS_INO_STALE) -#define NFS_FAKE_ROOT(inode) (NFS_FLAGS(inode) & NFS_INO_FAKE_ROOT) #define NFS_FILEID(inode) (NFS_I(inode)->fileid) +static inline int nfs_caches_unstable(struct inode *inode) +{ + return atomic_read(&NFS_I(inode)->data_updates) != 0; +} + +static inline void NFS_CACHEINV(struct inode *inode) +{ + if (!nfs_caches_unstable(inode)) + NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR; +} + static inline int nfs_server_capable(struct inode *inode, int cap) { return NFS_SERVER(inode)->caps & cap; @@ -227,13 +235,37 @@ loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_CACHE_SHIFT; } +/** + * nfs_save_change_attribute - Returns the inode attribute change cookie + * @inode - pointer to inode + * The "change attribute" is updated every time we finish an operation + * that will result in a metadata change on the server. + */ +static inline long nfs_save_change_attribute(struct inode *inode) +{ + return NFS_I(inode)->cache_change_attribute; +} + +/** + * nfs_verify_change_attribute - Detects NFS inode cache updates + * @inode - pointer to inode + * @chattr - previously saved change attribute + * Return "false" if metadata has been updated (or is in the process of + * being updated) since the change attribute was saved. 
+ */ +static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr) +{ + return !nfs_caches_unstable(inode) + && chattr == NFS_I(inode)->cache_change_attribute; +} + /* * linux/fs/nfs/inode.c */ extern void nfs_zap_caches(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *); -extern int __nfs_refresh_inode(struct inode *, struct nfs_fattr *); +extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int nfs_permission(struct inode *, int, struct nameidata *); extern void nfs_set_mmcred(struct inode *, struct rpc_cred *); @@ -241,6 +273,10 @@ extern int nfs_open(struct inode *, stru extern int nfs_release(struct inode *, struct file *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_setattr(struct dentry *, struct iattr *); +extern void nfs_begin_attr_update(struct inode *); +extern void nfs_end_attr_update(struct inode *); +extern void nfs_begin_data_update(struct inode *); +extern void nfs_end_data_update(struct inode *); /* * linux/fs/nfs/file.c @@ -298,10 +334,8 @@ extern int nfs_writepages(struct addres extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern void nfs_writeback_done(struct rpc_task *task); -extern void nfs_writedata_release(struct rpc_task *task); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -extern void nfs_commit_release(struct rpc_task *task); extern void nfs_commit_done(struct rpc_task *); #endif @@ -309,16 +343,15 @@ extern void nfs_commit_done(struct rpc_t * Try to write back everything synchronously (but check the * return value!) 
*/ -extern int nfs_sync_file(struct inode *, struct file *, unsigned long, unsigned int, int); -extern int nfs_flush_file(struct inode *, struct file *, unsigned long, unsigned int, int); +extern int nfs_sync_inode(struct inode *, unsigned long, unsigned int, int); +extern int nfs_flush_inode(struct inode *, unsigned long, unsigned int, int); extern int nfs_flush_list(struct list_head *, int, int); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -extern int nfs_commit_file(struct inode *, struct file *, unsigned long, unsigned int, int); +extern int nfs_commit_inode(struct inode *, unsigned long, unsigned int, int); extern int nfs_commit_list(struct list_head *, int); #else static inline int -nfs_commit_file(struct inode *inode, struct file *file, unsigned long offset, - unsigned int len, int flags) +nfs_commit_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { return 0; } @@ -333,29 +366,23 @@ nfs_have_writebacks(struct inode *inode) static inline int nfs_wb_all(struct inode *inode) { - int error = nfs_sync_file(inode, 0, 0, 0, FLUSH_WAIT); + int error = nfs_sync_inode(inode, 0, 0, FLUSH_WAIT); return (error < 0) ? error : 0; } /* * Write back all requests on one page - we do this before reading it. */ -static inline int -nfs_wb_page(struct inode *inode, struct page* page) +static inline int nfs_wb_page_priority(struct inode *inode, struct page* page, int how) { - int error = nfs_sync_file(inode, 0, page->index, 1, - FLUSH_WAIT | FLUSH_STABLE); + int error = nfs_sync_inode(inode, page->index, 1, + how | FLUSH_WAIT | FLUSH_STABLE); return (error < 0) ? error : 0; } -/* - * Write back all pending writes for one user.. - */ -static inline int -nfs_wb_file(struct inode *inode, struct file *file) +static inline int nfs_wb_page(struct inode *inode, struct page* page) { - int error = nfs_sync_file(inode, file, 0, 0, FLUSH_WAIT); - return (error < 0) ? 
error : 0; + return nfs_wb_page_priority(inode, page, 0); } /* Hack for future NFS swap support */ @@ -371,7 +398,6 @@ extern int nfs_readpages(struct file *, struct list_head *, unsigned); extern int nfs_pagein_list(struct list_head *, int); extern void nfs_readpage_result(struct rpc_task *); -extern void nfs_readdata_release(struct rpc_task *); /* * linux/fs/mount_clnt.c @@ -383,20 +409,27 @@ extern int nfsroot_mount(struct sockadd /* * inline functions */ -static inline int -nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) + +static inline int nfs_attribute_timeout(struct inode *inode) { - if (time_before(jiffies, NFS_READTIME(inode)+NFS_ATTRTIMEO(inode))) - return NFS_STALE(inode) ? -ESTALE : 0; - return __nfs_revalidate_inode(server, inode); + struct nfs_inode *nfsi = NFS_I(inode); + + return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); } -static inline int -nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) +/** + * nfs_revalidate_inode - Revalidate the inode attributes + * @server - pointer to nfs_server struct + * @inode - pointer to inode struct + * + * Updates inode attribute information by retrieving the data from the server. + */ +static inline int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if ((fattr->valid & NFS_ATTR_FATTR) == 0) - return 0; - return __nfs_refresh_inode(inode,fattr); + if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + && !nfs_attribute_timeout(inode)) + return NFS_STALE(inode) ? 
-ESTALE : 0; + return __nfs_revalidate_inode(server, inode); } static inline loff_t @@ -661,7 +694,7 @@ struct nfs4_mount_data; #ifdef __KERNEL__ # undef ifdebug # ifdef NFS_DEBUG -# define ifdebug(fac) if (nfs_debug & NFSDBG_##fac) +# define ifdebug(fac) if (unlikely(nfs_debug & NFSDBG_##fac)) # else # define ifdebug(fac) if (0) # endif diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/nfs_page.h linux-2.6.4-27-nfs4mount/include/linux/nfs_page.h --- linux-2.6.4-pre3/include/linux/nfs_page.h 2004-03-10 19:13:57.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/nfs_page.h 2004-03-10 20:14:44.000000000 -0500 @@ -17,10 +17,14 @@ #include #include +#include + /* * Valid flags for a dirty buffer */ #define PG_BUSY 0 +#define PG_NEED_COMMIT 1 +#define PG_NEED_RESCHED 2 struct nfs_page { struct list_head wb_list, /* Defines state of page: */ @@ -31,6 +35,7 @@ struct nfs_page { struct rpc_cred *wb_cred; struct nfs4_state *wb_state; struct page *wb_page; /* page to read in/write out */ + atomic_t wb_complete; /* i/os we're waiting for */ wait_queue_head_t wb_wait; /* wait queue */ unsigned long wb_index; /* Offset >> PAGE_CACHE_SHIFT */ unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ @@ -42,6 +47,8 @@ struct nfs_page { }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +#define NFS_NEED_COMMIT(req) (test_bit(PG_NEED_COMMIT,&(req)->wb_flags)) +#define NFS_NEED_RESCHED(req) (test_bit(PG_NEED_RESCHED,&(req)->wb_flags)) extern struct nfs_page *nfs_create_request(struct file *, struct inode *, struct page *, @@ -53,7 +60,7 @@ extern void nfs_release_request(struct n extern void nfs_list_add_request(struct nfs_page *, struct list_head *); extern int nfs_scan_list(struct list_head *, struct list_head *, - struct file *, unsigned long, unsigned int); + unsigned long, unsigned int); extern int nfs_coalesce_requests(struct list_head *, struct list_head *, unsigned int); extern int nfs_wait_on_request(struct nfs_page *); 
@@ -93,8 +100,7 @@ nfs_unlock_request(struct nfs_page *req) smp_mb__before_clear_bit(); clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_clear_bit(); - if (waitqueue_active(&req->wb_wait)) - wake_up_all(&req->wb_wait); + wake_up_all(&req->wb_wait); nfs_release_request(req); } @@ -115,6 +121,38 @@ nfs_list_remove_request(struct nfs_page req->wb_list_head = NULL; } +static inline int +nfs_defer_commit(struct nfs_page *req) +{ + if (test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) + return 0; + return 1; +} + +static inline void +nfs_clear_commit(struct nfs_page *req) +{ + smp_mb__before_clear_bit(); + clear_bit(PG_NEED_COMMIT, &req->wb_flags); + smp_mb__after_clear_bit(); +} + +static inline int +nfs_defer_reschedule(struct nfs_page *req) +{ + if (test_and_set_bit(PG_NEED_RESCHED, &req->wb_flags)) + return 0; + return 1; +} + +static inline void +nfs_clear_reschedule(struct nfs_page *req) +{ + smp_mb__before_clear_bit(); + clear_bit(PG_NEED_RESCHED, &req->wb_flags); + smp_mb__after_clear_bit(); +} + static inline struct nfs_page * nfs_list_entry(struct list_head *head) { diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/nfs_xdr.h linux-2.6.4-27-nfs4mount/include/linux/nfs_xdr.h --- linux-2.6.4-pre3/include/linux/nfs_xdr.h 2004-03-10 19:34:32.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/nfs_xdr.h 2004-03-10 20:14:30.000000000 -0500 @@ -229,7 +229,8 @@ struct nfs_lockres { struct nfs_readargs { struct nfs_fh * fh; - nfs4_stateid stateid; + fl_owner_t lockowner; + struct nfs4_state * state; __u64 offset; __u32 count; unsigned int pgbase; @@ -252,7 +253,8 @@ struct nfs_readres { struct nfs_writeargs { struct nfs_fh * fh; - nfs4_stateid stateid; + fl_owner_t lockowner; + struct nfs4_state * state; __u64 offset; __u32 count; enum nfs3_stable_how stable; @@ -656,20 +658,23 @@ struct nfs4_compound { #endif /* CONFIG_NFS_V4 */ +struct nfs_page; + struct nfs_read_data { int flags; struct rpc_task task; struct inode *inode; struct 
rpc_cred *cred; - fl_owner_t lockowner; struct nfs_fattr fattr; /* fattr storage */ struct list_head pages; /* Coalesced read requests */ + struct nfs_page *req; /* multi ops per nfs_page */ struct page *pagevec[NFS_READ_MAXIOV]; struct nfs_readargs args; struct nfs_readres res; #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif + void (*complete) (struct nfs_read_data *, int); }; struct nfs_write_data { @@ -677,20 +682,19 @@ struct nfs_write_data { struct rpc_task task; struct inode *inode; struct rpc_cred *cred; - fl_owner_t lockowner; struct nfs_fattr fattr; struct nfs_writeverf verf; struct list_head pages; /* Coalesced requests we wish to flush */ + struct nfs_page *req; /* multi ops per nfs_page */ struct page *pagevec[NFS_WRITE_MAXIOV]; struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif + void (*complete) (struct nfs_write_data *, int); }; -struct nfs_page; - /* * RPC procedure vector for NFSv2/NFSv3 demuxing */ @@ -700,7 +704,7 @@ struct nfs_rpc_ops { struct inode_operations *dir_inode_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *); + struct nfs_fsinfo *); int (*getattr) (struct inode *, struct nfs_fattr *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); @@ -737,9 +741,9 @@ struct nfs_rpc_ops { int (*pathconf) (struct nfs_server *, struct nfs_fh *, struct nfs_pathconf *); u32 * (*decode_dirent)(u32 *, struct nfs_entry *, int plus); - void (*read_setup) (struct nfs_read_data *, unsigned int count); - void (*write_setup) (struct nfs_write_data *, unsigned int count, int how); - void (*commit_setup) (struct nfs_write_data *, u64 start, u32 len, int how); + void (*read_setup) (struct nfs_read_data *); + void (*write_setup) (struct nfs_write_data *, int how); + void (*commit_setup) (struct nfs_write_data *, int how); int (*file_open) (struct inode *, struct file 
*); int (*file_release) (struct inode *, struct file *); void (*request_init)(struct nfs_page *, struct file *); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/sunrpc/debug.h linux-2.6.4-27-nfs4mount/include/linux/sunrpc/debug.h --- linux-2.6.4-pre3/include/linux/sunrpc/debug.h 2004-03-10 19:41:49.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/sunrpc/debug.h 2004-03-10 20:13:55.000000000 -0500 @@ -54,7 +54,7 @@ extern unsigned int nlm_debug; #undef ifdebug #ifdef RPC_DEBUG -# define ifdebug(fac) if (rpc_debug & RPCDBG_##fac) +# define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) # define dfprintk(fac, args...) do { ifdebug(fac) printk(args); } while(0) # define RPC_IFDEBUG(x) x #else @@ -92,6 +92,8 @@ enum { CTL_NFSDEBUG, CTL_NFSDDEBUG, CTL_NLMDEBUG, + CTL_SLOTTABLE_UDP, + CTL_SLOTTABLE_TCP, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/sunrpc/sched.h linux-2.6.4-27-nfs4mount/include/linux/sunrpc/sched.h --- linux-2.6.4-pre3/include/linux/sunrpc/sched.h 2004-03-10 19:30:15.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/sunrpc/sched.h 2004-03-10 20:14:52.000000000 -0500 @@ -49,6 +49,8 @@ struct rpc_task { tk_cred_retry, tk_suid_retry; + unsigned long tk_cookie; /* Cookie for batching tasks */ + /* * timeout_fn to be executed by timer bottom half * callback to be executed after waking up @@ -72,7 +74,9 @@ struct rpc_task { unsigned long tk_timeout; /* timeout for rpc_sleep() */ unsigned short tk_flags; /* misc flags */ unsigned char tk_active : 1;/* Task has been activated */ + unsigned char tk_priority : 2;/* Task priority */ unsigned long tk_runstate; /* Task run status */ + struct list_head tk_links; /* links to related tasks */ #ifdef RPC_DEBUG unsigned short tk_pid; /* debugging aid */ #endif @@ -138,28 +142,58 @@ typedef void (*rpc_action)(struct rpc_ } while(0) /* + * Task priorities. 
+ * Note: if you change these, you must also change + * the task initialization definitions below. + */ +#define RPC_PRIORITY_LOW 0 +#define RPC_PRIORITY_NORMAL 1 +#define RPC_PRIORITY_HIGH 2 +#define RPC_NR_PRIORITY (RPC_PRIORITY_HIGH+1) + +/* * RPC synchronization objects */ struct rpc_wait_queue { - struct list_head tasks; + struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ + unsigned long cookie; /* cookie of last task serviced */ + unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ + unsigned char priority; /* current priority */ + unsigned char count; /* # task groups remaining serviced so far */ + unsigned char nr; /* # tasks remaining for cookie */ #ifdef RPC_DEBUG - char * name; + const char * name; #endif }; +/* + * This is the # requests to send consecutively + * from a single cookie. The aim is to improve + * performance of NFS operations such as read/write. + */ +#define RPC_BATCH_COUNT 16 + #ifndef RPC_DEBUG -# define RPC_WAITQ_INIT(var,qname) ((struct rpc_wait_queue) {LIST_HEAD_INIT(var)}) -# define RPC_WAITQ(var,qname) struct rpc_wait_queue var = RPC_WAITQ_INIT(var.tasks,qname) -# define INIT_RPC_WAITQ(ptr,qname) do { \ - INIT_LIST_HEAD(&(ptr)->tasks); \ - } while(0) +# define RPC_WAITQ_INIT(var,qname) { \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ + [2] = LIST_HEAD_INIT(var.tasks[2]), \ + }, \ + } #else -# define RPC_WAITQ_INIT(var,qname) ((struct rpc_wait_queue) {LIST_HEAD_INIT(var.tasks), qname}) -# define RPC_WAITQ(var,qname) struct rpc_wait_queue var = RPC_WAITQ_INIT(var,qname) -# define INIT_RPC_WAITQ(ptr,qname) do { \ - INIT_LIST_HEAD(&(ptr)->tasks); (ptr)->name = qname; \ - } while(0) +# define RPC_WAITQ_INIT(var,qname) { \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ + [2] = LIST_HEAD_INIT(var.tasks[2]), \ + }, \ + .name = qname, \ + } #endif +# define RPC_WAITQ(var,qname) 
struct rpc_wait_queue var = RPC_WAITQ_INIT(var,qname) + +#define RPC_IS_PRIORITY(q) ((q)->maxpriority > 0) /* * Function prototypes @@ -175,6 +209,8 @@ void rpc_run_child(struct rpc_task *par rpc_action action); int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *); void rpc_remove_wait_queue(struct rpc_task *); +void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); +void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action, rpc_action timer); void rpc_add_timer(struct rpc_task *, rpc_action); @@ -194,16 +230,14 @@ void rpc_show_tasks(void); int rpc_init_mempool(void); void rpc_destroy_mempool(void); -static __inline__ void -rpc_exit(struct rpc_task *task, int status) +static inline void rpc_exit(struct rpc_task *task, int status) { task->tk_status = status; task->tk_action = NULL; } #ifdef RPC_DEBUG -static __inline__ char * -rpc_qname(struct rpc_wait_queue *q) +static inline const char * rpc_qname(struct rpc_wait_queue *q) { return ((q && q->name) ? 
q->name : "unknown"); } diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/sunrpc/timer.h linux-2.6.4-27-nfs4mount/include/linux/sunrpc/timer.h --- linux-2.6.4-pre3/include/linux/sunrpc/timer.h 2004-03-10 19:27:16.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/sunrpc/timer.h 2004-03-10 20:12:55.000000000 -0500 @@ -25,9 +25,18 @@ extern unsigned long rpc_calc_rto(struct static inline void rpc_set_timeo(struct rpc_rtt *rt, int timer, int ntimeo) { + int *t; if (!timer) return; - rt->ntimeouts[timer-1] = ntimeo; + t = &rt->ntimeouts[timer-1]; + if (ntimeo < *t) { + if (*t > 0) + (*t)--; + } else { + if (ntimeo > 8) + ntimeo = 8; + *t = ntimeo; + } } static inline int rpc_ntimeo(struct rpc_rtt *rt, int timer) diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/sunrpc/xdr.h linux-2.6.4-27-nfs4mount/include/linux/sunrpc/xdr.h --- linux-2.6.4-pre3/include/linux/sunrpc/xdr.h 2004-03-10 19:40:28.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/sunrpc/xdr.h 2004-03-10 20:13:48.000000000 -0500 @@ -87,7 +87,7 @@ struct xdr_buf { /* * Miscellaneous XDR helper functions */ -u32 * xdr_encode_array(u32 *p, const char *s, unsigned int len); +u32 * xdr_encode_array(u32 *p, const void *s, unsigned int len); u32 * xdr_encode_string(u32 *p, const char *s); u32 * xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen); u32 * xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/include/linux/sunrpc/xprt.h linux-2.6.4-27-nfs4mount/include/linux/sunrpc/xprt.h --- linux-2.6.4-pre3/include/linux/sunrpc/xprt.h 2004-03-10 19:32:03.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/include/linux/sunrpc/xprt.h 2004-03-10 20:13:55.000000000 -0500 @@ -28,16 +28,18 @@ * * Upper procedures may check whether a request would block waiting for * a free RPC slot by using the RPC_CONGESTED() macro. 
- * - * Note: on machines with low memory we should probably use a smaller - * MAXREQS value: At 32 outstanding reqs with 8 megs of RAM, fragment - * reassembly will frequently run out of memory. - */ -#define RPC_MAXCONG (16) -#define RPC_MAXREQS RPC_MAXCONG -#define RPC_CWNDSCALE (256) -#define RPC_MAXCWND (RPC_MAXCONG * RPC_CWNDSCALE) + */ +extern unsigned int xprt_udp_slot_table_entries; +extern unsigned int xprt_tcp_slot_table_entries; + +#define RPC_MIN_SLOT_TABLE (2U) +#define RPC_DEF_SLOT_TABLE (16U) +#define RPC_MAX_SLOT_TABLE (128U) + +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) #define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) /* Default timeout values */ @@ -92,7 +94,6 @@ struct rpc_rqst { */ struct rpc_task * rq_task; /* RPC task data */ __u32 rq_xid; /* request XID */ - struct rpc_rqst * rq_next; /* free list */ int rq_cong; /* has incremented xprt->cong */ int rq_received; /* receive completed */ u32 rq_seqno; /* gss seq no. used on req. */ @@ -102,7 +103,6 @@ struct rpc_rqst { struct xdr_buf rq_private_buf; /* The receive buffer * used in the softirq. */ - /* * For authentication (e.g. 
auth_des) */ @@ -146,8 +146,9 @@ struct rpc_xprt { struct rpc_wait_queue resend; /* requests waiting to resend */ struct rpc_wait_queue pending; /* requests in flight */ struct rpc_wait_queue backlog; /* waiting for slot */ - struct rpc_rqst * free; /* free slots */ - struct rpc_rqst slot[RPC_MAXREQS]; + struct list_head free; /* free slots */ + struct rpc_rqst * slot; /* slot table storage */ + unsigned int max_reqs; /* total slots */ unsigned long sockstate; /* Socket state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ @@ -155,6 +156,11 @@ struct rpc_xprt { stream : 1; /* TCP */ /* + * XID + */ + __u32 xid; /* Next XID value to use */ + + /* * State of TCP reply receive stuff */ u32 tcp_recm, /* Fragment header */ @@ -164,6 +170,11 @@ struct rpc_xprt { unsigned long tcp_copied, /* copied to request */ tcp_flags; /* + * Connection of sockets + */ + struct work_struct sock_connect; + unsigned short port; + /* * Disconnection of idle sockets */ struct work_struct task_cleanup; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/auth_gss/auth_gss.c linux-2.6.4-27-nfs4mount/net/sunrpc/auth_gss/auth_gss.c --- linux-2.6.4-pre3/net/sunrpc/auth_gss/auth_gss.c 2004-03-10 19:23:39.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/auth_gss/auth_gss.c 2004-03-10 20:14:52.000000000 -0500 @@ -365,7 +365,7 @@ retry: gss_msg = gss_new; memset(gss_new, 0, sizeof(*gss_new)); INIT_LIST_HEAD(&gss_new->list); - INIT_RPC_WAITQ(&gss_new->waitq, "RPCSEC_GSS upcall waitq"); + rpc_init_wait_queue(&gss_new->waitq, "RPCSEC_GSS upcall waitq"); atomic_set(&gss_new->count, 2); msg = &gss_new->msg; msg->data = &gss_new->uid; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/auth_unix.c linux-2.6.4-27-nfs4mount/net/sunrpc/auth_unix.c --- linux-2.6.4-pre3/net/sunrpc/auth_unix.c 2004-03-10 19:27:45.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/auth_unix.c 2004-03-10 20:13:48.000000000 
-0500 @@ -149,7 +149,7 @@ unx_marshal(struct rpc_task *task, u32 * struct rpc_clnt *clnt = task->tk_client; struct unx_cred *cred = (struct unx_cred *) task->tk_msg.rpc_cred; u32 *base, *hold; - int i, n; + int i; *p++ = htonl(RPC_AUTH_UNIX); base = p++; @@ -158,10 +158,7 @@ unx_marshal(struct rpc_task *task, u32 * /* * Copy the UTS nodename captured when the client was created. */ - n = clnt->cl_nodelen; - *p++ = htonl(n); - memcpy(p, clnt->cl_nodename, n); - p += (n + 3) >> 2; + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); /* Note: we don't use real uid if it involves raising privilege */ if (ruid && cred->uc_puid != 0 && cred->uc_pgid != 0) { diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/clnt.c linux-2.6.4-27-nfs4mount/net/sunrpc/clnt.c --- linux-2.6.4-pre3/net/sunrpc/clnt.c 2004-03-10 19:21:33.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/clnt.c 2004-03-10 20:14:52.000000000 -0500 @@ -102,19 +102,22 @@ rpc_create_client(struct rpc_xprt *xprt, { struct rpc_version *version; struct rpc_clnt *clnt = NULL; + int err; int len; dprintk("RPC: creating %s client for %s (xprt %p)\n", program->name, servname, xprt); + err = -EINVAL; if (!xprt) - goto out; + goto out_err; if (vers >= program->nrvers || !(version = program->version[vers])) - goto out; + goto out_err; + err = -ENOMEM; clnt = (struct rpc_clnt *) kmalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) - goto out_no_clnt; + goto out_err; memset(clnt, 0, sizeof(*clnt)); atomic_set(&clnt->cl_users, 0); atomic_set(&clnt->cl_count, 1); @@ -141,7 +144,7 @@ rpc_create_client(struct rpc_xprt *xprt, clnt->cl_vers = version->number; clnt->cl_prot = xprt->prot; clnt->cl_stats = program->stats; - INIT_RPC_WAITQ(&clnt->cl_pmap_default.pm_bindwait, "bindwait"); + rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait"); if (!clnt->cl_port) clnt->cl_autobind = 1; @@ -149,9 +152,11 @@ rpc_create_client(struct rpc_xprt *xprt, clnt->cl_rtt = &clnt->cl_rtt_default; 
rpc_init_rtt(&clnt->cl_rtt_default, xprt->timeout.to_initval); - if (rpc_setup_pipedir(clnt, program->pipe_dir_name) < 0) + err = rpc_setup_pipedir(clnt, program->pipe_dir_name); + if (err < 0) goto out_no_path; + err = -ENOMEM; if (!rpcauth_create(flavor, clnt)) { printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", flavor); @@ -163,20 +168,16 @@ rpc_create_client(struct rpc_xprt *xprt, if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); -out: return clnt; -out_no_clnt: - printk(KERN_INFO "RPC: out of memory in rpc_create_client\n"); - goto out; out_no_auth: rpc_rmdir(clnt->cl_pathname); out_no_path: if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); kfree(clnt); - clnt = NULL; - goto out; +out_err: + return ERR_PTR(err); } /* @@ -198,11 +199,10 @@ rpc_clone_client(struct rpc_clnt *clnt) atomic_inc(&new->cl_parent->cl_count); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); -out: return new; out_no_clnt: printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__); - goto out; + return ERR_PTR(-ENOMEM); } /* @@ -611,9 +611,6 @@ call_encode(struct rpc_task *task) rcvbuf->page_len = 0; rcvbuf->len = bufsiz; - /* Zero buffer so we have automatic zero-padding of opaque & string */ - memset(task->tk_buffer, 0, bufsiz); - /* Encode header and provided arguments */ encode = task->tk_msg.rpc_proc->p_encode; if (!(p = call_header(task))) { diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/pmap_clnt.c linux-2.6.4-27-nfs4mount/net/sunrpc/pmap_clnt.c --- linux-2.6.4-pre3/net/sunrpc/pmap_clnt.c 2004-03-10 19:24:58.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/pmap_clnt.c 2004-03-10 20:13:08.000000000 -0500 @@ -65,9 +65,11 @@ rpc_getport(struct rpc_task *task, struc map->pm_binding = 1; spin_unlock(&pmap_lock); - task->tk_status = -EACCES; /* why set this? 
returns -EIO below */ - if (!(pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot))) + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + if (IS_ERR(pmap_clnt)) { + task->tk_status = PTR_ERR(pmap_clnt); goto bailout; + } task->tk_status = 0; /* @@ -110,8 +112,9 @@ rpc_getport_external(struct sockaddr_in NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); - if (!(pmap_clnt = pmap_create(hostname, sin, prot))) - return -EACCES; + pmap_clnt = pmap_create(hostname, sin, prot); + if (IS_ERR(pmap_clnt)) + return PTR_ERR(pmap_clnt); /* Setup the call info struct */ status = rpc_call(pmap_clnt, PMAP_GETPORT, &map, &map.pm_port, 0); @@ -161,16 +164,18 @@ rpc_register(u32 prog, u32 vers, int pro struct sockaddr_in sin; struct rpc_portmap map; struct rpc_clnt *pmap_clnt; - unsigned int error = 0; + int error = 0; dprintk("RPC: registering (%d, %d, %d, %d) with portmapper.\n", prog, vers, prot, port); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (!(pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP))) { - dprintk("RPC: couldn't create pmap client\n"); - return -EACCES; + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + if (IS_ERR(pmap_clnt)) { + error = PTR_ERR(pmap_clnt); + dprintk("RPC: couldn't create pmap client. 
Error = %d\n", error); + return error; } map.pm_prog = prog; @@ -199,15 +204,16 @@ pmap_create(char *hostname, struct socka struct rpc_clnt *clnt; /* printk("pmap: create xprt\n"); */ - if (!(xprt = xprt_create_proto(proto, srvaddr, NULL))) - return NULL; + xprt = xprt_create_proto(proto, srvaddr, NULL); + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); /* printk("pmap: create clnt\n"); */ clnt = rpc_create_client(xprt, hostname, &pmap_program, RPC_PMAP_VERSION, RPC_AUTH_NULL); - if (!clnt) { + if (IS_ERR(clnt)) { xprt_destroy(xprt); } else { clnt->cl_softrtry = 1; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/sched.c linux-2.6.4-27-nfs4mount/net/sunrpc/sched.c --- linux-2.6.4-pre3/net/sunrpc/sched.c 2004-03-10 19:26:21.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/sched.c 2004-03-10 20:15:00.000000000 -0500 @@ -162,6 +162,26 @@ rpc_delete_timer(struct rpc_task *task) } /* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; + list_for_each_entry(t, q, tk_list) { + if (t->tk_cookie == task->tk_cookie) { + list_add_tail(&task->tk_list, &t->tk_links); + return; + } + } + list_add_tail(&task->tk_list, q); +} + +/* * Add new request to wait queue. * * Swapper tasks always get inserted at the head of the queue. @@ -169,8 +189,7 @@ rpc_delete_timer(struct rpc_task *task) * improve overall performance. * Everyone else gets appended to the queue to ensure proper FIFO behavior. 
*/ -static inline int -__rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { if (task->tk_rpcwait == queue) return 0; @@ -179,10 +198,12 @@ __rpc_add_wait_queue(struct rpc_wait_que printk(KERN_WARNING "RPC: doubly enqueued task!\n"); return -EWOULDBLOCK; } - if (RPC_IS_SWAPPER(task)) - list_add(&task->tk_list, &queue->tasks); + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) + list_add(&task->tk_list, &queue->tasks[0]); else - list_add_tail(&task->tk_list, &queue->tasks); + list_add_tail(&task->tk_list, &queue->tasks[0]); task->tk_rpcwait = queue; dprintk("RPC: %4d added to queue %p \"%s\"\n", @@ -191,8 +212,7 @@ __rpc_add_wait_queue(struct rpc_wait_que return 0; } -int -rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) +int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) { int result; @@ -203,18 +223,35 @@ rpc_add_wait_queue(struct rpc_wait_queue } /* + * Remove request from a priority queue. + */ +static void __rpc_remove_wait_queue_priority(struct rpc_task *task) +{ + struct rpc_task *t; + + if (!list_empty(&task->tk_links)) { + t = list_entry(task->tk_links.next, struct rpc_task, tk_list); + list_move(&t->tk_list, &task->tk_list); + list_splice_init(&task->tk_links, &t->tk_links); + } + list_del(&task->tk_list); +} + +/* * Remove request from queue. * Note: must be called with spin lock held. 
*/ -static inline void -__rpc_remove_wait_queue(struct rpc_task *task) +static void __rpc_remove_wait_queue(struct rpc_task *task) { struct rpc_wait_queue *queue = task->tk_rpcwait; if (!queue) return; - list_del(&task->tk_list); + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else + list_del(&task->tk_list); task->tk_rpcwait = NULL; dprintk("RPC: %4d removed from queue %p \"%s\"\n", @@ -231,6 +268,48 @@ rpc_remove_wait_queue(struct rpc_task *t spin_unlock_bh(&rpc_queue_lock); } +static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; + queue->count = 1 << (priority * 2); +} + +static inline void rpc_set_waitqueue_cookie(struct rpc_wait_queue *queue, unsigned long cookie) +{ + queue->cookie = cookie; + queue->nr = RPC_BATCH_COUNT; +} + +static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_cookie(queue, 0); +} + +static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, int maxprio) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; + rpc_reset_waitqueue_priority(queue); +#ifdef RPC_DEBUG + queue->name = qname; +#endif +} + +void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, RPC_PRIORITY_HIGH); +} + +void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, 0); +} +EXPORT_SYMBOL(rpc_init_wait_queue); + /* * Make an RPC task runnable. 
* @@ -255,13 +334,11 @@ rpc_make_runnable(struct rpc_task *task) return; } rpc_clear_sleeping(task); - if (waitqueue_active(&rpciod_idle)) - wake_up(&rpciod_idle); + wake_up(&rpciod_idle); } } else { rpc_clear_sleeping(task); - if (waitqueue_active(&task->tk_wait)) - wake_up(&task->tk_wait); + wake_up(&task->tk_wait); } } @@ -287,8 +364,7 @@ void rpciod_wake_up(void) { if(rpciod_pid==0) printk(KERN_ERR "rpciod: wot no daemon?\n"); - if (waitqueue_active(&rpciod_idle)) - wake_up(&rpciod_idle); + wake_up(&rpciod_idle); } /* @@ -406,17 +482,72 @@ rpc_wake_up_task(struct rpc_task *task) } /* + * Wake up the next task on a priority queue. + */ +static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue) +{ + struct list_head *q; + struct rpc_task *task; + + /* + * Service a batch of tasks from a single cookie. + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, tk_list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; + list_move_tail(&task->tk_list, q); + } + /* + * Check if we need to switch queues. + */ + if (--queue->count) + goto new_cookie; + } + + /* + * Service the next queue. + */ + do { + if (q == &queue->tasks[0]) + q = &queue->tasks[queue->maxpriority]; + else + q = q - 1; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, tk_list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); + + rpc_reset_waitqueue_priority(queue); + return NULL; + +new_queue: + rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); +new_cookie: + rpc_set_waitqueue_cookie(queue, task->tk_cookie); +out: + __rpc_wake_up_task(task); + return task; +} + +/* * Wake up the next task on the wait queue. 
*/ -struct rpc_task * -rpc_wake_up_next(struct rpc_wait_queue *queue) +struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) { struct rpc_task *task = NULL; dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); spin_lock_bh(&rpc_queue_lock); - task_for_first(task, &queue->tasks) - __rpc_wake_up_task(task); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } spin_unlock_bh(&rpc_queue_lock); return task; @@ -428,15 +559,22 @@ rpc_wake_up_next(struct rpc_wait_queue * * * Grabs rpc_queue_lock */ -void -rpc_wake_up(struct rpc_wait_queue *queue) +void rpc_wake_up(struct rpc_wait_queue *queue) { struct rpc_task *task; + struct list_head *head; spin_lock_bh(&rpc_queue_lock); - while (!list_empty(&queue->tasks)) - task_for_first(task, &queue->tasks) + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, tk_list); __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } spin_unlock_bh(&rpc_queue_lock); } @@ -447,17 +585,22 @@ rpc_wake_up(struct rpc_wait_queue *queue * * Grabs rpc_queue_lock */ -void -rpc_wake_up_status(struct rpc_wait_queue *queue, int status) +void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { + struct list_head *head; struct rpc_task *task; spin_lock_bh(&rpc_queue_lock); - while (!list_empty(&queue->tasks)) { - task_for_first(task, &queue->tasks) { + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, tk_list); task->tk_status = status; __rpc_wake_up_task(task); } + if (head == &queue->tasks[0]) + break; + head--; } spin_unlock_bh(&rpc_queue_lock); } @@ -530,6 +673,9 @@ __rpc_execute(struct rpc_task *task) if (!task->tk_action) break; task->tk_action(task); + /* micro-optimization to avoid spinlock */ + if 
(RPC_IS_RUNNING(task)) + continue; } /* @@ -545,29 +691,31 @@ __rpc_execute(struct rpc_task *task) } spin_unlock_bh(&rpc_queue_lock); - while (RPC_IS_SLEEPING(task)) { - /* sync task: sleep here */ - dprintk("RPC: %4d sync task going to sleep\n", - task->tk_pid); - if (current->pid == rpciod_pid) - printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); + if (!RPC_IS_SLEEPING(task)) + continue; + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); + if (current->pid == rpciod_pid) + printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); + if (!task->tk_client->cl_intr) { __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); - dprintk("RPC: %4d sync task resuming\n", task->tk_pid); - + } else { + __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status); /* * When a sync task receives a signal, it exits with * -ERESTARTSYS. In order to catch any callbacks that * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. 
*/ - if (task->tk_client->cl_intr && signalled()) { + if (status == -ERESTARTSYS) { dprintk("RPC: %4d got signal\n", task->tk_pid); task->tk_flags |= RPC_TASK_KILLED; rpc_exit(task, -ERESTARTSYS); rpc_wake_up_task(task); } } + dprintk("RPC: %4d sync task resuming\n", task->tk_pid); } if (task->tk_exit) { @@ -638,21 +786,22 @@ __rpc_schedule(void) dprintk("RPC: rpc_schedule enter\n"); while (1) { - spin_lock_bh(&rpc_queue_lock); - task_for_first(task, &schedq.tasks) { + task_for_first(task, &schedq.tasks[0]) { __rpc_remove_wait_queue(task); spin_unlock_bh(&rpc_queue_lock); __rpc_execute(task); + spin_lock_bh(&rpc_queue_lock); } else { - spin_unlock_bh(&rpc_queue_lock); break; } if (++count >= 200 || need_resched()) { count = 0; + spin_unlock_bh(&rpc_queue_lock); schedule(); + spin_lock_bh(&rpc_queue_lock); } } dprintk("RPC: rpc_schedule leave\n"); @@ -704,9 +853,7 @@ rpc_free(struct rpc_task *task) /* * Creation and deletion of RPC task structures */ -inline void -rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, - rpc_action callback, int flags) +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags) { memset(task, 0, sizeof(*task)); init_timer(&task->tk_timer); @@ -724,6 +871,10 @@ rpc_init_task(struct rpc_task *task, str task->tk_cred_retry = 2; task->tk_suid_retry = 1; + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; + INIT_LIST_HEAD(&task->tk_links); + /* Add to global list of all tasks */ spin_lock(&rpc_sched_lock); list_add(&task->tk_task, &all_tasks); @@ -861,7 +1012,7 @@ rpc_find_parent(struct rpc_task *child) struct list_head *le; parent = (struct rpc_task *) child->tk_calldata; - task_for_each(task, le, &childq.tasks) + task_for_each(task, le, &childq.tasks[0]) if (task == parent) return parent; @@ -941,7 +1092,7 @@ static DECLARE_MUTEX_LOCKED(rpciod_runni static inline int rpciod_task_pending(void) { - return !list_empty(&schedq.tasks); + return 
!list_empty(&schedq.tasks[0]); } @@ -964,27 +1115,41 @@ rpciod(void *ptr) allow_signal(SIGKILL); dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); + spin_lock_bh(&rpc_queue_lock); while (rpciod_users) { + DEFINE_WAIT(wait); if (signalled()) { + spin_unlock_bh(&rpc_queue_lock); rpciod_killall(); flush_signals(current); + spin_lock_bh(&rpc_queue_lock); } __rpc_schedule(); - if (current->flags & PF_FREEZE) + if (current->flags & PF_FREEZE) { + spin_unlock_bh(&rpc_queue_lock); refrigerator(PF_IOTHREAD); + spin_lock_bh(&rpc_queue_lock); + } if (++rounds >= 64) { /* safeguard */ + spin_unlock_bh(&rpc_queue_lock); schedule(); rounds = 0; + spin_lock_bh(&rpc_queue_lock); } - if (!rpciod_task_pending()) { - dprintk("RPC: rpciod back to sleep\n"); - wait_event_interruptible(rpciod_idle, rpciod_task_pending()); - dprintk("RPC: switch to rpciod\n"); + dprintk("RPC: rpciod back to sleep\n"); + prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE); + if (!rpciod_task_pending() && !signalled()) { + spin_unlock_bh(&rpc_queue_lock); + schedule(); rounds = 0; + spin_lock_bh(&rpc_queue_lock); } + finish_wait(&rpciod_idle, &wait); + dprintk("RPC: switch to rpciod\n"); } + spin_unlock_bh(&rpc_queue_lock); dprintk("RPC: rpciod shutdown commences\n"); if (!list_empty(&all_tasks)) { @@ -1008,7 +1173,9 @@ rpciod_killall(void) while (!list_empty(&all_tasks)) { clear_thread_flag(TIF_SIGPENDING); rpc_killall_tasks(NULL); + spin_lock_bh(&rpc_queue_lock); __rpc_schedule(); + spin_unlock_bh(&rpc_queue_lock); if (!list_empty(&all_tasks)) { dprintk("rpciod_killall: waiting for tasks to exit\n"); yield(); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/sunrpc_syms.c linux-2.6.4-27-nfs4mount/net/sunrpc/sunrpc_syms.c --- linux-2.6.4-pre3/net/sunrpc/sunrpc_syms.c 2004-03-10 19:35:31.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/sunrpc_syms.c 2004-03-10 20:13:55.000000000 -0500 @@ -63,6 +63,8 @@ EXPORT_SYMBOL(rpc_mkpipe); EXPORT_SYMBOL(xprt_create_proto); 
EXPORT_SYMBOL(xprt_destroy); EXPORT_SYMBOL(xprt_set_timeout); +EXPORT_SYMBOL(xprt_udp_slot_table_entries); +EXPORT_SYMBOL(xprt_tcp_slot_table_entries); /* Client credential cache */ EXPORT_SYMBOL(rpcauth_register); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/sysctl.c linux-2.6.4-27-nfs4mount/net/sunrpc/sysctl.c --- linux-2.6.4-pre3/net/sunrpc/sysctl.c 2004-03-10 19:13:17.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/sysctl.c 2004-03-10 20:13:55.000000000 -0500 @@ -1,7 +1,7 @@ /* * linux/net/sunrpc/sysctl.c * - * Sysctl interface to sunrpc module. This is for debugging only now. + * Sysctl interface to sunrpc module. * * I would prefer to register the sunrpc table below sys/net, but that's * impossible at the moment. @@ -19,6 +19,7 @@ #include #include #include +#include /* * Declare the debug flags here @@ -117,6 +118,9 @@ done: return 0; } +static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; + static ctl_table debug_table[] = { { .ctl_name = CTL_RPCDEBUG, @@ -150,6 +154,28 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .ctl_name = CTL_SLOTTABLE_UDP, + .procname = "udp_slot_table_entries", + .data = &xprt_udp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .ctl_name = CTL_SLOTTABLE_TCP, + .procname = "tcp_slot_table_entries", + .data = &xprt_tcp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, { .ctl_name = 0 } }; diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/xdr.c linux-2.6.4-27-nfs4mount/net/sunrpc/xdr.c --- linux-2.6.4-pre3/net/sunrpc/xdr.c 
2004-03-10 19:31:11.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/xdr.c 2004-03-10 20:13:48.000000000 -0500 @@ -54,7 +54,7 @@ xdr_decode_netobj(u32 *p, struct xdr_net } u32 * -xdr_encode_array(u32 *p, const char *array, unsigned int len) +xdr_encode_array(u32 *p, const void *array, unsigned int len) { int quadlen = XDR_QUADLEN(len); diff -u --recursive --new-file --show-c-function linux-2.6.4-pre3/net/sunrpc/xprt.c linux-2.6.4-27-nfs4mount/net/sunrpc/xprt.c --- linux-2.6.4-pre3/net/sunrpc/xprt.c 2004-03-10 19:30:26.000000000 -0500 +++ linux-2.6.4-27-nfs4mount/net/sunrpc/xprt.c 2004-03-10 20:14:52.000000000 -0500 @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -74,6 +75,7 @@ #define XPRT_MAX_BACKOFF (8) #define XPRT_IDLE_TIMEOUT (5*60*HZ) +#define XPRT_MAX_RESVPORT (800) /* * Local functions @@ -84,7 +86,7 @@ static void xprt_disconnect(struct rpc_x static void xprt_connect_status(struct rpc_task *task); static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to); -static struct socket *xprt_create_socket(int, struct rpc_timeout *, int); +static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); static void xprt_bind_socket(struct rpc_xprt *, struct socket *); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); @@ -336,8 +338,8 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, /* The (cwnd >> 1) term makes sure * the result gets rounded properly. 
*/ cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; - if (cwnd > RPC_MAXCWND) - cwnd = RPC_MAXCWND; + if (cwnd > RPC_MAXCWND(xprt)) + cwnd = RPC_MAXCWND(xprt); __xprt_lock_write_next(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; @@ -452,17 +454,74 @@ out_abort: spin_unlock(&xprt->sock_lock); } +static void xprt_socket_connect(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int status = -EIO; + + if (xprt->shutdown) { + rpc_wake_up_status(&xprt->pending, -EIO); + return; + } + if (!xprt->addr.sin_port) + goto out_err; + + /* + * Start by resetting any existing state + */ + xprt_close(xprt); + sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + if (sock == NULL) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + goto out_err; + return; + } + xprt_bind_socket(xprt, sock); + xprt_sock_setbufsize(xprt); + + if (!xprt->stream) + goto out; + + /* + * Tell the socket layer to start connecting... + */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status >= 0) + goto out; + switch (status) { + case -EINPROGRESS: + case -EALREADY: + return; + default: + goto out_err; + } +out: + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); + return; +out_err: + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) { + xprt->snd_task->tk_status = status; + rpc_wake_up_task(xprt->snd_task); + } + spin_unlock_bh(&xprt->sock_lock); +} + /* * Attempt to connect a TCP socket. 
* */ -void -xprt_connect(struct rpc_task *task) +void xprt_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - struct socket *sock = xprt->sock; - struct sock *inet; - int status; dprintk("RPC: %4d xprt_connect xprt %p %s connected\n", task->tk_pid, xprt, (xprt_connected(xprt) ? "is" : "is not")); @@ -483,79 +542,9 @@ xprt_connect(struct rpc_task *task) if (task->tk_rqstp) task->tk_rqstp->rq_bytes_sent = 0; - /* - * We're here because the xprt was marked disconnected. - * Start by resetting any existing state. - */ - xprt_close(xprt); - if (!(sock = xprt_create_socket(xprt->prot, &xprt->timeout, xprt->resvport))) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ - task->tk_status = -EIO; - goto out_write; - } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); - - if (!xprt->stream) - goto out_write; - - inet = sock->sk; - - /* - * Tell the socket layer to start connecting... - */ - status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, - sizeof(xprt->addr), O_NONBLOCK); - dprintk("RPC: %4d connect status %d connected %d sock state %d\n", - task->tk_pid, -status, xprt_connected(xprt), inet->sk_state); - - if (status >= 0) - return; - - switch (status) { - case -EINPROGRESS: - case -EALREADY: - /* Protect against TCP socket state changes */ - lock_sock(inet); - if (inet->sk_state != TCP_ESTABLISHED) { - dprintk("RPC: %4d waiting for connection\n", - task->tk_pid); - task->tk_timeout = RPC_CONNECT_TIMEOUT; - /* if the socket is already closing, delay briefly */ - if ((1 << inet->sk_state) & - ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) - task->tk_timeout = RPC_REESTABLISH_TIMEOUT; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status, - NULL); - } - release_sock(inet); - break; - case -ECONNREFUSED: - case -ECONNRESET: - case -ENOTCONN: - if (!RPC_IS_SOFT(task)) { - rpc_delay(task, RPC_REESTABLISH_TIMEOUT); - task->tk_status = -ENOTCONN; - break; - } - default: - /* 
Report myriad other possible returns. If this file - * system is soft mounted, just error out, like Solaris. */ - if (RPC_IS_SOFT(task)) { - printk(KERN_WARNING - "RPC: error %d connecting to server %s, exiting\n", - -status, task->tk_client->cl_server); - task->tk_status = -EIO; - goto out_write; - } - printk(KERN_WARNING "RPC: error %d connecting to server %s\n", - -status, task->tk_client->cl_server); - /* This will prevent anybody else from reconnecting */ - rpc_delay(task, RPC_REESTABLISH_TIMEOUT); - task->tk_status = status; - break; - } + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + schedule_work(&xprt->sock_connect); return; out_write: xprt_release_write(xprt, task); @@ -580,6 +569,8 @@ xprt_connect_status(struct rpc_task *tas task->tk_status = -EIO; switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: case -ENOTCONN: rpc_delay(task, RPC_REESTABLISH_TIMEOUT); return; @@ -791,8 +782,6 @@ udp_data_ready(struct sock *sk, int len) dropit: skb_free_datagram(sk, skb); out: - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -1052,8 +1041,6 @@ tcp_state_change(struct sock *sk) break; } out: - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible_all(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -1093,8 +1080,6 @@ xprt_write_space(struct sock *sk) if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending) rpc_wake_up_task(xprt->snd_task); spin_unlock_bh(&xprt->sock_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); out: read_unlock(&sk->sk_callback_lock); } @@ -1313,10 +1298,9 @@ do_xprt_reserve(struct rpc_task *task) task->tk_status = 0; if (task->tk_rqstp) return; - if (xprt->free) { - struct rpc_rqst *req = xprt->free; - xprt->free = req->rq_next; - req->rq_next = NULL; + if (!list_empty(&xprt->free)) { + struct 
rpc_rqst *req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); + list_del_init(&req->rq_list); task->tk_rqstp = req; xprt_request_init(task, xprt); return; @@ -1330,22 +1314,14 @@ do_xprt_reserve(struct rpc_task *task) /* * Allocate a 'unique' XID */ -static u32 -xprt_alloc_xid(void) +static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) { - static spinlock_t xid_lock = SPIN_LOCK_UNLOCKED; - static int need_init = 1; - static u32 xid; - u32 ret; - - spin_lock(&xid_lock); - if (unlikely(need_init)) { - xid = get_seconds() << 12; - need_init = 0; - } - ret = xid++; - spin_unlock(&xid_lock); - return ret; + return xprt->xid++; +} + +static inline void xprt_init_xid(struct rpc_xprt *xprt) +{ + get_random_bytes(&xprt->xid, sizeof(xprt->xid)); } /* @@ -1359,8 +1335,7 @@ xprt_request_init(struct rpc_task *task, req->rq_timeout = xprt->timeout; req->rq_task = task; req->rq_xprt = xprt; - req->rq_xid = xprt_alloc_xid(); - INIT_LIST_HEAD(&req->rq_list); + req->rq_xid = xprt_alloc_xid(xprt); dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, req->rq_xid); } @@ -1391,9 +1366,7 @@ xprt_release(struct rpc_task *task) dprintk("RPC: %4d release request %p\n", task->tk_pid, req); spin_lock(&xprt->xprt_lock); - req->rq_next = xprt->free; - xprt->free = req; - + list_add(&req->rq_list, &xprt->free); xprt_clear_backlog(xprt); spin_unlock(&xprt->xprt_lock); } @@ -1424,6 +1397,9 @@ xprt_set_timeout(struct rpc_timeout *to, to->to_exponential = 0; } +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE << 2; + /* * Initialize an RPC client */ @@ -1431,21 +1407,33 @@ static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { struct rpc_xprt *xprt; + unsigned int entries; + size_t slot_table_size; struct rpc_rqst *req; - int i; dprintk("RPC: setting up %s transport...\n", proto == IPPROTO_UDP? "UDP" : "TCP"); + entries = (proto == IPPROTO_TCP)? 
+ xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; + if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) - return NULL; + return ERR_PTR(-ENOMEM); memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ + xprt->max_reqs = entries; + slot_table_size = entries * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) { + kfree(xprt); + return ERR_PTR(-ENOMEM); + } + memset(xprt->slot, 0, slot_table_size); xprt->addr = *ap; xprt->prot = proto; xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; if (xprt->stream) { - xprt->cwnd = RPC_MAXCWND; + xprt->cwnd = RPC_MAXCWND(xprt); xprt->nocong = 1; } else xprt->cwnd = RPC_INITCWND; @@ -1453,12 +1441,15 @@ xprt_setup(int proto, struct sockaddr_in spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); + INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; + xprt->port = XPRT_MAX_RESVPORT; /* Set timeout parameters */ if (to) { @@ -1467,21 +1458,22 @@ xprt_setup(int proto, struct sockaddr_in } else xprt_default_timeout(&xprt->timeout, xprt->prot); - INIT_RPC_WAITQ(&xprt->pending, "xprt_pending"); - INIT_RPC_WAITQ(&xprt->sending, "xprt_sending"); - INIT_RPC_WAITQ(&xprt->resend, "xprt_resend"); - INIT_RPC_WAITQ(&xprt->backlog, "xprt_backlog"); + rpc_init_wait_queue(&xprt->pending, "xprt_pending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->resend, "xprt_resend"); + rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); /* initialize free list */ - for (i = 0, req = xprt->slot; i < RPC_MAXREQS-1; i++, req++) - req->rq_next = req + 1; - req->rq_next = NULL; - xprt->free = xprt->slot; + for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; 
req--) + list_add(&req->rq_list, &xprt->free); + + xprt_init_xid(xprt); /* Check whether we want to use a reserved port */ xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - dprintk("RPC: created transport %p\n", xprt); + dprintk("RPC: created transport %p with %u slots\n", xprt, + xprt->max_reqs); return xprt; } @@ -1489,31 +1481,28 @@ xprt_setup(int proto, struct sockaddr_in /* * Bind to a reserved port */ -static inline int -xprt_bindresvport(struct socket *sock) +static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { - struct sockaddr_in myaddr; + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; int err, port; - kernel_cap_t saved_cap = current->cap_effective; - /* Override capabilities. - * They were checked in xprt_create_proto i.e. at mount time - */ - cap_raise(current->cap_effective, CAP_NET_BIND_SERVICE); - - memset(&myaddr, 0, sizeof(myaddr)); - myaddr.sin_family = AF_INET; - port = 800; + /* Were we already bound to a given port? Try to reuse it */ + port = xprt->port; do { myaddr.sin_port = htons(port); err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr)); - } while (err == -EADDRINUSE && --port > 0); - current->cap_effective = saved_cap; - - if (err < 0) - printk("RPC: Can't bind to reserved port (%d).\n", -err); + if (err == 0) { + xprt->port = port; + return 0; + } + if (--port == 0) + port = XPRT_MAX_RESVPORT; + } while (err == -EADDRINUSE && port != xprt->port); + printk("RPC: Can't bind to reserved port (%d).\n", -err); return err; } @@ -1563,11 +1552,11 @@ xprt_sock_setbufsize(struct rpc_xprt *xp return; if (xprt->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = xprt->rcvsize * RPC_MAXCONG * 2; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; } if (xprt->sndsize) { sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = xprt->sndsize * RPC_MAXCONG * 2; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; sk->sk_write_space(sk); } } @@ -1576,8 +1565,7 @@ 
xprt_sock_setbufsize(struct rpc_xprt *xp * Datastream sockets are created here, but xprt_connect will create * and connect stream sockets. */ -static struct socket * -xprt_create_socket(int proto, struct rpc_timeout *to, int resvport) +static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) { struct socket *sock; int type, err; @@ -1593,7 +1581,7 @@ xprt_create_socket(int proto, struct rpc } /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(sock) < 0) { + if (resvport && xprt_bindresvport(xprt, sock) < 0) { printk("RPC: can't bind to reserved port.\n"); goto failed; } @@ -1614,16 +1602,11 @@ xprt_create_proto(int proto, struct sock struct rpc_xprt *xprt; xprt = xprt_setup(proto, sap, to); - if (!xprt) - goto out_bad; - - dprintk("RPC: xprt_create_proto created xprt %p\n", xprt); + if (IS_ERR(xprt)) + dprintk("RPC: xprt_create_proto failed\n"); + else + dprintk("RPC: xprt_create_proto created xprt %p\n", xprt); return xprt; - out_bad: - dprintk("RPC: xprt_create_proto failed\n"); - if (xprt) - kfree(xprt); - return NULL; } /* @@ -1637,8 +1620,7 @@ xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->resend); rpc_wake_up(&xprt->pending); rpc_wake_up(&xprt->backlog); - if (waitqueue_active(&xprt->cong_wait)) - wake_up(&xprt->cong_wait); + wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); } @@ -1648,8 +1630,7 @@ xprt_shutdown(struct rpc_xprt *xprt) int xprt_clear_backlog(struct rpc_xprt *xprt) { rpc_wake_up_next(&xprt->backlog); - if (waitqueue_active(&xprt->cong_wait)) - wake_up(&xprt->cong_wait); + wake_up(&xprt->cong_wait); return 1; } @@ -1662,6 +1643,7 @@ xprt_destroy(struct rpc_xprt *xprt) dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); xprt_close(xprt); + kfree(xprt->slot); kfree(xprt); return 0;