fs/dcache.c | 48 +++ fs/locks.c | 6 fs/nfs/dir.c | 225 ++++++----------- fs/nfs/direct.c | 429 +++++++++++++++++++++++--------- fs/nfs/file.c | 50 ++- fs/nfs/inode.c | 107 +++++--- fs/nfs/nfs3proc.c | 20 - fs/nfs/nfs4proc.c | 189 ++++++-------- fs/nfs/nfs4state.c | 10 fs/nfs/proc.c | 16 - fs/nfs/read.c | 19 - fs/nfs/unlink.c | 3 fs/nfs/write.c | 48 --- include/linux/dcache.h | 18 + include/linux/fs.h | 5 include/linux/nfs_fs.h | 65 ++++ include/linux/nfs_xdr.h | 2 include/linux/sunrpc/auth.h | 2 include/linux/sunrpc/sched.h | 66 +++-- net/sunrpc/auth.c | 12 net/sunrpc/auth_gss/auth_gss.c | 46 ++- net/sunrpc/clnt.c | 2 net/sunrpc/sched.c | 538 +++++++++++++---------------------------- net/sunrpc/xprt.c | 9 24 files changed, 999 insertions(+), 936 deletions(-) diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/dcache.c linux-2.6.10-rc1-NFS_ALL/fs/dcache.c --- linux-2.6.10-rc1/fs/dcache.c 2004-10-27 14:10:18.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/dcache.c 2004-10-27 19:08:34.000000000 -0400 @@ -780,6 +780,54 @@ void d_instantiate(struct dentry *entry, } /** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. 
+ */ +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + BUG_ON(!list_empty(&entry->d_alias)); + spin_lock(&dcache_lock); + if (!inode) + goto do_negative; + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct qstr *qstr = &alias->d_name; + + if (qstr->hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (qstr->len != len) + continue; + if (memcmp(qstr->name, name, len)) + continue; + dget_locked(alias); + spin_unlock(&dcache_lock); + BUG_ON(!d_unhashed(alias)); + return alias; + } + list_add(&entry->d_alias, &inode->i_dentry); +do_negative: + entry->d_inode = inode; + spin_unlock(&dcache_lock); + security_d_instantiate(entry, inode); + return NULL; +} +EXPORT_SYMBOL(d_instantiate_unique); + +/** * d_alloc_root - allocate root dentry * @root_inode: inode to allocate the root for * diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/locks.c linux-2.6.10-rc1-NFS_ALL/fs/locks.c --- linux-2.6.10-rc1/fs/locks.c 2004-10-27 14:11:24.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/locks.c 2004-10-27 19:08:34.000000000 -0400 @@ -1563,9 +1563,6 @@ int fcntl_getlk(struct file *filp, struc error = filp->f_op->lock(filp, F_GETLK, &file_lock); if (error < 0) goto out; - else if (error == LOCK_USE_CLNT) - /* Bypass for NFS with no locking - 2.0.36 compat */ - fl = posix_test_lock(filp, &file_lock); else fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock); } else { @@ -1708,9 +1705,6 @@ int fcntl_getlk64(struct file *filp, str error = filp->f_op->lock(filp, F_GETLK, &file_lock); if (error < 0) goto out; - else if (error == LOCK_USE_CLNT) - /* Bypass for NFS with no locking - 2.0.36 compat */ - fl = posix_test_lock(filp, &file_lock); else fl = (file_lock.fl_type == F_UNLCK ? 
NULL : &file_lock); } else { diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/dir.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/dir.c --- linux-2.6.10-rc1/fs/nfs/dir.c 2004-10-27 14:10:34.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/dir.c 2004-10-27 19:08:34.000000000 -0400 @@ -40,8 +40,6 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_cached_lookup(struct inode *, struct dentry *, - struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); @@ -294,24 +292,13 @@ int readdir_search_pagecache(nfs_readdir return res; } -static unsigned int nfs_type2dtype[] = { - DT_UNKNOWN, - DT_REG, - DT_DIR, - DT_BLK, - DT_CHR, - DT_LNK, - DT_SOCK, - DT_UNKNOWN, - DT_FIFO -}; - -static inline -unsigned int nfs_type_to_d_type(enum nfs_ftype type) +static inline unsigned int dt_type(struct inode *inode) { - return nfs_type2dtype[type]; + return (inode->i_mode >> 12) & 15; } +static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); + /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ @@ -321,6 +308,7 @@ int nfs_do_filldir(nfs_readdir_descripto { struct file *file = desc->file; struct nfs_entry *entry = desc->entry; + struct dentry *dentry = NULL; unsigned long fileid; int loop_count = 0, res; @@ -333,9 +321,16 @@ int nfs_do_filldir(nfs_readdir_descripto * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); + /* Get a dentry if we have one */ + if (dentry != NULL) + dput(dentry); + dentry = nfs_readdir_lookup(desc); + /* Use readdirplus info */ - if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) - d_type = nfs_type_to_d_type(entry->fattr->type); + if (dentry != NULL && dentry->d_inode != NULL) { + d_type = dt_type(dentry->d_inode); + fileid = dentry->d_inode->i_ino; + } res = filldir(dirent, entry->name, entry->len, entry->prev_cookie, fileid, d_type); @@ -352,7 +347,8 @@ int nfs_do_filldir(nfs_readdir_descripto } } dir_page_release(desc); - + if (dentry != NULL) + dput(dentry); dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res); return res; } @@ -615,24 +611,10 @@ static int nfs_lookup_revalidate(struct goto out_valid; } - /* - * Note: we're not holding inode->i_sem and so may be racing with - * operations that change the directory. We therefore save the - * change attribute *before* we do the RPC call. 
- */ - verifier = nfs_save_change_attribute(dir); - error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); - if (!error) { - if (nfs_compare_fh(NFS_FH(inode), &fhandle)) - goto out_bad; - if (nfs_lookup_verify_inode(inode, isopen)) - goto out_zap_parent; - goto out_valid_renew; - } - if (NFS_STALE(inode)) goto out_bad; + verifier = nfs_save_change_attribute(dir); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error) goto out_bad; @@ -641,7 +623,6 @@ static int nfs_lookup_revalidate(struct if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; - out_valid_renew: nfs_renew_times(dentry); nfs_set_verifier(dentry, verifier); out_valid: @@ -723,6 +704,7 @@ int nfs_is_exclusive_create(struct inode static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *res; struct inode *inode = NULL; int error; struct nfs_fh fhandle; @@ -731,11 +713,11 @@ static struct dentry *nfs_lookup(struct dfprintk(VFS, "NFS: lookup(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - error = -ENAMETOOLONG; + res = ERR_PTR(-ENAMETOOLONG); if (dentry->d_name.len > NFS_SERVER(dir)->namelen) goto out; - error = -ENOMEM; + res = ERR_PTR(-ENOMEM); dentry->d_op = NFS_PROTO(dir)->dentry_ops; lock_kernel(); @@ -746,29 +728,27 @@ static struct dentry *nfs_lookup(struct if (nfs_is_exclusive_create(dir, nd)) goto no_entry; - error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); - if (error != 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, - &fhandle, &fattr); - if (error == -ENOENT) - goto no_entry; - if (error != 0) - goto out_unlock; + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + if (error == -ENOENT) + goto no_entry; + if (error < 0) { + res = ERR_PTR(error); + goto out_unlock; } - error = -EACCES; + res = ERR_PTR(-EACCES); inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); if (!inode) goto out_unlock; no_entry: - error = 0; - d_add(dentry, 
inode); + res = d_add_unique(dentry, inode); + if (res != NULL) + dentry = res; nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unlock: unlock_kernel(); out: - BUG_ON(error > 0); - return ERR_PTR(error); + return res; } #ifdef CONFIG_NFS_V4 @@ -798,15 +778,15 @@ static int is_atomic_open(struct inode * static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { + struct dentry *res = NULL; struct inode *inode = NULL; - int error = 0; /* Check that we are indeed trying to open this file */ if (!is_atomic_open(dir, nd)) goto no_open; if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { - error = -ENAMETOOLONG; + res = ERR_PTR(-ENAMETOOLONG); goto out; } dentry->d_op = NFS_PROTO(dir)->dentry_ops; @@ -828,7 +808,7 @@ static struct dentry *nfs_atomic_lookup( inode = nfs4_atomic_open(dir, dentry, nd); unlock_kernel(); if (IS_ERR(inode)) { - error = PTR_ERR(inode); + int error = PTR_ERR(inode); switch (error) { /* Make a negative dentry */ case -ENOENT: @@ -841,16 +821,18 @@ static struct dentry *nfs_atomic_lookup( /* case -EISDIR: */ /* case -EINVAL: */ default: + res = ERR_PTR(error); goto out; } } no_entry: - d_add(dentry, inode); + res = d_add_unique(dentry, inode); + if (res != NULL) + dentry = res; nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out: - BUG_ON(error > 0); - return ERR_PTR(error); + return res; no_open: return nfs_lookup(dir, dentry, nd); } @@ -906,83 +888,51 @@ no_open: } #endif /* CONFIG_NFSV4 */ -static inline -int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) +static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) { + struct dentry *parent = desc->file->f_dentry; + struct inode *dir = parent->d_inode; struct nfs_entry *entry = desc->entry; - int status; - - while((status = dir_decode(desc)) == 0) { - if (entry->len != dentry->d_name.len) - continue; - if 
(memcmp(entry->name, dentry->d_name.name, entry->len)) - continue; - if (!(entry->fattr->valid & NFS_ATTR_FATTR)) - continue; - break; - } - return status; -} - -/* - * Use the cached Readdirplus results in order to avoid a LOOKUP call - * whenever we believe that the parent directory has not changed. - * - * We assume that any file creation/rename changes the directory mtime. - * As this results in a page cache invalidation whenever it occurs, - * we don't require any other tests for cache coherency. - */ -static -int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, - struct nfs_fh *fh, struct nfs_fattr *fattr) -{ - nfs_readdir_descriptor_t desc; - struct nfs_server *server; - struct nfs_entry entry; - struct page *page; - unsigned long timestamp; - int res; - - if (!NFS_USE_READDIRPLUS(dir)) - return -ENOENT; - server = NFS_SERVER(dir); - /* Don't use readdirplus unless the cache is stable */ - if ((server->flags & NFS_MOUNT_NOAC) != 0 - || nfs_caches_unstable(dir) - || nfs_attribute_timeout(dir)) - return -ENOENT; - if ((NFS_FLAGS(dir) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) != 0) - return -ENOENT; - timestamp = NFS_I(dir)->readdir_timestamp; - - entry.fh = fh; - entry.fattr = fattr; - - desc.decode = NFS_PROTO(dir)->decode_dirent; - desc.entry = &entry; - desc.page_index = 0; - desc.plus = 1; - - for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) { - - res = -EIO; - if (PageUptodate(page)) { - void * kaddr = kmap_atomic(page, KM_USER0); - desc.ptr = kaddr; - res = find_dirent_name(&desc, page, dentry); - kunmap_atomic(kaddr, KM_USER0); - } - page_cache_release(page); + struct dentry *dentry, *alias; + struct qstr name = { + .name = entry->name, + .len = entry->len, + }; + struct inode *inode; - if (res == 0) - goto out_found; - if (res != -EAGAIN) + switch (name.len) { + case 2: + if (name.name[0] == '.' 
&& name.name[1] == '.') + return dget_parent(parent); break; + case 1: + if (name.name[0] == '.') + return dget(parent); + } + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry != NULL) + return dentry; + if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) + return NULL; + /* Note: caller is already holding the dir->i_sem! */ + dentry = d_alloc(parent, &name); + if (dentry == NULL) + return NULL; + dentry->d_op = NFS_PROTO(dir)->dentry_ops; + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + if (!inode) { + dput(dentry); + return NULL; } - return -ENOENT; - out_found: - fattr->timestamp = timestamp; - return 0; + alias = d_add_unique(dentry, inode); + if (alias != NULL) { + dput(dentry); + dentry = alias; + } + nfs_renew_times(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return dentry; } /* @@ -1045,15 +995,9 @@ static int nfs_create(struct inode *dir, if (nd && (nd->flags & LOOKUP_CREATE)) open_flags = nd->intent.open.flags; - /* - * The 0 argument passed into the create function should one day - * contain the O_EXCL flag if requested. This allows NFSv3 to - * select the appropriate create strategy. Currently open_namei - * does not pass the create flags. 
- */ lock_kernel(); nfs_begin_data_update(dir); - inode = NFS_PROTO(dir)->create(dir, &dentry->d_name, &attr, open_flags); + inode = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); nfs_end_data_update(dir); if (!IS_ERR(inode)) { d_instantiate(dentry, inode); @@ -1438,7 +1382,7 @@ static int nfs_rename(struct inode *old_ goto go_ahead; if (S_ISDIR(new_inode->i_mode)) goto out; - else if (atomic_read(&new_dentry->d_count) > 1) { + else if (atomic_read(&new_dentry->d_count) > 2) { int err; /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, @@ -1453,10 +1397,8 @@ static int nfs_rename(struct inode *old_ new_inode = NULL; /* instantiate the replacement target */ d_instantiate(new_dentry, NULL); - } - + } else if (atomic_read(&new_dentry->d_count) > 1) { /* dentry still busy? */ - if (atomic_read(&new_dentry->d_count) > 1) { #ifdef NFS_PARANOIA printk("nfs_rename: target %s/%s busy, d_count=%d\n", new_dentry->d_parent->d_name.name, @@ -1510,7 +1452,7 @@ int nfs_access_get_cached(struct inode * if (cache->cred != cred || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) - || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) + || (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS)) return -ENOENT; memcpy(res, cache, sizeof(*res)); return 0; @@ -1524,6 +1466,7 @@ void nfs_access_add_cache(struct inode * if (cache->cred) put_rpccred(cache->cred); cache->cred = get_rpccred(set->cred); + NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS; } cache->jiffies = set->jiffies; cache->mask = set->mask; diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/direct.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/direct.c --- linux-2.6.10-rc1/fs/nfs/direct.c 2004-10-27 14:11:23.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/direct.c 2004-10-27 19:08:34.000000000 -0400 @@ -33,6 +33,7 @@ * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy * 08 Jun 2003 Port to 2.5 APIs --cel * 31 Mar 2004 Handle direct I/O without VFS support --cel + * 15 Sep 2004 
Parallel async reads --cel * */ @@ -43,6 +44,7 @@ #include #include #include +#include #include #include @@ -50,11 +52,27 @@ #include #include +#include #define NFSDBG_FACILITY NFSDBG_VFS -#define VERF_SIZE (2 * sizeof(__u32)) #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) +static kmem_cache_t *nfs_direct_cachep; + +/* + * This represents a set of asynchronous requests that we're waiting on + */ +struct nfs_direct_req { + struct kref kref; /* release manager */ + struct list_head list; /* nfs_read_data structs */ + wait_queue_head_t wait; /* wait for i/o completion */ + struct page ** pages; /* pages in our buffer */ + unsigned int npages; /* count of pages */ + atomic_t complete, /* i/os we're waiting for */ + count, /* bytes actually processed */ + error; /* any reported error */ +}; + /** * nfs_get_user_pages - find and set up pages underlying user's buffer @@ -71,7 +89,8 @@ nfs_get_user_pages(int rw, unsigned long unsigned long page_count; size_t array_size; - /* set an arbitrary limit to prevent arithmetic overflow */ + /* set an arbitrary limit to prevent type overflow */ + /* XXX: this can probably be as large as INT_MAX */ if (size > MAX_DIRECTIO_SIZE) return -EFBIG; @@ -93,6 +112,8 @@ nfs_get_user_pages(int rw, unsigned long /** * nfs_free_user_pages - tear down page struct array * @pages: array of page struct pointers underlying target buffer + * @npages: number of pages in the array + * @do_dirty: dirty the pages as we release them */ static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty) @@ -107,77 +128,231 @@ nfs_free_user_pages(struct page **pages, } /** - * nfs_direct_read_seg - Read in one iov segment. Generate separate - * read RPCs for each "rsize" bytes. 
+ * nfs_direct_req_release - release nfs_direct_req structure for direct read + * @kref: kref object embedded in an nfs_direct_req structure + * + */ +static void nfs_direct_req_release(struct kref *kref) +{ + struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + kmem_cache_free(nfs_direct_cachep, dreq); +} + +/** + * nfs_direct_read_alloc - allocate nfs_read_data structures for direct read + * @nbytes: count of bytes for the read request + * @rsize: local rsize setting + * + * Note we also set the number of requests we have in the dreq when we are + * done. This prevents races with I/O completion so we will always wait + * until all requests have been dispatched and completed. + */ +static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int rsize) +{ + struct list_head *list; + struct nfs_direct_req *dreq; + unsigned int reads = 0; + + dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); + if (!dreq) + return NULL; + + kref_init(&dreq->kref); + init_waitqueue_head(&dreq->wait); + INIT_LIST_HEAD(&dreq->list); + atomic_set(&dreq->count, 0); + atomic_set(&dreq->error, 0); + + list = &dreq->list; + for(;;) { + struct nfs_read_data *data = nfs_readdata_alloc(); + + if (unlikely(!data)) { + while (!list_empty(list)) { + data = list_entry(list->next, + struct nfs_read_data, pages); + list_del(&data->pages); + nfs_readdata_free(data); + } + kref_put(&dreq->kref, nfs_direct_req_release); + return NULL; + } + + INIT_LIST_HEAD(&data->pages); + list_add(&data->pages, list); + + data->req = (struct nfs_page *) dreq; + reads++; + if (nbytes <= rsize) + break; + nbytes -= rsize; + } + kref_get(&dreq->kref); + atomic_set(&dreq->complete, reads); + return dreq; +} + +/** + * nfs_direct_read_result - handle a read reply for a direct read request + * @data: address of NFS READ operation control block + * @status: status of this NFS READ operation + * + * We must hold a reference to all the pages in this direct read request + * until 
the RPCs complete. This could be long *after* we are woken up in + * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server). + */ +static void nfs_direct_read_result(struct nfs_read_data *data, int status) +{ + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + + if (likely(status >= 0)) + atomic_add(data->res.count, &dreq->count); + else + atomic_set(&dreq->error, status); + + if (unlikely(atomic_dec_and_test(&dreq->complete))) { + nfs_free_user_pages(dreq->pages, dreq->npages, 1); + wake_up(&dreq->wait); + kref_put(&dreq->kref, nfs_direct_req_release); + } +} + +/** + * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read + * @dreq: address of nfs_direct_req struct for this request * @inode: target inode * @ctx: target file open context - * user_addr: starting address of this segment of user's buffer - * count: size of this segment - * file_offset: offset in file to begin the operation - * @pages: array of addresses of page structs defining user's buffer - * nr_pages: size of pages array + * @user_addr: starting address of this segment of user's buffer + * @count: size of this segment + * @file_offset: offset in file to begin the operation + * + * For each nfs_read_data struct that was allocated on the list, dispatch + * an NFS READ operation */ -static int -nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx, - unsigned long user_addr, size_t count, loff_t file_offset, - struct page **pages, int nr_pages) -{ - const unsigned int rsize = NFS_SERVER(inode)->rsize; - int tot_bytes = 0; - int curpage = 0; - struct nfs_read_data rdata = { - .inode = inode, - .cred = ctx->cred, - .args = { - .fh = NFS_FH(inode), - .context = ctx, - }, - .res = { - .fattr = &rdata.fattr, - }, - }; +static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, + struct inode *inode, struct nfs_open_context *ctx, + unsigned long user_addr, size_t count, loff_t file_offset) +{ + struct list_head *list = 
&dreq->list; + struct page **pages = dreq->pages; + unsigned int curpage, pgbase; + unsigned int rsize = NFS_SERVER(inode)->rsize; - rdata.args.pgbase = user_addr & ~PAGE_MASK; - rdata.args.offset = file_offset; - do { - int result; - - rdata.args.count = count; - if (rdata.args.count > rsize) - rdata.args.count = rsize; - rdata.args.pages = &pages[curpage]; - - dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", - rdata.args.count, (long long) rdata.args.offset, - user_addr + tot_bytes, rdata.args.pgbase, curpage); + curpage = 0; + pgbase = user_addr & ~PAGE_MASK; + do { + struct nfs_read_data *data; + unsigned int bytes; + + bytes = rsize; + if (count < rsize) + bytes = count; + + data = list_entry(list->next, struct nfs_read_data, pages); + list_del_init(&data->pages); + + data->inode = inode; + data->cred = ctx->cred; + data->args.fh = NFS_FH(inode); + data->args.context = ctx; + data->args.offset = file_offset; + data->args.pgbase = pgbase; + data->args.pages = &pages[curpage]; + data->args.count = bytes; + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; + + NFS_PROTO(inode)->read_setup(data); + + data->task.tk_cookie = (unsigned long) inode; + data->task.tk_calldata = data; + data->task.tk_release = nfs_readdata_release; + data->complete = nfs_direct_read_result; lock_kernel(); - result = NFS_PROTO(inode)->read(&rdata); + rpc_execute(&data->task); unlock_kernel(); - if (result <= 0) { - if (tot_bytes > 0) - break; - if (result == -EISDIR) - result = -EINVAL; - return result; - } + dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + bytes, + (unsigned long long)data->args.offset); + + file_offset += bytes; + pgbase += bytes; + curpage += pgbase >> PAGE_SHIFT; + pgbase &= ~PAGE_MASK; - tot_bytes += result; - if (rdata.res.eof) - break; - - rdata.args.offset += result; - rdata.args.pgbase += result; - 
curpage += rdata.args.pgbase >> PAGE_SHIFT; - rdata.args.pgbase &= ~PAGE_MASK; - count -= result; + count -= bytes; } while (count != 0); +} - /* XXX: should we zero the rest of the user's buffer if we - * hit eof? */ +/** + * nfs_direct_read_wait - wait for I/O completion for direct reads + * @dreq: request on which we are to wait + * @intr: whether or not this wait can be interrupted + * + * Collects and returns the final error value/byte-count. + */ +static ssize_t nfs_direct_read_wait(struct nfs_direct_req *dreq, int intr) +{ + int result = 0; - return tot_bytes; + if (intr) { + result = wait_event_interruptible(dreq->wait, + (atomic_read(&dreq->complete) == 0)); + } else { + wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0)); + } + + if (!result) + result = atomic_read(&dreq->error); + if (!result) + result = atomic_read(&dreq->count); + + kref_put(&dreq->kref, nfs_direct_req_release); + return (ssize_t) result; +} + +/** + * nfs_direct_read_seg - Read in one iov segment. Generate separate + * read RPCs for each "rsize" bytes. 
+ * @inode: target inode + * @ctx: target file open context + * @user_addr: starting address of this segment of user's buffer + * @count: size of this segment + * @file_offset: offset in file to begin the operation + * @pages: array of addresses of page structs defining user's buffer + * @nr_pages: number of pages in the array + * + */ +static ssize_t nfs_direct_read_seg(struct inode *inode, + struct nfs_open_context *ctx, unsigned long user_addr, + size_t count, loff_t file_offset, struct page **pages, + unsigned int nr_pages) +{ + ssize_t result; + sigset_t oldset; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_direct_req *dreq; + + dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize); + if (!dreq) + return -ENOMEM; + + dreq->pages = pages; + dreq->npages = nr_pages; + + rpc_clnt_sigmask(clnt, &oldset); + nfs_direct_read_schedule(dreq, inode, ctx, user_addr, count, + file_offset); + result = nfs_direct_read_wait(dreq, clnt->cl_intr); + rpc_clnt_sigunmask(clnt, &oldset); + + return result; } /** @@ -189,9 +364,8 @@ nfs_direct_read_seg(struct inode *inode, * file_offset: offset in file to begin the operation * nr_segs: size of iovec array * - * generic_file_direct_IO has already pushed out any non-direct - * writes so that this read will see them when we read from the - * server. + * We've already pushed out any non-direct writes so that this read + * will see them when we read from the server. 
*/ static ssize_t nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx, @@ -220,8 +394,6 @@ nfs_direct_read(struct inode *inode, str result = nfs_direct_read_seg(inode, ctx, user_addr, size, file_offset, pages, page_count); - nfs_free_user_pages(pages, page_count, 1); - if (result <= 0) { if (tot_bytes > 0) break; @@ -247,31 +419,31 @@ nfs_direct_read(struct inode *inode, str * @pages: array of addresses of page structs defining user's buffer * nr_pages: size of pages array */ -static int -nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx, - unsigned long user_addr, size_t count, loff_t file_offset, - struct page **pages, int nr_pages) +static ssize_t nfs_direct_write_seg(struct inode *inode, + struct nfs_open_context *ctx, unsigned long user_addr, + size_t count, loff_t file_offset, struct page **pages, + int nr_pages) { const unsigned int wsize = NFS_SERVER(inode)->wsize; size_t request; - int curpage, need_commit, result, tot_bytes; + int curpage, need_commit; + ssize_t result, tot_bytes; struct nfs_writeverf first_verf; - struct nfs_write_data wdata = { - .inode = inode, - .cred = ctx->cred, - .args = { - .fh = NFS_FH(inode), - .context = ctx, - }, - .res = { - .fattr = &wdata.fattr, - .verf = &wdata.verf, - }, - }; + struct nfs_write_data *wdata; - wdata.args.stable = NFS_UNSTABLE; + wdata = nfs_writedata_alloc(); + if (!wdata) + return -ENOMEM; + + wdata->inode = inode; + wdata->cred = ctx->cred; + wdata->args.fh = NFS_FH(inode); + wdata->args.context = ctx; + wdata->args.stable = NFS_UNSTABLE; if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize) - wdata.args.stable = NFS_FILE_SYNC; + wdata->args.stable = NFS_FILE_SYNC; + wdata->res.fattr = &wdata->fattr; + wdata->res.verf = &wdata->verf; nfs_begin_data_update(inode); retry: @@ -279,20 +451,20 @@ retry: tot_bytes = 0; curpage = 0; request = count; - wdata.args.pgbase = user_addr & ~PAGE_MASK; - wdata.args.offset = file_offset; - do { - wdata.args.count = 
request; - if (wdata.args.count > wsize) - wdata.args.count = wsize; - wdata.args.pages = &pages[curpage]; + wdata->args.pgbase = user_addr & ~PAGE_MASK; + wdata->args.offset = file_offset; + do { + wdata->args.count = request; + if (wdata->args.count > wsize) + wdata->args.count = wsize; + wdata->args.pages = &pages[curpage]; dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", - wdata.args.count, (long long) wdata.args.offset, - user_addr + tot_bytes, wdata.args.pgbase, curpage); + wdata->args.count, (long long) wdata->args.offset, + user_addr + tot_bytes, wdata->args.pgbase, curpage); lock_kernel(); - result = NFS_PROTO(inode)->write(&wdata); + result = NFS_PROTO(inode)->write(wdata); unlock_kernel(); if (result <= 0) { @@ -302,20 +474,25 @@ retry: } if (tot_bytes == 0) - memcpy(&first_verf.verifier, &wdata.verf.verifier, - VERF_SIZE); - if (wdata.verf.committed != NFS_FILE_SYNC) { + memcpy(&first_verf.verifier, &wdata->verf.verifier, + sizeof(first_verf.verifier)); + if (wdata->verf.committed != NFS_FILE_SYNC) { need_commit = 1; - if (memcmp(&first_verf.verifier, - &wdata.verf.verifier, VERF_SIZE)) + if (memcmp(&first_verf.verifier, &wdata->verf.verifier, + sizeof(first_verf.verifier))) goto sync_retry; } - tot_bytes += result; - wdata.args.offset += result; - wdata.args.pgbase += result; - curpage += wdata.args.pgbase >> PAGE_SHIFT; - wdata.args.pgbase &= ~PAGE_MASK; + tot_bytes += result; + + /* in case of a short write: stop now, let the app recover */ + if (result < wdata->args.count) + break; + + wdata->args.offset += result; + wdata->args.pgbase += result; + curpage += wdata->args.pgbase >> PAGE_SHIFT; + wdata->args.pgbase &= ~PAGE_MASK; request -= result; } while (request != 0); @@ -323,27 +500,27 @@ retry: * Commit data written so far, even in the event of an error */ if (need_commit) { - wdata.args.count = tot_bytes; - wdata.args.offset = file_offset; + wdata->args.count = tot_bytes; + wdata->args.offset = file_offset; lock_kernel(); - 
result = NFS_PROTO(inode)->commit(&wdata); + result = NFS_PROTO(inode)->commit(wdata); unlock_kernel(); if (result < 0 || memcmp(&first_verf.verifier, - &wdata.verf.verifier, - VERF_SIZE) != 0) + &wdata->verf.verifier, + sizeof(first_verf.verifier)) != 0) goto sync_retry; } result = tot_bytes; out: nfs_end_data_update_defer(inode); - + nfs_writedata_free(wdata); return result; sync_retry: - wdata.args.stable = NFS_FILE_SYNC; + wdata->args.stable = NFS_FILE_SYNC; goto retry; } @@ -360,9 +537,9 @@ sync_retry: * that non-direct readers might access, so they will pick up these * writes immediately. */ -static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx, - const struct iovec *iov, loff_t file_offset, - unsigned long nr_segs) +static ssize_t nfs_direct_write(struct inode *inode, + struct nfs_open_context *ctx, const struct iovec *iov, + loff_t file_offset, unsigned long nr_segs) { ssize_t tot_bytes = 0; unsigned long seg = 0; @@ -502,6 +679,8 @@ nfs_file_direct_read(struct kiocb *iocb, if (mapping->nrpages) { retval = filemap_fdatawrite(mapping); if (retval == 0) + retval = nfs_wb_all(inode); + if (retval == 0) retval = filemap_fdatawait(mapping); if (retval) goto out; @@ -591,6 +770,8 @@ nfs_file_direct_write(struct kiocb *iocb if (mapping->nrpages) { retval = filemap_fdatawrite(mapping); if (retval == 0) + retval = nfs_wb_all(inode); + if (retval == 0) retval = filemap_fdatawait(mapping); if (retval) goto out; @@ -605,3 +786,21 @@ nfs_file_direct_write(struct kiocb *iocb out: return retval; } + +int nfs_init_directcache(void) +{ + nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", + sizeof(struct nfs_direct_req), + 0, SLAB_RECLAIM_ACCOUNT, + NULL, NULL); + if (nfs_direct_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_directcache(void) +{ + if (kmem_cache_destroy(nfs_direct_cachep)) + printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); +} diff -u --recursive --new-file --show-c-function 
linux-2.6.10-rc1/fs/nfs/file.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/file.c --- linux-2.6.10-rc1/fs/nfs/file.c 2004-10-27 14:10:59.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/file.c 2004-10-27 19:08:34.000000000 -0400 @@ -295,10 +295,19 @@ out_swapfile: static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; - int status; + int status = 0; lock_kernel(); - status = NFS_PROTO(inode)->lock(filp, cmd, fl); + /* Use local locking if mounted with "-onolock" */ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else { + struct file_lock *cfl = posix_test_lock(filp, fl); + /* F_GETLK result: F_UNLCK if no conflict, else the conflicting lock */ + fl->fl_type = F_UNLCK; + if (cfl != NULL) + memcpy(fl, cfl, sizeof(*fl)); + } unlock_kernel(); return status; } @@ -325,7 +334,11 @@ static int do_unlk(struct file *filp, in * still need to complete the unlock. */ lock_kernel(); - status = NFS_PROTO(inode)->lock(filp, cmd, fl); + /* Use local locking if mounted with "-onolock" */ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + else + status = posix_lock_file_wait(filp, fl); rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset); return status; } @@ -351,15 +364,19 @@ static int do_setlk(struct file *filp, i return status; lock_kernel(); - status = NFS_PROTO(inode)->lock(filp, cmd, fl); - /* If we were signalled we still need to ensure that - * we clean up any state on the server. We therefore - * record the lock call as having succeeded in order to - * ensure that locks_remove_posix() cleans it out when - * the process exits. - */ - if (status == -EINTR || status == -ERESTARTSYS) - posix_lock_file(filp, fl); + /* Use local locking if mounted with "-onolock" */ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { + status = NFS_PROTO(inode)->lock(filp, cmd, fl); + /* If we were signalled we still need to ensure that + * we clean up any state on the server. 
We therefore + * record the lock call as having succeeded in order to + * ensure that locks_remove_posix() cleans it out when + * the process exits. + */ + if (status == -EINTR || status == -ERESTARTSYS) + posix_lock_file(filp, fl); + } else + status = posix_lock_file_wait(filp, fl); unlock_kernel(); if (status < 0) return status; @@ -396,15 +413,6 @@ nfs_lock(struct file *filp, int cmd, str if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) return -ENOLCK; - if (NFS_PROTO(inode)->version != 4) { - /* Fake OK code if mounted without NLM support */ - if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) { - if (IS_GETLK(cmd)) - return LOCK_USE_CLNT; - return 0; - } - } - /* * No BSD flocks over NFS allowed. * Note: we could try to fake a POSIX lock request here by diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/inode.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/inode.c --- linux-2.6.10-rc1/fs/nfs/inode.c 2004-10-27 14:11:07.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/inode.c 2004-10-27 19:08:34.000000000 -0400 @@ -486,13 +486,27 @@ nfs_statfs(struct super_block *sb, struc if (error < 0) goto out_err; - buf->f_frsize = server->wtmult; + /* + * Current versions of glibc do not correctly handle the + * case where f_frsize != f_bsize. Eventually we want to + * report the value of wtmult in this field. + */ + buf->f_frsize = sb->s_blocksize; + + /* + * On most *nix systems, f_blocks, f_bfree, and f_bavail + * are reported in units of f_frsize. Linux hasn't had + * an f_frsize field in its statfs struct until recently, + * thus historically Linux's sys_statfs reports these + * fields in units of f_bsize. 
+ */ buf->f_bsize = sb->s_blocksize; blockbits = sb->s_blocksize_bits; blockres = (1 << blockbits) - 1; buf->f_blocks = (res.tbytes + blockres) >> blockbits; buf->f_bfree = (res.fbytes + blockres) >> blockbits; buf->f_bavail = (res.abytes + blockres) >> blockbits; + buf->f_files = res.tfiles; buf->f_ffree = res.afiles; @@ -565,9 +579,9 @@ nfs_zap_caches(struct inode *inode) memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) - nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; else - nfsi->flags |= NFS_INO_INVALID_ATTR; + nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; } /* @@ -605,7 +619,7 @@ nfs_find_actor(struct inode *inode, void return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; - if (is_bad_inode(inode)) + if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; return 1; } @@ -766,13 +780,8 @@ nfs_setattr(struct dentry *dentry, struc vmtruncate(inode, attr->ia_size); } } - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; - if (*cred) { - put_rpccred(*cred); - *cred = NULL; - } - } + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS; nfs_end_data_update(inode); unlock_kernel(); return error; @@ -949,14 +958,14 @@ __nfs_revalidate_inode(struct nfs_server lock_kernel(); if (!inode || is_bad_inode(inode)) goto out_nowait; - if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) + if (NFS_STALE(inode)) goto out_nowait; while (NFS_REVALIDATING(inode)) { status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING); if (status < 0) goto out_nowait; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOAC) + if (NFS_ATTRTIMEO(inode) == 0) continue; if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) continue; @@ -968,14 +977,14 @@ 
__nfs_revalidate_inode(struct nfs_server /* Protect against RPC races by saving the change attribute */ verifier = nfs_save_change_attribute(inode); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); - if (status) { + if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); if (status == -ESTALE) { - NFS_FLAGS(inode) |= NFS_INO_STALE; - if (inode != inode->i_sb->s_root->d_inode) - remove_inode_hash(inode); + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + NFS_FLAGS(inode) |= NFS_INO_STALE; } goto out; } @@ -1014,7 +1023,6 @@ __nfs_revalidate_inode(struct nfs_server inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - NFS_FLAGS(inode) &= ~NFS_INO_STALE; out: NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; wake_up(&nfsi->nfs_i_wait); @@ -1161,7 +1169,7 @@ int nfs_refresh_inode(struct inode *inod if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || inode->i_uid != fattr->uid || inode->i_gid != fattr->gid) - nfsi->flags |= NFS_INO_INVALID_ATTR; + nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; /* Has the link count changed? 
*/ if (inode->i_nlink != fattr->nlink) @@ -1270,7 +1278,7 @@ static int nfs_update_inode(struct inode #endif nfsi->change_attr = fattr->change_attr; if (!data_unstable) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; } memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); @@ -1278,14 +1286,8 @@ static int nfs_update_inode(struct inode if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) { - struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; - if (*cred) { - put_rpccred(*cred); - *cred = NULL; - } - invalid |= NFS_INO_INVALID_ATTR; - } + inode->i_gid != fattr->gid) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; @@ -1335,7 +1337,8 @@ static int nfs_update_inode(struct inode */ nfs_invalidate_inode(inode); out_err: - return -EIO; + NFS_FLAGS(inode) |= NFS_INO_STALE; + return -ESTALE; } /* @@ -1449,8 +1452,6 @@ static void nfs_kill_super(struct super_ kill_anon_super(s); - nfs4_renewd_prepare_shutdown(server); - if (server->client != NULL && !IS_ERR(server->client)) rpc_shutdown_client(server->client); if (server->client_sys != NULL && !IS_ERR(server->client_sys)) @@ -1461,8 +1462,6 @@ static void nfs_kill_super(struct super_ rpciod_down(); /* release rpciod */ - destroy_nfsv4_state(server); - if (server->hostname != NULL) kfree(server->hostname); kfree(server); @@ -1543,9 +1542,6 @@ static int nfs4_fill_super(struct super_ server->wsize = nfs_block_size(data->wsize, NULL); server->flags = data->flags & NFS_MOUNT_FLAGMASK; - /* NFSv4 doesn't use NLM locking */ - server->flags |= NFS_MOUNT_NONLM; - server->acregmin = data->acregmin*HZ; server->acregmax = data->acregmax*HZ; server->acdirmin = data->acdirmin*HZ; @@ -1790,8 +1786,22 @@ out_free: static void nfs4_kill_super(struct super_block *sb) { + struct nfs_server *server = 
NFS_SB(sb); + nfs_return_all_delegations(sb); - nfs_kill_super(sb); + kill_anon_super(sb); + + nfs4_renewd_prepare_shutdown(server); + + if (server->client != NULL && !IS_ERR(server->client)) + rpc_shutdown_client(server->client); + rpciod_down(); /* release rpciod */ + + destroy_nfsv4_state(server); + + if (server->hostname != NULL) + kfree(server->hostname); + kfree(server); } static struct file_system_type nfs4_fs_type = { @@ -1821,9 +1831,13 @@ static struct file_system_type nfs4_fs_t extern int nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int nfs_init_readpagecache(void); -extern int nfs_destroy_readpagecache(void); +extern void nfs_destroy_readpagecache(void); extern int nfs_init_writepagecache(void); -extern int nfs_destroy_writepagecache(void); +extern void nfs_destroy_writepagecache(void); +#ifdef CONFIG_NFS_DIRECTIO +extern int nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); +#endif static kmem_cache_t * nfs_inode_cachep; @@ -1904,6 +1918,12 @@ static int __init init_nfs_fs(void) if (err) goto out1; +#ifdef CONFIG_NFS_DIRECTIO + err = nfs_init_directcache(); + if (err) + goto out0; +#endif + #ifdef CONFIG_PROC_FS rpc_proc_register(&nfs_rpcstat); #endif @@ -1914,8 +1934,14 @@ static int __init init_nfs_fs(void) goto out; return 0; out: +#ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); +#endif +#ifdef CONFIG_NFS_DIRECTIO + nfs_destroy_directcache(); +out0: +#endif nfs_destroy_writepagecache(); out1: nfs_destroy_readpagecache(); out2: @@ -1928,6 +1954,9 @@ out4: static void __exit exit_nfs_fs(void) { +#ifdef CONFIG_NFS_DIRECTIO + nfs_destroy_directcache(); +#endif nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_inodecache(); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/nfs3proc.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs3proc.c --- linux-2.6.10-rc1/fs/nfs/nfs3proc.c 2004-10-27 14:10:40.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs3proc.c
2004-10-27 19:08:34.000000000 -0400 @@ -80,10 +80,10 @@ nfs3_proc_get_root(struct nfs_server *se dprintk("%s: call fsinfo\n", __FUNCTION__); info->fattr->valid = 0; status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); - dprintk("%s: reply fsinfo %d\n", __FUNCTION__, status); + dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); - dprintk("%s: reply getattr %d\n", __FUNCTION__, status); + dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); } return status; } @@ -101,7 +101,7 @@ nfs3_proc_getattr(struct nfs_server *ser fattr->valid = 0; status = rpc_call(server->client, NFS3PROC_GETATTR, fhandle, fattr, 0); - dprintk("NFS reply getattr\n"); + dprintk("NFS reply getattr: %d\n", status); return status; } @@ -119,7 +119,7 @@ nfs3_proc_setattr(struct dentry *dentry, dprintk("NFS call setattr\n"); fattr->valid = 0; status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); - dprintk("NFS reply setattr\n"); + dprintk("NFS reply setattr: %d\n", status); return status; } @@ -198,7 +198,7 @@ static int nfs3_proc_access(struct inode if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) entry->mask |= MAY_EXEC; } - dprintk("NFS reply access, status = %d\n", status); + dprintk("NFS reply access: %d\n", status); return status; } @@ -296,7 +296,7 @@ static int nfs3_proc_commit(struct nfs_w * For now, we don't implement O_EXCL. 
*/ static struct inode * -nfs3_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, +nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_fh fhandle; @@ -304,8 +304,8 @@ nfs3_proc_create(struct inode *dir, stru struct nfs_fattr dir_attr; struct nfs3_createargs arg = { .fh = NFS_FH(dir), - .name = name->name, - .len = name->len, + .name = dentry->d_name.name, + .len = dentry->d_name.len, .sattr = sattr, }; struct nfs3_diropres res = { @@ -315,7 +315,7 @@ nfs3_proc_create(struct inode *dir, stru }; int status; - dprintk("NFS call create %s\n", name->name); + dprintk("NFS call create %s\n", dentry->d_name.name); arg.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { arg.createmode = NFS3_CREATE_EXCLUSIVE; @@ -353,7 +353,7 @@ exit: if (status != 0) goto out; if (fhandle.size == 0 || !(fattr.valid & NFS_ATTR_FATTR)) { - status = nfs3_proc_lookup(dir, name, &fhandle, &fattr); + status = nfs3_proc_lookup(dir, &dentry->d_name, &fhandle, &fattr); if (status != 0) goto out; } diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/nfs4proc.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs4proc.c --- linux-2.6.10-rc1/fs/nfs/nfs4proc.c 2004-10-27 14:11:07.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs4proc.c 2004-10-27 19:08:34.000000000 -0400 @@ -477,7 +477,7 @@ static struct nfs4_state *nfs4_open_dele /* * Returns an nfs4_state + an referenced inode */ -static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -491,7 +491,7 @@ static int _nfs4_do_open(struct inode *d struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .open_flags = flags, - .name = name, + .name = &dentry->d_name, .server = 
server, .bitmask = server->attr_bitmask, .claim = NFS4_OPEN_CLAIM_NULL, @@ -581,14 +581,14 @@ out_err: } -struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred) +struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred) { struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, name, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, dentry, flags, sattr, cred, &res); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -635,6 +635,8 @@ static int _nfs4_do_setattr(struct nfs_s fattr->valid = 0; + if (state != NULL) + msg.rpc_cred = state->owner->so_cred; if (sattr->ia_valid & ATTR_SIZE) nfs4_copy_stateid(&arg.stateid, state, NULL); else @@ -658,6 +660,61 @@ int nfs4_do_setattr(struct nfs_server *s return err; } +struct nfs4_closedata { + struct inode *inode; + struct nfs4_state *state; + struct nfs_closeargs arg; + struct nfs_closeres res; +}; + +static void nfs4_close_done(struct rpc_task *task) +{ + struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct nfs4_state *state = calldata->state; + struct nfs4_state_owner *sp = state->owner; + struct nfs_server *server = NFS_SERVER(calldata->inode); + + /* hmm. we are done with the inode, and in the process of freeing + * the state_owner. 
we keep this around to process errors + */ + nfs4_increment_seqid(task->tk_status, sp); + switch (task->tk_status) { + case 0: + state->state = calldata->arg.open_flags; + memcpy(&state->stateid, &calldata->res.stateid, + sizeof(state->stateid)); + break; + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + state->state = calldata->arg.open_flags; + nfs4_schedule_state_recovery(server->nfs4_state); + break; + default: + if (nfs4_async_handle_error(task, server) == -EAGAIN) { + rpc_restart_call(task); + return; + } + } + nfs4_put_open_state(state); + up(&sp->so_sema); + nfs4_put_state_owner(sp); + up_read(&server->nfs4_state->cl_sem); + kfree(calldata); +} + +static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], + .rpc_argp = &calldata->arg, + .rpc_resp = &calldata->res, + .rpc_cred = calldata->state->owner->so_cred, + }; + if (calldata->arg.open_flags != 0) + msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata); +} + /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). @@ -669,102 +726,34 @@ int nfs4_do_setattr(struct nfs_server *s * * NOTE: Caller must be holding the sp->so_owner semaphore! 
*/ -static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state) +int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) { - struct nfs4_state_owner *sp = state->owner; - int status = 0; - struct nfs_closeargs arg = { - .fh = NFS_FH(inode), - }; - struct nfs_closeres res; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; + struct nfs4_closedata *calldata; + int status; - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) + /* Tell caller we're done */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { + state->state = mode; return 0; - memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); + } + calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + return -ENOMEM; + calldata->inode = inode; + calldata->state = state; + calldata->arg.fh = NFS_FH(inode); /* Serialization for the sequence id */ - arg.seqid = sp->so_seqid, - status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); - - /* hmm. we are done with the inode, and in the process of freeing - * the state_owner. we keep this around to process errors + calldata->arg.seqid = state->owner->so_seqid; + calldata->arg.open_flags = mode; + memcpy(&calldata->arg.stateid, &state->stateid, + sizeof(calldata->arg.stateid)); + status = nfs4_close_call(NFS_SERVER(inode)->client, calldata); + /* + * Return -EINPROGRESS on success in order to indicate to the + * caller that an asynchronous RPC call has been launched, and + * that it will release the semaphores on completion. 
*/ - nfs4_increment_seqid(status, sp); - if (!status) - memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); - - return status; -} - -int nfs4_do_close(struct inode *inode, struct nfs4_state *state) -{ - struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_do_close(inode, state); - switch (err) { - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs4_state); - err = 0; - default: - state->state = 0; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; -} - -static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) -{ - struct nfs4_state_owner *sp = state->owner; - int status = 0; - struct nfs_closeargs arg = { - .fh = NFS_FH(inode), - .seqid = sp->so_seqid, - .open_flags = mode, - }; - struct nfs_closeres res; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - return 0; - memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); - status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_seqid(status, sp); - if (!status) - memcpy(&state->stateid, &res.stateid, sizeof(state->stateid)); - - return status; -} - -int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) -{ - struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_do_downgrade(inode, state, mode); - switch (err) { - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs4_state); - err = 0; - default: - state->state = mode; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + return (status == 0) ? 
-EINPROGRESS : status; } struct inode * @@ -785,7 +774,7 @@ nfs4_atomic_open(struct inode *dir, stru } cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); - state = nfs4_do_open(dir, &dentry->d_name, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) return (struct inode *)state; @@ -802,7 +791,7 @@ nfs4_open_revalidate(struct inode *dir, cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); state = nfs4_open_delegated(dentry->d_inode, openflags, cred); if (IS_ERR(state)) - state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred); + state = nfs4_do_open(dir, dentry, openflags, NULL, cred); put_rpccred(cred); if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0) return 1; @@ -1026,7 +1015,7 @@ nfs4_proc_setattr(struct dentry *dentry, FMODE_WRITE, cred); if (IS_ERR(state)) state = nfs4_do_open(dentry->d_parent->d_inode, - &dentry->d_name, FMODE_WRITE, + dentry, FMODE_WRITE, NULL, cred); need_iput = 1; } @@ -1327,7 +1316,7 @@ static int nfs4_proc_commit(struct nfs_w */ static struct inode * -nfs4_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, +nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct inode *inode; @@ -1335,7 +1324,7 @@ nfs4_proc_create(struct inode *dir, stru struct rpc_cred *cred; cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); - state = nfs4_do_open(dir, name, flags, sattr, cred); + state = nfs4_do_open(dir, dentry, flags, sattr, cred); put_rpccred(cred); if (!IS_ERR(state)) { inode = state->inode; diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/nfs4state.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs4state.c --- linux-2.6.10-rc1/fs/nfs/nfs4state.c 2004-10-27 14:10:16.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/nfs4state.c 2004-10-27 19:08:34.000000000 -0400 @@ -445,7 +445,7 @@ nfs4_get_open_state(struct inode *inode, 
state->owner = owner; atomic_inc(&owner->so_count); list_add(&state->inode_states, &nfsi->open_states); - state->inode = inode; + state->inode = igrab(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); @@ -471,6 +471,7 @@ void nfs4_put_open_state(struct nfs4_sta list_del(&state->inode_states); spin_unlock(&inode->i_lock); list_del(&state->open_states); + iput(inode); BUG_ON (state->state != 0); nfs4_free_open_state(state); nfs4_put_state_owner(owner); @@ -486,7 +487,6 @@ void nfs4_close_state(struct nfs4_state struct nfs4_state_owner *owner = state->owner; struct nfs4_client *clp = owner->so_client; int newstate; - int status = 0; atomic_inc(&owner->so_count); down_read(&clp->cl_sem); @@ -508,10 +508,8 @@ void nfs4_close_state(struct nfs4_state newstate |= FMODE_WRITE; if (state->state == newstate) goto out; - if (newstate != 0) - status = nfs4_do_downgrade(inode, state, newstate); - else - status = nfs4_do_close(inode, state); + if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS) + return; } out: nfs4_put_open_state(state); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/proc.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/proc.c --- linux-2.6.10-rc1/fs/nfs/proc.c 2004-10-27 14:11:24.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/proc.c 2004-10-27 19:08:34.000000000 -0400 @@ -63,12 +63,12 @@ nfs_proc_get_root(struct nfs_server *ser dprintk("%s: call getattr\n", __FUNCTION__); fattr->valid = 0; status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); - dprintk("%s: reply getattr %d\n", __FUNCTION__, status); + dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); if (status) return status; dprintk("%s: call statfs\n", __FUNCTION__); status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0); - dprintk("%s: reply statfs %d\n", __FUNCTION__, status); + dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); if (status) return status; info->rtmax = NFS_MAXDATA; @@ -96,7 +96,7 @@ 
nfs_proc_getattr(struct nfs_server *serv fattr->valid = 0; status = rpc_call(server->client, NFSPROC_GETATTR, fhandle, fattr, 0); - dprintk("NFS reply getattr\n"); + dprintk("NFS reply getattr: %d\n", status); return status; } @@ -114,7 +114,7 @@ nfs_proc_setattr(struct dentry *dentry, dprintk("NFS call setattr\n"); fattr->valid = 0; status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); - dprintk("NFS reply setattr\n"); + dprintk("NFS reply setattr: %d\n", status); return status; } @@ -213,15 +213,15 @@ static int nfs_proc_write(struct nfs_wri } static struct inode * -nfs_proc_create(struct inode *dir, struct qstr *name, struct iattr *sattr, +nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_fh fhandle; struct nfs_fattr fattr; struct nfs_createargs arg = { .fh = NFS_FH(dir), - .name = name->name, - .len = name->len, + .name = dentry->d_name.name, + .len = dentry->d_name.len, .sattr = sattr }; struct nfs_diropok res = { @@ -231,7 +231,7 @@ nfs_proc_create(struct inode *dir, struc int status; fattr.valid = 0; - dprintk("NFS call create %s\n", name->name); + dprintk("NFS call create %s\n", dentry->d_name.name); status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); dprintk("NFS reply create: %d\n", status); if (status == 0) { diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/read.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/read.c --- linux-2.6.10-rc1/fs/nfs/read.c 2004-10-27 14:10:15.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/read.c 2004-10-27 19:08:34.000000000 -0400 @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -39,25 +38,11 @@ static void nfs_readpage_result_partial( static void nfs_readpage_result_full(struct nfs_read_data *, int); static kmem_cache_t *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; +mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) -static struct nfs_read_data *nfs_readdata_alloc(void) -{ - 
struct nfs_read_data *p; - p = (struct nfs_read_data *)mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); - if (p) - memset(p, 0, sizeof(*p)); - return p; -} - -static __inline__ void nfs_readdata_free(struct nfs_read_data *p) -{ - mempool_free(p, nfs_rdata_mempool); -} - -static void nfs_readdata_release(struct rpc_task *task) +void nfs_readdata_release(struct rpc_task *task) { struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; nfs_readdata_free(data); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/unlink.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/unlink.c --- linux-2.6.10-rc1/fs/nfs/unlink.c 2004-10-27 14:11:24.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/unlink.c 2004-10-27 19:08:33.000000000 -0400 @@ -215,7 +215,6 @@ nfs_complete_unlink(struct dentry *dentr spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; spin_unlock(&dentry->d_lock); - if (data->task.tk_rpcwait == &nfs_delete_queue) - rpc_wake_up_task(&data->task); + rpc_wake_up_task(&data->task); nfs_put_unlinkdata(data); } diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/fs/nfs/write.c linux-2.6.10-rc1-NFS_ALL/fs/nfs/write.c --- linux-2.6.10-rc1/fs/nfs/write.c 2004-10-27 14:11:06.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/fs/nfs/write.c 2004-10-27 19:08:34.000000000 -0400 @@ -61,7 +61,6 @@ #include #include #include -#include #include "delegation.h" @@ -83,49 +82,17 @@ static int nfs_wait_on_write_congestion( static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int); static kmem_cache_t *nfs_wdata_cachep; -static mempool_t *nfs_wdata_mempool; -static mempool_t *nfs_commit_mempool; +mempool_t *nfs_wdata_mempool; +mempool_t *nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) -{ - struct nfs_write_data *p; - p = (struct nfs_write_data *)mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); - if (p) { - memset(p, 0, sizeof(*p)); - 
INIT_LIST_HEAD(&p->pages); - } - return p; -} - -static __inline__ void nfs_writedata_free(struct nfs_write_data *p) -{ - mempool_free(p, nfs_wdata_mempool); -} - -static void nfs_writedata_release(struct rpc_task *task) +void nfs_writedata_release(struct rpc_task *task) { struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_writedata_free(wdata); } -static __inline__ struct nfs_write_data *nfs_commit_alloc(void) -{ - struct nfs_write_data *p; - p = (struct nfs_write_data *)mempool_alloc(nfs_commit_mempool, SLAB_NOFS); - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - } - return p; -} - -static __inline__ void nfs_commit_free(struct nfs_write_data *p) -{ - mempool_free(p, nfs_commit_mempool); -} - /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { @@ -184,11 +151,10 @@ static int nfs_writepage_sync(struct nfs int result, written = 0; struct nfs_write_data *wdata; - wdata = kmalloc(sizeof(*wdata), GFP_NOFS); + wdata = nfs_writedata_alloc(); if (!wdata) return -ENOMEM; - memset(wdata, 0, sizeof(*wdata)); wdata->flags = how; wdata->cred = ctx->cred; wdata->inode = inode; @@ -238,8 +204,7 @@ static int nfs_writepage_sync(struct nfs io_error: nfs_end_data_update_defer(inode); - - kfree(wdata); + nfs_writedata_free(wdata); return written ? written : result; } @@ -1199,7 +1164,8 @@ void nfs_writeback_done(struct rpc_task } if (time_before(complain, jiffies)) { printk(KERN_WARNING - "NFS: Server wrote less than requested.\n"); + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); complain = jiffies + 300 * HZ; } /* Can't do anything about it except throw an error. 
*/ diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/dcache.h linux-2.6.10-rc1-NFS_ALL/include/linux/dcache.h --- linux-2.6.10-rc1/include/linux/dcache.h 2004-10-27 14:11:35.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/dcache.h 2004-10-27 19:08:34.000000000 -0400 @@ -199,6 +199,7 @@ static inline int dname_external(struct * These are the low-level FS interfaces to the dcache.. */ extern void d_instantiate(struct dentry *, struct inode *); +extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern void d_delete(struct dentry *); /* allocate/de-allocate */ @@ -242,6 +243,23 @@ static inline void d_add(struct dentry * d_rehash(entry); } +/** + * d_add_unique - add dentry to hash queues without aliasing + * @entry: dentry to add + * @inode: The inode to attach to this dentry + * + * This adds the entry to the hash queues and initializes @inode. + * The entry was actually filled in earlier during d_alloc(). + */ +static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *res; + + res = d_instantiate_unique(entry, inode); + d_rehash(res != NULL ? 
res : entry); + return res; +} + /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/fs.h linux-2.6.10-rc1-NFS_ALL/include/linux/fs.h --- linux-2.6.10-rc1/include/linux/fs.h 2004-10-27 14:10:25.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/fs.h 2004-10-27 19:08:34.000000000 -0400 @@ -1174,11 +1174,6 @@ extern long do_mount(char *, char *, cha extern int vfs_statfs(struct super_block *, struct kstatfs *); -/* Return value for VFS lock functions - tells locks.c to lock conventionally - * REALLY kosha for root NFS and nfs_lock - */ -#define LOCK_USE_CLNT 1 - #define FLOCK_VERIFY_READ 1 #define FLOCK_VERIFY_WRITE 2 diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/nfs_fs.h linux-2.6.10-rc1-NFS_ALL/include/linux/nfs_fs.h --- linux-2.6.10-rc1/include/linux/nfs_fs.h 2004-10-27 14:10:27.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/nfs_fs.h 2004-10-27 19:08:34.000000000 -0400 @@ -30,6 +30,7 @@ #include #include #include +#include /* * Enable debugging support for nfs client. 
@@ -201,6 +202,7 @@ struct nfs_inode { #define NFS_INO_INVALID_ATTR 0x0008 /* cached attrs are invalid */ #define NFS_INO_INVALID_DATA 0x0010 /* cached data is invalid */ #define NFS_INO_INVALID_ATIME 0x0020 /* cached atime is invalid */ +#define NFS_INO_INVALID_ACCESS 0x0040 /* cached access cred invalid */ static inline struct nfs_inode *NFS_I(struct inode *inode) { @@ -239,7 +241,7 @@ static inline int nfs_caches_unstable(st static inline void NFS_CACHEINV(struct inode *inode) { if (!nfs_caches_unstable(inode)) - NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR; + NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; } static inline int nfs_server_capable(struct inode *inode, int cap) @@ -424,6 +426,44 @@ static inline int nfs_wb_page(struct ino return nfs_wb_page_priority(inode, page, 0); } +/* + * Allocate and free nfs_write_data structures + */ +extern mempool_t *nfs_wdata_mempool; +extern mempool_t *nfs_commit_mempool; + +static inline struct nfs_write_data *nfs_writedata_alloc(void) +{ + struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS); + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} + +static inline void nfs_writedata_free(struct nfs_write_data *p) +{ + mempool_free(p, nfs_wdata_mempool); +} + +extern void nfs_writedata_release(struct rpc_task *task); + +static inline struct nfs_write_data *nfs_commit_alloc(void) +{ + struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); + } + return p; +} + +static inline void nfs_commit_free(struct nfs_write_data *p) +{ + mempool_free(p, nfs_commit_mempool); +} + /* Hack for future NFS swap support */ #ifndef IS_SWAPFILE # define IS_SWAPFILE(inode) (0) @@ -439,6 +479,26 @@ extern int nfs_pagein_list(struct list_ extern void nfs_readpage_result(struct rpc_task *); /* + * Allocate and free nfs_read_data structures + */ +extern mempool_t *nfs_rdata_mempool; + +static inline 
struct nfs_read_data *nfs_readdata_alloc(void) +{ + struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS); + if (p) + memset(p, 0, sizeof(*p)); + return p; +} + +static inline void nfs_readdata_free(struct nfs_read_data *p) +{ + mempool_free(p, nfs_rdata_mempool); +} + +extern void nfs_readdata_release(struct rpc_task *task); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -651,8 +711,7 @@ extern int nfs4_proc_setclientid_confirm extern int nfs4_open_reclaim(struct nfs4_state_owner *, struct nfs4_state *); extern int nfs4_proc_async_renew(struct nfs4_client *); extern int nfs4_proc_renew(struct nfs4_client *); -extern int nfs4_do_close(struct inode *, struct nfs4_state *); -extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); +extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/nfs_xdr.h linux-2.6.10-rc1-NFS_ALL/include/linux/nfs_xdr.h --- linux-2.6.10-rc1/include/linux/nfs_xdr.h 2004-10-27 14:11:07.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/nfs_xdr.h 2004-10-27 19:08:34.000000000 -0400 @@ -681,7 +681,7 @@ struct nfs_rpc_ops { int (*read) (struct nfs_read_data *); int (*write) (struct nfs_write_data *); int (*commit) (struct nfs_write_data *); - struct inode * (*create) (struct inode *, struct qstr *, + struct inode * (*create) (struct inode *, struct dentry *, struct iattr *, int); int (*remove) (struct inode *, struct qstr *); int (*unlink_setup) (struct rpc_message *, diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/sunrpc/auth.h linux-2.6.10-rc1-NFS_ALL/include/linux/sunrpc/auth.h --- 
linux-2.6.10-rc1/include/linux/sunrpc/auth.h 2004-10-27 14:10:40.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/sunrpc/auth.h 2004-10-27 19:08:34.000000000 -0400 @@ -51,7 +51,6 @@ struct rpc_cred { }; #define RPCAUTH_CRED_LOCKED 0x0001 #define RPCAUTH_CRED_UPTODATE 0x0002 -#define RPCAUTH_CRED_DEAD 0x0004 #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 @@ -133,7 +132,6 @@ int rpcauth_unwrap_resp(struct rpc_tas int rpcauth_refreshcred(struct rpc_task *); void rpcauth_invalcred(struct rpc_task *); int rpcauth_uptodatecred(struct rpc_task *); -int rpcauth_deadcred(struct rpc_task *); void rpcauth_init_credcache(struct rpc_auth *); void rpcauth_free_credcache(struct rpc_auth *); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/include/linux/sunrpc/sched.h linux-2.6.10-rc1-NFS_ALL/include/linux/sunrpc/sched.h --- linux-2.6.10-rc1/include/linux/sunrpc/sched.h 2004-10-27 14:10:59.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/include/linux/sunrpc/sched.h 2004-10-27 19:08:34.000000000 -0400 @@ -11,7 +11,9 @@ #include <linux/timer.h> #include <linux/sunrpc/types.h> +#include <linux/spinlock.h> #include <linux/wait.h> +#include <linux/workqueue.h> #include <linux/sunrpc/xdr.h> /* @@ -25,11 +27,18 @@ struct rpc_message { struct rpc_cred * rpc_cred; /* Credentials */ }; +struct rpc_wait_queue; +struct rpc_wait { + struct list_head list; /* wait queue links */ + struct list_head links; /* Links to related tasks */ + wait_queue_head_t waitq; /* sync: sleep on this q */ + struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */ +}; + /* * This is the RPC task struct */ struct rpc_task { - struct list_head tk_list; /* wait queue links */ #ifdef RPC_DEBUG unsigned long tk_magic; /* 0xf00baa */ #endif @@ -37,7 +46,6 @@ struct rpc_task { struct rpc_clnt * tk_client; /* RPC client */ struct rpc_rqst * tk_rqstp; /* RPC request */ int tk_status; /* result of last operation */ - struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */ /* * RPC call state @@ -70,13 +78,18 @@ struct rpc_task { * you have a pathological interest in kernel oopses.
*/ struct timer_list tk_timer; /* kernel timer */ - wait_queue_head_t tk_wait; /* sync: sleep on this q */ unsigned long tk_timeout; /* timeout for rpc_sleep() */ unsigned short tk_flags; /* misc flags */ unsigned char tk_active : 1;/* Task has been activated */ unsigned char tk_priority : 2;/* Task priority */ unsigned long tk_runstate; /* Task run status */ - struct list_head tk_links; /* links to related tasks */ + struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could + * be any workqueue + */ + union { + struct work_struct tk_work; /* Async task work queue */ + struct rpc_wait tk_wait; /* RPC wait */ + } u; #ifdef RPC_DEBUG unsigned short tk_pid; /* debugging aid */ #endif @@ -87,11 +100,11 @@ struct rpc_task { /* support walking a list of tasks on a wait queue */ #define task_for_each(task, pos, head) \ list_for_each(pos, head) \ - if ((task=list_entry(pos, struct rpc_task, tk_list)),1) + if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1) #define task_for_first(task, head) \ if (!list_empty(head) && \ - ((task=list_entry((head)->next, struct rpc_task, tk_list)),1)) + ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1)) /* .. 
and walking list of all tasks */ #define alltask_for_each(task, pos, head) \ @@ -126,22 +139,39 @@ typedef void (*rpc_action)(struct rpc_ #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) #define RPC_TASK_UNINTERRUPTIBLE(t) ((t)->tk_flags & RPC_TASK_NOINTR) -#define RPC_TASK_SLEEPING 0 -#define RPC_TASK_RUNNING 1 -#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) -#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +#define RPC_TASK_RUNNING 0 +#define RPC_TASK_QUEUED 1 +#define RPC_TASK_WAKEUP 2 +#define RPC_TASK_HAS_TIMER 3 +#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) -#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +#define rpc_test_and_set_running(t) \ + (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +#define rpc_clear_running(t) \ + do { \ + smp_mb__before_clear_bit(); \ + clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ + } while (0) -#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) +#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) +#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) +#define rpc_clear_queued(t) \ + do { \ + smp_mb__before_clear_bit(); \ + clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ + } while (0) -#define rpc_clear_sleeping(t) \ +#define rpc_start_wakeup(t) \ + (test_and_set_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate) == 0) +#define rpc_finish_wakeup(t) \ do { \ smp_mb__before_clear_bit(); \ - clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \ + clear_bit(RPC_TASK_WAKEUP, &(t)->tk_runstate); \ smp_mb__after_clear_bit(); \ - } while(0) + } while (0) /* * Task priorities. 
@@ -157,6 +187,7 @@ typedef void (*rpc_action)(struct rpc_ * RPC synchronization objects */ struct rpc_wait_queue { + spinlock_t lock; struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ unsigned long cookie; /* cookie of last task serviced */ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ @@ -177,6 +208,7 @@ struct rpc_wait_queue { #ifndef RPC_DEBUG # define RPC_WAITQ_INIT(var,qname) { \ + .lock = SPIN_LOCK_UNLOCKED, \ .tasks = { \ [0] = LIST_HEAD_INIT(var.tasks[0]), \ [1] = LIST_HEAD_INIT(var.tasks[1]), \ @@ -185,6 +217,7 @@ struct rpc_wait_queue { } #else # define RPC_WAITQ_INIT(var,qname) { \ + .lock = SPIN_LOCK_UNLOCKED, \ .tasks = { \ [0] = LIST_HEAD_INIT(var.tasks[0]), \ [1] = LIST_HEAD_INIT(var.tasks[1]), \ @@ -209,13 +242,10 @@ void rpc_killall_tasks(struct rpc_clnt int rpc_execute(struct rpc_task *); void rpc_run_child(struct rpc_task *parent, struct rpc_task *child, rpc_action action); -int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *); -void rpc_remove_wait_queue(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, rpc_action action, rpc_action timer); -void rpc_add_timer(struct rpc_task *, rpc_action); void rpc_wake_up_task(struct rpc_task *); void rpc_wake_up(struct rpc_wait_queue *); struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/net/sunrpc/auth.c linux-2.6.10-rc1-NFS_ALL/net/sunrpc/auth.c --- linux-2.6.10-rc1/net/sunrpc/auth.c 2004-10-27 14:10:40.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/net/sunrpc/auth.c 2004-10-27 19:08:34.000000000 -0400 @@ -214,8 +214,6 @@ retry: list_for_each_safe(pos, next, &auth->au_credcache[nr]) { struct rpc_cred *entry; entry = list_entry(pos, struct rpc_cred, cr_hash); - if (entry->cr_flags 
& RPCAUTH_CRED_DEAD) - continue; if (rpcauth_prune_expired(entry, &free)) continue; if (entry->cr_ops->crmatch(acred, entry, taskflags)) { @@ -307,9 +305,6 @@ put_rpccred(struct rpc_cred *cred) if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) return; - if ((cred->cr_flags & RPCAUTH_CRED_DEAD) && !list_empty(&cred->cr_hash)) - list_del_init(&cred->cr_hash); - if (list_empty(&cred->cr_hash)) { spin_unlock(&rpc_credcache_lock); rpcauth_crdestroy(cred); @@ -413,10 +408,3 @@ rpcauth_uptodatecred(struct rpc_task *ta return !(task->tk_msg.rpc_cred) || (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE); } - -int -rpcauth_deadcred(struct rpc_task *task) -{ - return !(task->tk_msg.rpc_cred) || - (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_DEAD); -} diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/net/sunrpc/auth_gss/auth_gss.c linux-2.6.10-rc1-NFS_ALL/net/sunrpc/auth_gss/auth_gss.c --- linux-2.6.10-rc1/net/sunrpc/auth_gss/auth_gss.c 2004-10-27 14:10:34.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/net/sunrpc/auth_gss/auth_gss.c 2004-10-27 19:08:34.000000000 -0400 @@ -480,12 +480,14 @@ gss_pipe_downcall(struct file *filp, con if (!cred) goto err; if (gss_err) - cred->cr_flags |= RPCAUTH_CRED_DEAD; + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; else gss_cred_set_ctx(cred, ctx); spin_lock(&gss_auth->lock); gss_msg = __gss_find_upcall(gss_auth, acred.uid); if (gss_msg) { + if (gss_err) + gss_msg->msg.errno = -EACCES; __gss_unhash_msg(gss_msg); spin_unlock(&gss_auth->lock); gss_release_msg(gss_msg); @@ -740,7 +742,9 @@ gss_marshal(struct rpc_task *task, u32 * maj_stat = gss_get_mic(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, &verf_buf, &mic); - if(maj_stat != 0){ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) { + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + } else if (maj_stat != 0) { printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); goto out_put_ctx; } @@ -779,6 +783,7 @@ gss_validate(struct rpc_task *task, u32 struct xdr_netobj mic; u32 
flav,len; u32 service; + u32 maj_stat; dprintk("RPC: %4u gss_validate\n", task->tk_pid); @@ -794,8 +799,11 @@ gss_validate(struct rpc_task *task, u32 mic.data = (u8 *)p; mic.len = len; - if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state)) - goto out_bad; + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat) + goto out_bad; service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type, gss_cred->gc_flavor); switch (service) { @@ -821,11 +829,10 @@ out_bad: } static inline int -gss_wrap_req_integ(struct gss_cl_ctx *ctx, - kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) { - struct rpc_rqst *req = (struct rpc_rqst *)rqstp; - struct xdr_buf *snd_buf = &req->rq_snd_buf; + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; struct xdr_buf integ_buf; u32 *integ_len = NULL; struct xdr_netobj mic; @@ -836,7 +843,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct integ_len = p++; offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; - *p++ = htonl(req->rq_seqno); + *p++ = htonl(rqstp->rq_seqno); status = encode(rqstp, p, obj); if (status) @@ -858,7 +865,9 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct maj_stat = gss_get_mic(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, &integ_buf, &mic); status = -EIO; /* XXX? 
*/ - if (maj_stat) + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) return status; q = xdr_encode_opaque(p, NULL, mic.len); @@ -894,7 +903,8 @@ gss_wrap_req(struct rpc_task *task, status = encode(rqstp, p, obj); goto out; case RPC_GSS_SVC_INTEGRITY: - status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj); + status = gss_wrap_req_integ(cred, ctx, encode, + rqstp, p, obj); goto out; case RPC_GSS_SVC_PRIVACY: default: @@ -907,11 +917,10 @@ out: } static inline int -gss_unwrap_resp_integ(struct gss_cl_ctx *ctx, - kxdrproc_t decode, void *rqstp, u32 **p, void *obj) +gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) { - struct rpc_rqst *req = (struct rpc_rqst *)rqstp; - struct xdr_buf *rcv_buf = &req->rq_rcv_buf; + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; struct xdr_buf integ_buf; struct xdr_netobj mic; u32 data_offset, mic_offset; @@ -926,7 +935,7 @@ gss_unwrap_resp_integ(struct gss_cl_ctx mic_offset = integ_len + data_offset; if (mic_offset > rcv_buf->len) return status; - if (ntohl(*(*p)++) != req->rq_seqno) + if (ntohl(*(*p)++) != rqstp->rq_seqno) return status; if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, @@ -938,6 +947,8 @@ gss_unwrap_resp_integ(struct gss_cl_ctx maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic, NULL); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) return status; return 0; @@ -962,8 +973,7 @@ gss_unwrap_resp(struct rpc_task *task, case RPC_GSS_SVC_NONE: goto out_decode; case RPC_GSS_SVC_INTEGRITY: - status = gss_unwrap_resp_integ(ctx, decode, - rqstp, &p, obj); + status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); if (status) goto out; break; diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/net/sunrpc/clnt.c linux-2.6.10-rc1-NFS_ALL/net/sunrpc/clnt.c --- linux-2.6.10-rc1/net/sunrpc/clnt.c 2004-10-27 14:10:27.000000000 
-0400 +++ linux-2.6.10-rc1-NFS_ALL/net/sunrpc/clnt.c 2004-10-27 19:08:34.000000000 -0400 @@ -928,7 +928,7 @@ call_refreshresult(struct rpc_task *task task->tk_action = call_reserve; if (status >= 0 && rpcauth_uptodatecred(task)) return; - if (rpcauth_deadcred(task)) { + if (status == -EACCES) { rpc_exit(task, -EACCES); return; } diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/net/sunrpc/sched.c linux-2.6.10-rc1-NFS_ALL/net/sunrpc/sched.c --- linux-2.6.10-rc1/net/sunrpc/sched.c 2004-10-27 14:10:47.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/net/sunrpc/sched.c 2004-10-27 19:08:34.000000000 -0400 @@ -41,13 +41,7 @@ static mempool_t *rpc_buffer_mempool; static void __rpc_default_timer(struct rpc_task *task); static void rpciod_killall(void); - -/* - * When an asynchronous RPC task is activated within a bottom half - * handler, or while executing another RPC task, it is put on - * schedq, and rpciod is woken up. - */ -static RPC_WAITQ(schedq, "schedq"); +static void rpc_async_schedule(void *); /* * RPC tasks that create another task (e.g. for contacting the portmapper) @@ -68,26 +62,18 @@ static LIST_HEAD(all_tasks); /* * rpciod-related stuff */ -static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle); -static DECLARE_COMPLETION(rpciod_killer); static DECLARE_MUTEX(rpciod_sema); static unsigned int rpciod_users; -static pid_t rpciod_pid; -static int rpc_inhibit; +static struct workqueue_struct *rpciod_workqueue; /* - * Spinlock for wait queues. Access to the latter also has to be - * interrupt-safe in order to allow timers to wake up sleeping tasks. - */ -static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED; -/* * Spinlock for other critical sections of code. */ static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED; /* * Disable the timer for a given RPC task. Should be called with - * rpc_queue_lock and bh_disabled in order to avoid races within + * queue->lock and bh_disabled in order to avoid races within * rpc_run_timer(). 
*/ static inline void @@ -105,19 +91,19 @@ __rpc_disable_timer(struct rpc_task *tas * without calling del_timer_sync(). The latter could cause a * deadlock if called while we're holding spinlocks... */ -static void -rpc_run_timer(struct rpc_task *task) +static void rpc_run_timer(struct rpc_task *task) { void (*callback)(struct rpc_task *); - spin_lock_bh(&rpc_queue_lock); callback = task->tk_timeout_fn; task->tk_timeout_fn = NULL; - spin_unlock_bh(&rpc_queue_lock); - if (callback) { + if (callback && RPC_IS_QUEUED(task)) { dprintk("RPC: %4d running timer\n", task->tk_pid); callback(task); } + smp_mb__before_clear_bit(); + clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + smp_mb__after_clear_bit(); } /* @@ -136,29 +122,21 @@ __rpc_add_timer(struct rpc_task *task, r task->tk_timeout_fn = timer; else task->tk_timeout_fn = __rpc_default_timer; + set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); mod_timer(&task->tk_timer, jiffies + task->tk_timeout); } /* - * Set up a timer for an already sleeping task. - */ -void rpc_add_timer(struct rpc_task *task, rpc_action timer) -{ - spin_lock_bh(&rpc_queue_lock); - if (!RPC_IS_RUNNING(task)) - __rpc_add_timer(task, timer); - spin_unlock_bh(&rpc_queue_lock); -} - -/* * Delete any timer for the current task. Because we use del_timer_sync(), - * this function should never be called while holding rpc_queue_lock. + * this function should never be called while holding queue->lock. 
*/ static inline void rpc_delete_timer(struct rpc_task *task) { - if (del_timer_sync(&task->tk_timer)) + if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { + del_singleshot_timer_sync(&task->tk_timer); dprintk("RPC: %4d deleting timer\n", task->tk_pid); + } } /* @@ -169,16 +147,17 @@ static void __rpc_add_wait_queue_priorit struct list_head *q; struct rpc_task *t; + INIT_LIST_HEAD(&task->u.tk_wait.links); q = &queue->tasks[task->tk_priority]; if (unlikely(task->tk_priority > queue->maxpriority)) q = &queue->tasks[queue->maxpriority]; - list_for_each_entry(t, q, tk_list) { + list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_cookie == task->tk_cookie) { - list_add_tail(&task->tk_list, &t->tk_links); + list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); return; } } - list_add_tail(&task->tk_list, q); + list_add_tail(&task->u.tk_wait.list, q); } /* @@ -189,37 +168,21 @@ static void __rpc_add_wait_queue_priorit * improve overall performance. * Everyone else gets appended to the queue to ensure proper FIFO behavior. 
*/ -static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) { - if (task->tk_rpcwait == queue) - return 0; + BUG_ON (RPC_IS_QUEUED(task)); - if (task->tk_rpcwait) { - printk(KERN_WARNING "RPC: doubly enqueued task!\n"); - return -EWOULDBLOCK; - } if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task); else if (RPC_IS_SWAPPER(task)) - list_add(&task->tk_list, &queue->tasks[0]); + list_add(&task->u.tk_wait.list, &queue->tasks[0]); else - list_add_tail(&task->tk_list, &queue->tasks[0]); - task->tk_rpcwait = queue; + list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); + task->u.tk_wait.rpc_waitq = queue; + rpc_set_queued(task); dprintk("RPC: %4d added to queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); - - return 0; -} - -int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) -{ - int result; - - spin_lock_bh(&rpc_queue_lock); - result = __rpc_add_wait_queue(q, task); - spin_unlock_bh(&rpc_queue_lock); - return result; } /* @@ -229,12 +192,12 @@ static void __rpc_remove_wait_queue_prio { struct rpc_task *t; - if (!list_empty(&task->tk_links)) { - t = list_entry(task->tk_links.next, struct rpc_task, tk_list); - list_move(&t->tk_list, &task->tk_list); - list_splice_init(&task->tk_links, &t->tk_links); + if (!list_empty(&task->u.tk_wait.links)) { + t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); + list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); + list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); } - list_del(&task->tk_list); + list_del(&task->u.tk_wait.list); } /* @@ -243,31 +206,17 @@ static void __rpc_remove_wait_queue_prio */ static void __rpc_remove_wait_queue(struct rpc_task *task) { - struct rpc_wait_queue *queue = task->tk_rpcwait; - - if (!queue) - return; + struct rpc_wait_queue *queue; + queue = task->u.tk_wait.rpc_waitq; if (RPC_IS_PRIORITY(queue)) 
__rpc_remove_wait_queue_priority(task); else - list_del(&task->tk_list); - task->tk_rpcwait = NULL; - + list_del(&task->u.tk_wait.list); dprintk("RPC: %4d removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); } -void -rpc_remove_wait_queue(struct rpc_task *task) -{ - if (!task->tk_rpcwait) - return; - spin_lock_bh(&rpc_queue_lock); - __rpc_remove_wait_queue(task); - spin_unlock_bh(&rpc_queue_lock); -} - static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { queue->priority = priority; @@ -290,6 +239,7 @@ static void __rpc_init_priority_wait_que { int i; + spin_lock_init(&queue->lock); for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) INIT_LIST_HEAD(&queue->tasks[i]); queue->maxpriority = maxprio; @@ -316,34 +266,31 @@ EXPORT_SYMBOL(rpc_init_wait_queue); * Note: If the task is ASYNC, this must be called with * the spinlock held to protect the wait queue operation. */ -static inline void -rpc_make_runnable(struct rpc_task *task) +static void rpc_make_runnable(struct rpc_task *task) { - if (task->tk_timeout_fn) { - printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n"); + int do_ret; + + BUG_ON(task->tk_timeout_fn); + do_ret = rpc_test_and_set_running(task); + rpc_clear_queued(task); + if (do_ret) return; - } - rpc_set_running(task); if (RPC_IS_ASYNC(task)) { - if (RPC_IS_SLEEPING(task)) { - int status; - status = __rpc_add_wait_queue(&schedq, task); - if (status < 0) { - printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); - task->tk_status = status; - return; - } - rpc_clear_sleeping(task); - wake_up(&rpciod_idle); + int status; + + INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); + status = queue_work(task->tk_workqueue, &task->u.tk_work); + if (status < 0) { + printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); + task->tk_status = status; + return; } - } else { - rpc_clear_sleeping(task); - wake_up(&task->tk_wait); - } + } else + 
wake_up(&task->u.tk_wait.waitq); } /* - * Place a newly initialized task on the schedq. + * Place a newly initialized task on the workqueue. */ static inline void rpc_schedule_run(struct rpc_task *task) @@ -352,33 +299,18 @@ rpc_schedule_run(struct rpc_task *task) if (RPC_IS_ACTIVATED(task)) return; task->tk_active = 1; - rpc_set_sleeping(task); rpc_make_runnable(task); } /* - * For other people who may need to wake the I/O daemon - * but should (for now) know nothing about its innards - */ -void rpciod_wake_up(void) -{ - if(rpciod_pid==0) - printk(KERN_ERR "rpciod: wot no daemon?\n"); - wake_up(&rpciod_idle); -} - -/* * Prepare for sleeping on a wait queue. * By always appending tasks to the list we ensure FIFO behavior. * NB: An RPC task will only receive interrupt-driven events as long * as it's on a wait queue. */ -static void -__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, +static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, rpc_action action, rpc_action timer) { - int status; - dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, rpc_qname(q), jiffies); @@ -388,49 +320,36 @@ __rpc_sleep_on(struct rpc_wait_queue *q, } /* Mark the task as being activated if so needed */ - if (!RPC_IS_ACTIVATED(task)) { + if (!RPC_IS_ACTIVATED(task)) task->tk_active = 1; - rpc_set_sleeping(task); - } - status = __rpc_add_wait_queue(q, task); - if (status) { - printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); - task->tk_status = status; - } else { - rpc_clear_running(task); - if (task->tk_callback) { - dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid); - BUG(); - } - task->tk_callback = action; - __rpc_add_timer(task, timer); - } + __rpc_add_wait_queue(q, task); + + BUG_ON(task->tk_callback != NULL); + task->tk_callback = action; + __rpc_add_timer(task, timer); } -void -rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, +void rpc_sleep_on(struct rpc_wait_queue 
*q, struct rpc_task *task, rpc_action action, rpc_action timer) { /* * Protect the queue operations. */ - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&q->lock); __rpc_sleep_on(q, task, action, timer); - spin_unlock_bh(&rpc_queue_lock); + spin_unlock_bh(&q->lock); } /** - * __rpc_wake_up_task - wake up a single rpc_task + * __rpc_do_wake_up_task - wake up a single rpc_task * @task: task to be woken up * - * Caller must hold rpc_queue_lock + * Caller must hold queue->lock, and have cleared the task queued flag. */ -static void -__rpc_wake_up_task(struct rpc_task *task) +static void __rpc_do_wake_up_task(struct rpc_task *task) { - dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n", - task->tk_pid, jiffies, rpc_inhibit); + dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); #ifdef RPC_DEBUG if (task->tk_magic != 0xf00baa) { @@ -445,12 +364,9 @@ __rpc_wake_up_task(struct rpc_task *task printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); return; } - if (RPC_IS_RUNNING(task)) - return; __rpc_disable_timer(task); - if (task->tk_rpcwait != &schedq) - __rpc_remove_wait_queue(task); + __rpc_remove_wait_queue(task); rpc_make_runnable(task); @@ -458,6 +374,18 @@ __rpc_wake_up_task(struct rpc_task *task } /* + * Wake up the specified task + */ +static void __rpc_wake_up_task(struct rpc_task *task) +{ + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) + __rpc_do_wake_up_task(task); + rpc_finish_wakeup(task); + } +} + +/* * Default timeout handler if none specified by user */ static void @@ -471,14 +399,18 @@ __rpc_default_timer(struct rpc_task *tas /* * Wake up the specified task */ -void -rpc_wake_up_task(struct rpc_task *task) +void rpc_wake_up_task(struct rpc_task *task) { - if (RPC_IS_RUNNING(task)) - return; - spin_lock_bh(&rpc_queue_lock); - __rpc_wake_up_task(task); - spin_unlock_bh(&rpc_queue_lock); + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) { + struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; 
+ + spin_lock_bh(&queue->lock); + __rpc_do_wake_up_task(task); + spin_unlock_bh(&queue->lock); + } + rpc_finish_wakeup(task); + } } /* @@ -494,11 +426,11 @@ static struct rpc_task * __rpc_wake_up_n */ q = &queue->tasks[queue->priority]; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, tk_list); + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); if (queue->cookie == task->tk_cookie) { if (--queue->nr) goto out; - list_move_tail(&task->tk_list, q); + list_move_tail(&task->u.tk_wait.list, q); } /* * Check if we need to switch queues. @@ -516,7 +448,7 @@ static struct rpc_task * __rpc_wake_up_n else q = q - 1; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, tk_list); + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); @@ -541,14 +473,14 @@ struct rpc_task * rpc_wake_up_next(struc struct rpc_task *task = NULL; dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&queue->lock); if (RPC_IS_PRIORITY(queue)) task = __rpc_wake_up_next_priority(queue); else { task_for_first(task, &queue->tasks[0]) __rpc_wake_up_task(task); } - spin_unlock_bh(&rpc_queue_lock); + spin_unlock_bh(&queue->lock); return task; } @@ -557,25 +489,25 @@ struct rpc_task * rpc_wake_up_next(struc * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * - * Grabs rpc_queue_lock + * Grabs queue->lock */ void rpc_wake_up(struct rpc_wait_queue *queue) { struct rpc_task *task; struct list_head *head; - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { while (!list_empty(head)) { - task = list_entry(head->next, struct rpc_task, tk_list); + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); __rpc_wake_up_task(task); } if (head == &queue->tasks[0]) break; head--; } - spin_unlock_bh(&rpc_queue_lock); + 
spin_unlock_bh(&queue->lock); } /** @@ -583,18 +515,18 @@ void rpc_wake_up(struct rpc_wait_queue * * @queue: rpc_wait_queue on which the tasks are sleeping * @status: status value to set * - * Grabs rpc_queue_lock + * Grabs queue->lock */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { struct list_head *head; struct rpc_task *task; - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { while (!list_empty(head)) { - task = list_entry(head->next, struct rpc_task, tk_list); + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); task->tk_status = status; __rpc_wake_up_task(task); } @@ -602,7 +534,7 @@ void rpc_wake_up_status(struct rpc_wait_ break; head--; } - spin_unlock_bh(&rpc_queue_lock); + spin_unlock_bh(&queue->lock); } /* @@ -626,22 +558,23 @@ __rpc_atrun(struct rpc_task *task) /* * This is the RPC `scheduler' (or rather, the finite state machine). */ -static int -__rpc_execute(struct rpc_task *task) +static int __rpc_execute(struct rpc_task *task) { int status = 0; dprintk("RPC: %4d rpc_execute flgs %x\n", task->tk_pid, task->tk_flags); - if (!RPC_IS_RUNNING(task)) { - printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n"); - return 0; - } + BUG_ON(RPC_IS_QUEUED(task)); restarted: while (1) { /* + * Garbage collection of pending timers... + */ + rpc_delete_timer(task); + + /* * Execute any pending callback. */ if (RPC_DO_CALLBACK(task)) { @@ -657,7 +590,9 @@ __rpc_execute(struct rpc_task *task) */ save_callback=task->tk_callback; task->tk_callback=NULL; + lock_kernel(); save_callback(task); + unlock_kernel(); } /* @@ -665,43 +600,35 @@ __rpc_execute(struct rpc_task *task) * tk_action may be NULL when the task has been killed * by someone else. */ - if (RPC_IS_RUNNING(task)) { - /* - * Garbage collection of pending timers... 
- */ - rpc_delete_timer(task); + if (!RPC_IS_QUEUED(task)) { if (!task->tk_action) break; + lock_kernel(); task->tk_action(task); - /* micro-optimization to avoid spinlock */ - if (RPC_IS_RUNNING(task)) - continue; + unlock_kernel(); } /* - * Check whether task is sleeping. + * Lockless check for whether task is sleeping or not. */ - spin_lock_bh(&rpc_queue_lock); - if (!RPC_IS_RUNNING(task)) { - rpc_set_sleeping(task); - if (RPC_IS_ASYNC(task)) { - spin_unlock_bh(&rpc_queue_lock); + if (!RPC_IS_QUEUED(task)) + continue; + rpc_clear_running(task); + if (RPC_IS_ASYNC(task)) { + /* Careful! we may have raced... */ + if (RPC_IS_QUEUED(task)) return 0; - } + if (rpc_test_and_set_running(task)) + return 0; + continue; } - spin_unlock_bh(&rpc_queue_lock); - if (!RPC_IS_SLEEPING(task)) - continue; /* sync task: sleep here */ dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); - if (current->pid == rpciod_pid) - printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); - if (RPC_TASK_UNINTERRUPTIBLE(task)) { - __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); + __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); } else { - __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status); + __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); /* * When a sync task receives a signal, it exits with * -ERESTARTSYS. 
In order to catch any callbacks that @@ -715,11 +642,14 @@ __rpc_execute(struct rpc_task *task) rpc_wake_up_task(task); } } + rpc_set_running(task); dprintk("RPC: %4d sync task resuming\n", task->tk_pid); } if (task->tk_exit) { + lock_kernel(); task->tk_exit(task); + unlock_kernel(); /* If tk_action is non-null, the user wants us to restart */ if (task->tk_action) { if (!RPC_ASSASSINATED(task)) { @@ -738,7 +668,6 @@ __rpc_execute(struct rpc_task *task) /* Release all resources associated with the task */ rpc_release_task(task); - return status; } @@ -754,57 +683,16 @@ __rpc_execute(struct rpc_task *task) int rpc_execute(struct rpc_task *task) { - int status = -EIO; - if (rpc_inhibit) { - printk(KERN_INFO "RPC: execution inhibited!\n"); - goto out_release; - } - - status = -EWOULDBLOCK; - if (task->tk_active) { - printk(KERN_ERR "RPC: active task was run twice!\n"); - goto out_err; - } + BUG_ON(task->tk_active); task->tk_active = 1; rpc_set_running(task); return __rpc_execute(task); - out_release: - rpc_release_task(task); - out_err: - return status; } -/* - * This is our own little scheduler for async RPC tasks. 
- */ -static void -__rpc_schedule(void) +static void rpc_async_schedule(void *arg) { - struct rpc_task *task; - int count = 0; - - dprintk("RPC: rpc_schedule enter\n"); - while (1) { - - task_for_first(task, &schedq.tasks[0]) { - __rpc_remove_wait_queue(task); - spin_unlock_bh(&rpc_queue_lock); - - __rpc_execute(task); - spin_lock_bh(&rpc_queue_lock); - } else { - break; - } - - if (++count >= 200 || need_resched()) { - count = 0; - spin_unlock_bh(&rpc_queue_lock); - schedule(); - spin_lock_bh(&rpc_queue_lock); - } - } - dprintk("RPC: rpc_schedule leave\n"); + __rpc_execute((struct rpc_task *)arg); } /* @@ -862,7 +750,6 @@ void rpc_init_task(struct rpc_task *task task->tk_client = clnt; task->tk_flags = flags; task->tk_exit = callback; - init_waitqueue_head(&task->tk_wait); if (current->uid != current->fsuid || current->gid != current->fsgid) task->tk_flags |= RPC_TASK_SETUID; @@ -873,7 +760,11 @@ void rpc_init_task(struct rpc_task *task task->tk_priority = RPC_PRIORITY_NORMAL; task->tk_cookie = (unsigned long)current; - INIT_LIST_HEAD(&task->tk_links); + + /* Initialize workqueue for async tasks */ + task->tk_workqueue = rpciod_workqueue; + if (!RPC_IS_ASYNC(task)) + init_waitqueue_head(&task->u.tk_wait.waitq); /* Add to global list of all tasks */ spin_lock(&rpc_sched_lock); @@ -944,8 +835,7 @@ cleanup: goto out; } -void -rpc_release_task(struct rpc_task *task) +void rpc_release_task(struct rpc_task *task) { dprintk("RPC: %4d release task\n", task->tk_pid); @@ -963,19 +853,9 @@ rpc_release_task(struct rpc_task *task) list_del(&task->tk_task); spin_unlock(&rpc_sched_lock); - /* Protect the execution below. 
*/ - spin_lock_bh(&rpc_queue_lock); - - /* Disable timer to prevent zombie wakeup */ - __rpc_disable_timer(task); - - /* Remove from any wait queue we're still on */ - __rpc_remove_wait_queue(task); - + BUG_ON (RPC_IS_QUEUED(task)); task->tk_active = 0; - spin_unlock_bh(&rpc_queue_lock); - /* Synchronously delete any running timer */ rpc_delete_timer(task); @@ -1005,10 +885,9 @@ rpc_release_task(struct rpc_task *task) * queue 'childq'. If so returns a pointer to the parent. * Upon failure returns NULL. * - * Caller must hold rpc_queue_lock + * Caller must hold childq.lock */ -static inline struct rpc_task * -rpc_find_parent(struct rpc_task *child) +static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) { struct rpc_task *task, *parent; struct list_head *le; @@ -1021,17 +900,16 @@ rpc_find_parent(struct rpc_task *child) return NULL; } -static void -rpc_child_exit(struct rpc_task *child) +static void rpc_child_exit(struct rpc_task *child) { struct rpc_task *parent; - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&childq.lock); if ((parent = rpc_find_parent(child)) != NULL) { parent->tk_status = child->tk_status; __rpc_wake_up_task(parent); } - spin_unlock_bh(&rpc_queue_lock); + spin_unlock_bh(&childq.lock); } /* @@ -1054,22 +932,20 @@ fail: return NULL; } -void -rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) +void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) { - spin_lock_bh(&rpc_queue_lock); + spin_lock_bh(&childq.lock); /* N.B. Is it possible for the child to have already finished? */ __rpc_sleep_on(&childq, task, func, NULL); rpc_schedule_run(child); - spin_unlock_bh(&rpc_queue_lock); + spin_unlock_bh(&childq.lock); } /* * Kill all tasks for the given client. * XXX: kill their descendants as well? 
*/ -void -rpc_killall_tasks(struct rpc_clnt *clnt) +void rpc_killall_tasks(struct rpc_clnt *clnt) { struct rpc_task *rovr; struct list_head *le; @@ -1091,93 +967,14 @@ rpc_killall_tasks(struct rpc_clnt *clnt) static DECLARE_MUTEX_LOCKED(rpciod_running); -static inline int -rpciod_task_pending(void) -{ - return !list_empty(&schedq.tasks[0]); -} - - -/* - * This is the rpciod kernel thread - */ -static int -rpciod(void *ptr) -{ - int rounds = 0; - - lock_kernel(); - /* - * Let our maker know we're running ... - */ - rpciod_pid = current->pid; - up(&rpciod_running); - - daemonize("rpciod"); - allow_signal(SIGKILL); - - dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); - spin_lock_bh(&rpc_queue_lock); - while (rpciod_users) { - DEFINE_WAIT(wait); - if (signalled()) { - spin_unlock_bh(&rpc_queue_lock); - rpciod_killall(); - flush_signals(current); - spin_lock_bh(&rpc_queue_lock); - } - __rpc_schedule(); - if (current->flags & PF_FREEZE) { - spin_unlock_bh(&rpc_queue_lock); - refrigerator(PF_FREEZE); - spin_lock_bh(&rpc_queue_lock); - } - - if (++rounds >= 64) { /* safeguard */ - spin_unlock_bh(&rpc_queue_lock); - schedule(); - rounds = 0; - spin_lock_bh(&rpc_queue_lock); - } - - dprintk("RPC: rpciod back to sleep\n"); - prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE); - if (!rpciod_task_pending() && !signalled()) { - spin_unlock_bh(&rpc_queue_lock); - schedule(); - rounds = 0; - spin_lock_bh(&rpc_queue_lock); - } - finish_wait(&rpciod_idle, &wait); - dprintk("RPC: switch to rpciod\n"); - } - spin_unlock_bh(&rpc_queue_lock); - - dprintk("RPC: rpciod shutdown commences\n"); - if (!list_empty(&all_tasks)) { - printk(KERN_ERR "rpciod: active tasks at shutdown?!\n"); - rpciod_killall(); - } - - dprintk("RPC: rpciod exiting\n"); - unlock_kernel(); - - rpciod_pid = 0; - complete_and_exit(&rpciod_killer, 0); - return 0; -} - -static void -rpciod_killall(void) +static void rpciod_killall(void) { unsigned long flags; while (!list_empty(&all_tasks)) { 
clear_thread_flag(TIF_SIGPENDING); rpc_killall_tasks(NULL); - spin_lock_bh(&rpc_queue_lock); - __rpc_schedule(); - spin_unlock_bh(&rpc_queue_lock); + flush_workqueue(rpciod_workqueue); if (!list_empty(&all_tasks)) { dprintk("rpciod_killall: waiting for tasks to exit\n"); yield(); @@ -1195,28 +992,30 @@ rpciod_killall(void) int rpciod_up(void) { + struct workqueue_struct *wq; int error = 0; down(&rpciod_sema); - dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users); + dprintk("rpciod_up: users %d\n", rpciod_users); rpciod_users++; - if (rpciod_pid) + if (rpciod_workqueue) goto out; /* * If there's no pid, we should be the first user. */ if (rpciod_users > 1) - printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users); + printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); /* * Create the rpciod thread and wait for it to start. */ - error = kernel_thread(rpciod, NULL, 0); - if (error < 0) { - printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error); + error = -ENOMEM; + wq = create_workqueue("rpciod"); + if (wq == NULL) { + printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); rpciod_users--; goto out; } - down(&rpciod_running); + rpciod_workqueue = wq; error = 0; out: up(&rpciod_sema); @@ -1227,20 +1026,21 @@ void rpciod_down(void) { down(&rpciod_sema); - dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users); + dprintk("rpciod_down sema %d\n", rpciod_users); if (rpciod_users) { if (--rpciod_users) goto out; } else - printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid); + printk(KERN_WARNING "rpciod_down: no users??\n"); - if (!rpciod_pid) { + if (!rpciod_workqueue) { dprintk("rpciod_down: Nothing to do!\n"); goto out; } + rpciod_killall(); - kill_proc(rpciod_pid, SIGKILL, 1); - wait_for_completion(&rpciod_killer); + destroy_workqueue(rpciod_workqueue); + rpciod_workqueue = NULL; out: up(&rpciod_sema); } @@ -1258,7 +1058,12 @@ void rpc_show_tasks(void) 
} printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " "-rpcwait -action- --exit--\n"); - alltask_for_each(t, le, &all_tasks) + alltask_for_each(t, le, &all_tasks) { + const char *rpc_waitq = "none"; + + if (RPC_IS_QUEUED(t)) + rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", t->tk_pid, (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), @@ -1266,8 +1071,9 @@ void rpc_show_tasks(void) t->tk_client, (t->tk_client ? t->tk_client->cl_prog : 0), t->tk_rqstp, t->tk_timeout, - rpc_qname(t->tk_rpcwait), + rpc_waitq, t->tk_action, t->tk_exit); + } spin_unlock(&rpc_sched_lock); } #endif diff -u --recursive --new-file --show-c-function linux-2.6.10-rc1/net/sunrpc/xprt.c linux-2.6.10-rc1-NFS_ALL/net/sunrpc/xprt.c --- linux-2.6.10-rc1/net/sunrpc/xprt.c 2004-10-27 14:11:06.000000000 -0400 +++ linux-2.6.10-rc1-NFS_ALL/net/sunrpc/xprt.c 2004-10-27 19:08:34.000000000 -0400 @@ -891,7 +891,8 @@ tcp_read_xid(struct rpc_xprt *xprt, skb_ xprt->tcp_flags &= ~XPRT_COPY_XID; xprt->tcp_flags |= XPRT_COPY_DATA; xprt->tcp_copied = 4; - dprintk("RPC: reading reply for XID %08x\n", xprt->tcp_xid); + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); tcp_check_recm(xprt); } @@ -911,7 +912,7 @@ tcp_read_request(struct rpc_xprt *xprt, if (!req) { xprt->tcp_flags &= ~XPRT_COPY_DATA; dprintk("RPC: XID %08x request not found!\n", - xprt->tcp_xid); + ntohl(xprt->tcp_xid)); spin_unlock(&xprt->sock_lock); return; } @@ -1101,7 +1102,7 @@ xprt_write_space(struct sock *sk) goto out; spin_lock_bh(&xprt->sock_lock); - if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending) + if (xprt->snd_task) rpc_wake_up_task(xprt->snd_task); spin_unlock_bh(&xprt->sock_lock); out: @@ -1360,7 +1361,7 @@ xprt_request_init(struct rpc_task *task, req->rq_xprt = xprt; req->rq_xid = xprt_alloc_xid(xprt); dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, - req, req->rq_xid); + req, ntohl(req->rq_xid)); } /*