Subject: [PATCH] NFS: break single global lock into per-inode lock Description: Break the nfs_wreq_lock into per-inode locks. This helps prevent a heavy read or write workload on one file from interfering with workloads against other NFS files. Note that there is still some serialization due to the big kernel lock. Test-plan: Run multi-threaded multi-file tests on large-scale SMP and NUMA clients. Look for performance regressions or stability problems, such as hanging mount points or applications, oopses, or system deadlocks. fs/nfs/inode.c | 1 fs/nfs/pagelist.c | 15 +-------- fs/nfs/write.c | 65 ++++++++++++++++++++------------------- include/linux/nfs_fs.h | 1 include/linux/nfs_page.h | 8 +--- 5 files changed, 42 insertions(+), 48 deletions(-) Applies-to: 2.6.8-rc3 Signed-off-by: Chuck Lever Generated-at: Mon, 09 Aug 2004 22:29:55 -0400 diff -X /home/cel/src/linux/dont-diff -Naurp 03-nfs_put_super/fs/nfs/inode.c 04-nfsi-req-lock/fs/nfs/inode.c --- 03-nfs_put_super/fs/nfs/inode.c 2004-08-06 18:24:35.864307000 -0400 +++ 04-nfsi-req-lock/fs/nfs/inode.c 2004-08-06 18:24:52.805376000 -0400 @@ -1737,6 +1737,7 @@ static void init_once(void * foo, kmem_c if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { inode_init_once(&nfsi->vfs_inode); + spin_lock_init(&nfsi->req_lock); INIT_LIST_HEAD(&nfsi->dirty); INIT_LIST_HEAD(&nfsi->commit); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); diff -X /home/cel/src/linux/dont-diff -Naurp 03-nfs_put_super/fs/nfs/pagelist.c 04-nfsi-req-lock/fs/nfs/pagelist.c --- 03-nfs_put_super/fs/nfs/pagelist.c 2004-06-16 01:20:03.000000000 -0400 +++ 04-nfsi-req-lock/fs/nfs/pagelist.c 2004-08-06 17:49:22.535633000 -0400 @@ -21,11 +21,6 @@ #define NFS_PARANOIA 1 -/* - * Spinlock - */ -spinlock_t nfs_wreq_lock = SPIN_LOCK_UNLOCKED; - static kmem_cache_t *nfs_page_cachep; static inline struct nfs_page * @@ -95,7 +90,7 @@ nfs_create_request(struct file *file, st req->wb_pgbase = offset; req->wb_bytes = count; req->wb_inode = inode; - req->wb_count = 1; + atomic_set(&req->wb_count, 1); server->rpc_ops->request_init(req, file); return req; @@ -137,12 +132,8 @@ void nfs_clear_request(struct nfs_page * void nfs_release_request(struct nfs_page *req) { - spin_lock(&nfs_wreq_lock); - if (--req->wb_count) { - spin_unlock(&nfs_wreq_lock); + if (!atomic_dec_and_test(&req->wb_count)) return; - } - spin_unlock(&nfs_wreq_lock); #ifdef NFS_PARANOIA BUG_ON (!list_empty(&req->wb_list)); @@ -254,7 +245,7 @@ nfs_coalesce_requests(struct list_head * * If the number of requests is set to 0, the entire address_space * starting at index idx_start, is scanned. * The requests are *not* checked to ensure that they form a contiguous set. - * You must be holding the nfs_wreq_lock when calling this function + * You must be holding the inode's req_lock when calling this function */ int nfs_scan_list(struct list_head *head, struct list_head *dst, diff -X /home/cel/src/linux/dont-diff -Naurp 03-nfs_put_super/fs/nfs/write.c 04-nfsi-req-lock/fs/nfs/write.c --- 03-nfs_put_super/fs/nfs/write.c 2004-06-16 01:19:43.000000000 -0400 +++ 04-nfsi-req-lock/fs/nfs/write.c 2004-08-06 17:49:22.541627000 -0400 @@ -389,7 +389,7 @@ nfs_inode_add_request(struct inode *inod nfs_begin_data_update(inode); } nfsi->npages++; - req->wb_count++; + atomic_inc(&req->wb_count); return 0; } @@ -399,21 +399,20 @@ nfs_inode_add_request(struct inode *inod static void nfs_inode_remove_request(struct nfs_page *req) { - struct nfs_inode *nfsi; - struct inode *inode; + struct inode *inode = req->wb_inode; + struct nfs_inode *nfsi = NFS_I(inode); BUG_ON (!NFS_WBACK_BUSY(req)); - spin_lock(&nfs_wreq_lock); - inode = req->wb_inode; - nfsi = NFS_I(inode); + + spin_lock(&nfsi->req_lock); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; if (!nfsi->npages) { - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); nfs_end_data_update_defer(inode); iput(inode); } else - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); nfs_clear_request(req); nfs_release_request(req); } @@ -429,7 +428,7 @@ _nfs_find_request(struct inode *inode, u req = (struct nfs_page*)radix_tree_lookup(&nfsi->nfs_page_tree, index); if (req) - req->wb_count++; + atomic_inc(&req->wb_count); return req; } @@ -437,10 +436,11 @@ static struct nfs_page * nfs_find_request(struct inode *inode, unsigned long index) { struct nfs_page *req; + struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); req = _nfs_find_request(inode, index); - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); return req; } @@ -453,10 +453,10 @@ nfs_mark_request_dirty(struct nfs_page * struct inode *inode = req->wb_inode; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); nfs_list_add_request(req, &nfsi->dirty); nfsi->ndirty++; - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); inc_page_state(nr_dirty); mark_inode_dirty(inode); } @@ -481,10 +481,10 @@ nfs_mark_request_commit(struct nfs_page struct inode *inode = req->wb_inode; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); nfs_list_add_request(req, &nfsi->commit); nfsi->ncommit++; - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); inc_page_state(nr_unstable); mark_inode_dirty(inode); } @@ -509,7 +509,7 @@ nfs_wait_on_requests(struct inode *inode else idx_end = idx_start + npages - 1; - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); next = idx_start; while (radix_tree_gang_lookup(&nfsi->nfs_page_tree, (void **)&req, next, 1)) { if (req->wb_index > idx_end) @@ -519,16 +519,16 @@ nfs_wait_on_requests(struct inode *inode if (!NFS_WBACK_BUSY(req)) continue; - req->wb_count++; - spin_unlock(&nfs_wreq_lock); + atomic_inc(&req->wb_count); + spin_unlock(&nfsi->req_lock); error = nfs_wait_on_request(req); nfs_release_request(req); if (error < 0) return error; - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); res++; } - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); return res; } @@ -624,6 +624,7 @@ nfs_update_request(struct file* file, st unsigned int offset, unsigned int bytes) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req, *new = NULL; unsigned long rqend, end; @@ -635,19 +636,19 @@ nfs_update_request(struct file* file, st /* Loop over all inode entries and see if we find * A request for the page we wish to update */ - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); req = _nfs_find_request(inode, page->index); if (req) { if (!nfs_lock_request_dontget(req)) { int error; - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); error = nfs_wait_on_request(req); nfs_release_request(req); if (error < 0) return ERR_PTR(error); continue; } - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); if (new) nfs_release_request(new); break; @@ -658,15 +659,15 @@ nfs_update_request(struct file* file, st nfs_lock_request_dontget(new); error = nfs_inode_add_request(inode, new); if (error) { - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); nfs_unlock_request(new); return ERR_PTR(error); } - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); nfs_mark_request_dirty(new); return new; } - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); new = nfs_create_request(file, inode, page, offset, bytes); if (IS_ERR(new)) @@ -1347,13 +1348,14 @@ nfs_commit_done(struct rpc_task *task) int nfs_flush_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { + struct nfs_inode *nfsi = NFS_I(inode); LIST_HEAD(head); int res, error = 0; - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); res = nfs_scan_dirty(inode, &head, idx_start, npages); - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); if (res) error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how); if (error < 0) @@ -1365,18 +1367,19 @@ int nfs_flush_inode(struct inode *inode, int nfs_commit_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { + struct nfs_inode *nfsi = NFS_I(inode); LIST_HEAD(head); int res, error = 0; - spin_lock(&nfs_wreq_lock); + spin_lock(&nfsi->req_lock); res = nfs_scan_commit(inode, &head, idx_start, npages); if (res) { res += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); error = nfs_commit_list(&head, how); } else - spin_unlock(&nfs_wreq_lock); + spin_unlock(&nfsi->req_lock); if (error < 0) return error; return res; diff -X /home/cel/src/linux/dont-diff -Naurp 03-nfs_put_super/include/linux/nfs_fs.h 04-nfsi-req-lock/include/linux/nfs_fs.h --- 03-nfs_put_super/include/linux/nfs_fs.h 2004-06-16 01:19:13.000000000 -0400 +++ 04-nfsi-req-lock/include/linux/nfs_fs.h 2004-08-06 17:49:22.544625000 -0400 @@ -148,6 +148,7 @@ struct nfs_inode { /* * This is the list of dirty unwritten pages. */ + spinlock_t req_lock; struct list_head dirty; struct list_head commit; struct radix_tree_root nfs_page_tree; diff -X /home/cel/src/linux/dont-diff -Naurp 03-nfs_put_super/include/linux/nfs_page.h 04-nfsi-req-lock/include/linux/nfs_page.h --- 03-nfs_put_super/include/linux/nfs_page.h 2004-06-16 01:18:57.000000000 -0400 +++ 04-nfsi-req-lock/include/linux/nfs_page.h 2004-08-06 17:49:22.548624000 -0400 @@ -40,8 +40,8 @@ struct nfs_page { unsigned long wb_index; /* Offset >> PAGE_CACHE_SHIFT */ unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ wb_pgbase, /* Start of page data */ - wb_bytes, /* Length of request */ - wb_count; /* reference count */ + wb_bytes; /* Length of request */ + atomic_t wb_count; /* reference count */ unsigned long wb_flags; struct nfs_writeverf wb_verf; /* Commit cookie */ }; @@ -65,8 +65,6 @@ extern int nfs_coalesce_requests(struct unsigned int); extern int nfs_wait_on_request(struct nfs_page *); -extern spinlock_t nfs_wreq_lock; - /* * Lock the page of an asynchronous request without incrementing the wb_count */ @@ -86,7 +84,7 @@ nfs_lock_request(struct nfs_page *req) { if (test_and_set_bit(PG_BUSY, &req->wb_flags)) return 0; - req->wb_count++; + atomic_inc(&req->wb_count); return 1; }