From: Trond Myklebust Date: Mon, 2 Apr 2007 19:29:52 -0400 NFS: Fix a race when doing NFS write coalescing Currently we do write coalescing in a very inefficient manner: one pass in generic_writepages() in order to lock the pages for writing, then one pass in nfs_flush_mapping() and/or nfs_sync_mapping_wait() in order to gather the locked pages for coalescing into RPC requests of size "wsize". In fact, it turns out there is actually a deadlock possible here since we only start I/O on the second pass. If the user signals the process while we're in nfs_sync_mapping_wait(), for instance, then we may exit before starting I/O on all the requests that have been queued up. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 92 ---------------------------- fs/nfs/write.c | 149 ++++++++++++++------------------------------- include/linux/nfs_page.h | 8 -- include/linux/writeback.h | 2 + 4 files changed, 50 insertions(+), 201 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 094537d..ea1a85d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -17,7 +17,6 @@ #include #include #include -#include #define NFS_PARANOIA 1 @@ -354,25 +353,6 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, } /** - * nfs_pageio_add_list - Split coalesced requests out from a list. - * @desc: destination io descriptor - * @head: source list - * - * Moves a maximum of 'nmax' elements from one list to another. - * The elements are checked to ensure that they form a contiguous set - * of pages, and that the RPC credentials are the same. - */ -void nfs_pageio_add_list(struct nfs_pageio_descriptor *desc, - struct list_head *head) -{ - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - if (!nfs_pageio_add_request(desc, req)) - break; - } -} - -/** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ @@ -383,78 +363,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) #define NFS_SCAN_MAXENTRIES 16 /** - * nfs_scan_dirty - Scan the radix tree for dirty requests - * @mapping: pointer to address space - * @wbc: writeback_control structure - * @dst: Destination list - * - * Moves elements from one of the inode request lists. - * If the number of requests is set to 0, the entire address_space - * starting at index idx_start, is scanned. - * The requests are *not* checked to ensure that they form a contiguous set. - * You must be holding the inode's req_lock when calling this function - */ -long nfs_scan_dirty(struct address_space *mapping, - struct writeback_control *wbc, - struct list_head *dst) -{ - struct nfs_inode *nfsi = NFS_I(mapping->host); - struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; - struct nfs_page *req; - pgoff_t idx_start, idx_end; - long res = 0; - int found, i; - - if (nfsi->ndirty == 0) - return 0; - if (wbc->range_cyclic) { - idx_start = 0; - idx_end = ULONG_MAX; - } else if (wbc->range_end == 0) { - idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; - idx_end = ULONG_MAX; - } else { - idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; - idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - for (;;) { - unsigned int toscan = NFS_SCAN_MAXENTRIES; - - found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, - (void **)&pgvec[0], idx_start, toscan, - NFS_PAGE_TAG_DIRTY); - - /* Did we make progress? */ - if (found <= 0) - break; - - for (i = 0; i < found; i++) { - req = pgvec[i]; - if (!wbc->range_cyclic && req->wb_index > idx_end) - goto out; - - /* Try to lock request and mark it for writeback */ - if (!nfs_set_page_writeback_locked(req)) - goto next; - radix_tree_tag_clear(&nfsi->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_DIRTY); - nfsi->ndirty--; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - res++; - if (res == LONG_MAX) - goto out; -next: - idx_start = req->wb_index + 1; - } - } -out: - WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)); - return res; -} - -/** * nfs_scan_list - Scan a list for matching requests * @nfsi: NFS inode * @head: One of the NFS inode request lists diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e533d01..a8d1955 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -38,7 +38,8 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*, struct page *, unsigned int, unsigned int); -static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, + struct inode *inode, int ioflags); static const struct rpc_call_ops nfs_write_partial_ops; static const struct rpc_call_ops nfs_write_full_ops; static const struct rpc_call_ops nfs_commit_ops; @@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) - return FLUSH_HIGHPRI; + return FLUSH_HIGHPRI | FLUSH_STABLE; if (wbc->for_kupdate) return FLUSH_LOWPRI; return 0; @@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page) * was not tagged. * May also return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_mark_flush(struct page *page) +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page) { struct nfs_page *req; struct nfs_inode *nfsi = NFS_I(page->mapping->host); @@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page) * request as dirty (in which case we don't care). */ spin_unlock(req_lock); + /* Prevent deadlock! */ + nfs_pageio_complete(pgio); ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) @@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page) /* This request is marked for commit */ spin_unlock(req_lock); nfs_unlock_request(req); + nfs_pageio_complete(pgio); return 1; } - if (nfs_set_page_writeback(page) == 0) { - nfs_list_remove_request(req); - /* add the request to the inode's dirty list. */ - radix_tree_tag_set(&nfsi->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_DIRTY); - nfs_list_add_request(req, &nfsi->dirty); - nfsi->ndirty++; - spin_unlock(req_lock); - __mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES); - } else + if (nfs_set_page_writeback(page) != 0) { spin_unlock(req_lock); + BUG(); + } + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, + NFS_PAGE_TAG_WRITEBACK); ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); - nfs_unlock_request(req); + spin_unlock(req_lock); + nfs_pageio_add_request(pgio, req); return ret; } @@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page) */ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) { + struct nfs_pageio_descriptor mypgio, *pgio; struct nfs_open_context *ctx; struct inode *inode = page->mapping->host; unsigned offset; @@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - err = nfs_page_mark_flush(page); + if (wbc->for_writepages) + pgio = wbc->fs_private; + else { + nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc)); + pgio = &mypgio; + } + + err = nfs_page_async_flush(pgio, page); if (err <= 0) goto out; err = 0; @@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc put_nfs_open_context(ctx); if (err != 0) goto out; - err = nfs_page_mark_flush(page); + err = nfs_page_async_flush(pgio, page); if (err > 0) err = 0; out: if (!wbc->for_writepages) - nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc)); + nfs_pageio_complete(pgio); return err; } @@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct nfs_pageio_descriptor pgio; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); + wbc->fs_private = &pgio; err = generic_writepages(mapping, wbc); + nfs_pageio_complete(&pgio); if (err) return err; - err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); - if (err < 0) - goto out; - nfs_add_stats(inode, NFSIOS_WRITEPAGES, err); - err = 0; -out: - return err; + if (pgio.pg_error) + return pgio.pg_error; + return 0; } /* @@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st return res; } -static void nfs_cancel_dirty_list(struct list_head *head) -{ - struct nfs_page *req; - while(!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - nfs_clear_page_writeback(req); - } -} - static void nfs_cancel_commit_list(struct list_head *head) { struct nfs_page *req; @@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou return -ENOMEM; } -static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how) +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) { - struct nfs_pageio_descriptor desc; - int wpages = NFS_SERVER(inode)->wpages; int wsize = NFS_SERVER(inode)->wsize; - /* For single writes, FLUSH_STABLE is more efficient */ - if (npages <= wpages && npages == NFS_I(inode)->npages - && nfs_list_entry(head->next)->wb_bytes <= wsize) - how |= FLUSH_STABLE; - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&desc, inode, nfs_flush_multi, wsize, how); + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else - nfs_pageio_init(&desc, inode, nfs_flush_one, wsize, how); - nfs_pageio_add_list(&desc, head); - nfs_pageio_complete(&desc); - if (desc.pg_error == 0) - return 0; - while (!list_empty(head)) { - struct nfs_page *req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_writeback(req); - } - return desc.pg_error; + nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); } /* @@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_call_done = nfs_commit_done, .rpc_release = nfs_commit_release, }; -#else -static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) -{ - return 0; -} -#endif - -static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) -{ - struct nfs_inode *nfsi = NFS_I(mapping->host); - LIST_HEAD(head); - long res; - - spin_lock(&nfsi->req_lock); - res = nfs_scan_dirty(mapping, wbc, &head); - spin_unlock(&nfsi->req_lock); - if (res) { - int error = nfs_flush_list(mapping->host, &head, res, how); - if (error < 0) - return error; - } - return res; -} -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) int nfs_commit_inode(struct inode *inode, int how) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } +#else +static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) +{ + return 0; +} #endif long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) @@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr ret = nfs_wait_on_requests_locked(inode, idx_start, npages); if (ret != 0) continue; - pages = nfs_scan_dirty(mapping, wbc, &head); - if (pages != 0) { - spin_unlock(&nfsi->req_lock); - if (how & FLUSH_INVALIDATE) { - nfs_cancel_dirty_list(&head); - ret = pages; - } else - ret = nfs_flush_list(inode, &head, pages, how); - spin_lock(&nfsi->req_lock); - continue; - } - if (wbc->pages_skipped != 0) - continue; if (nocommit) break; pages = nfs_scan_commit(inode, &head, idx_start, npages); @@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode) }; int ret; - ret = generic_writepages(mapping, &wbc); + ret = nfs_writepages(mapping, &wbc); if (ret < 0) goto out; ret = nfs_sync_mapping_wait(mapping, &wbc, 0); @@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo }; int ret; - if (!(how & FLUSH_NOWRITEPAGE)) { - ret = generic_writepages(mapping, &wbc); - if (ret < 0) - goto out; - } + ret = nfs_writepages(mapping, &wbc); + if (ret < 0) + goto out; ret = nfs_sync_mapping_wait(mapping, &wbc, how); if (ret >= 0) return 0; @@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) int ret; BUG_ON(!PageLocked(page)); - if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) { + if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index b8b7bca..e556e57 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -21,8 +21,7 @@ /* * Valid flags for the radix tree */ -#define NFS_PAGE_TAG_DIRTY 0 -#define NFS_PAGE_TAG_WRITEBACK 1 +#define NFS_PAGE_TAG_WRITEBACK 0 /* * Valid flags for a dirty buffer @@ -72,9 +71,6 @@ extern void nfs_clear_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req); -extern long nfs_scan_dirty(struct address_space *mapping, - struct writeback_control *wbc, - struct list_head *dst); extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, struct list_head *dst, unsigned long idx_start, unsigned int npages); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, @@ -84,8 +80,6 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); -extern void nfs_pageio_add_list(struct nfs_pageio_descriptor *, - struct list_head *); extern void nfs_pageio_complete(struct nfs_pageio_descriptor *desc); extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0c78f7f..daa6c12 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -59,6 +59,8 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ + + void *fs_private; /* For use by ->writepages() */ }; /*