From: Trond Myklebust Date: 1163453024 -0500 SUNRPC: Fix a potential race in rpc_wake_up_task() Use RCU to ensure that we can safely call rpc_finish_wakeup after we've called __rpc_do_wake_up_task. If not, there is a theoretical race, in which the rpc_task finishes executing, and gets freed first. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 8 +++++++- fs/nfs/write.c | 20 ++++++++++++++++---- include/linux/nfs_fs.h | 7 ++----- include/linux/sunrpc/sched.h | 2 ++ net/sunrpc/sched.c | 30 ++++++++++++++++++++---------- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c2e49c3..8b58bbf 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -65,13 +65,19 @@ struct nfs_read_data *nfs_readdata_alloc return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +static void nfs_readdata_rcu_free(struct rcu_head *head) { + struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } +static void nfs_readdata_free(struct nfs_read_data *rdata) +{ + call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free); +} + void nfs_readdata_release(void *data) { nfs_readdata_free(data); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 883dd4a..29d8820 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -102,13 +102,19 @@ struct nfs_write_data *nfs_commit_alloc( return p; } -void nfs_commit_free(struct nfs_write_data *p) +void nfs_commit_rcu_free(struct rcu_head *head) { + struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +void nfs_commit_free(struct nfs_write_data *wdata) +{ + call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); +} + struct nfs_write_data *nfs_writedata_alloc(size_t len) { unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -131,13 +137,19 @@ struct nfs_write_data *nfs_writedata_all return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +static void nfs_writedata_rcu_free(struct rcu_head *head) { + struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } +static void nfs_writedata_free(struct nfs_write_data *wdata) +{ + call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free); +} + void nfs_writedata_release(void *wdata) { nfs_writedata_free(wdata); @@ -258,7 +270,7 @@ static int nfs_writepage_sync(struct nfs io_error: nfs_end_data_update(inode); end_page_writeback(page); - nfs_writedata_free(wdata); + nfs_writedata_release(wdata); return written ? written : result; } @@ -1043,7 +1055,7 @@ out_bad: while (!list_empty(&list)) { data = list_entry(list.next, struct nfs_write_data, pages); list_del(&data->pages); - nfs_writedata_free(data); + nfs_writedata_release(data); } nfs_mark_request_dirty(req); nfs_clear_page_writeback(req); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 45228c1..abde286 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -428,11 +428,6 @@ extern int nfs_updatepage(struct file * extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); extern void nfs_writedata_release(void *); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -struct nfs_write_data *nfs_commit_alloc(void); -void nfs_commit_free(struct nfs_write_data *p); -#endif - /* * Try to write back everything synchronously (but check the * return value!) @@ -440,6 +435,8 @@ #endif extern int nfs_sync_inode_wait(struct inode *, unsigned long, unsigned int, int); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); +extern struct nfs_write_data *nfs_commit_alloc(void); +extern void nfs_commit_free(struct nfs_write_data *wdata); extern void nfs_commit_release(void *wdata); #else static inline int diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 9fdb8c9..14fc813 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -11,6 +11,7 @@ #define _LINUX_SUNRPC_SCHED_H_ #include #include +#include #include #include #include @@ -85,6 +86,7 @@ #endif union { struct work_struct tk_work; /* Async task work queue */ struct rpc_wait tk_wait; /* RPC wait */ + struct rcu_head tk_rcu; /* for task deletion */ } u; unsigned short tk_timeouts; /* maj timeouts */ diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 66d0136..6b808c0 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -427,16 +427,19 @@ __rpc_default_timer(struct rpc_task *tas */ void rpc_wake_up_task(struct rpc_task *task) { + rcu_read_lock_bh(); if (rpc_start_wakeup(task)) { if (RPC_IS_QUEUED(task)) { struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; - spin_lock_bh(&queue->lock); + /* Note: we're already in a bh-safe context */ + spin_lock(&queue->lock); __rpc_do_wake_up_task(task); - spin_unlock_bh(&queue->lock); + spin_unlock(&queue->lock); } rpc_finish_wakeup(task); } + rcu_read_unlock_bh(); } /* @@ -499,14 +502,16 @@ struct rpc_task * rpc_wake_up_next(struc struct rpc_task *task = NULL; dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); - spin_lock_bh(&queue->lock); + rcu_read_lock_bh(); + spin_lock(&queue->lock); if (RPC_IS_PRIORITY(queue)) task = __rpc_wake_up_next_priority(queue); else { task_for_first(task, &queue->tasks[0]) __rpc_wake_up_task(task); } - spin_unlock_bh(&queue->lock); + spin_unlock(&queue->lock); + rcu_read_unlock_bh(); return task; } @@ -522,7 +527,8 @@ void rpc_wake_up(struct rpc_wait_queue * struct rpc_task *task, *next; struct list_head *head; - spin_lock_bh(&queue->lock); + rcu_read_lock_bh(); + spin_lock(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) @@ -531,7 +537,8 @@ void rpc_wake_up(struct rpc_wait_queue * break; head--; } - spin_unlock_bh(&queue->lock); + spin_unlock(&queue->lock); + rcu_read_unlock_bh(); } /** @@ -546,7 +553,8 @@ void rpc_wake_up_status(struct rpc_wait_ struct rpc_task *task, *next; struct list_head *head; - spin_lock_bh(&queue->lock); + rcu_read_lock_bh(); + spin_lock(&queue->lock); head = &queue->tasks[queue->maxpriority]; for (;;) { list_for_each_entry_safe(task, next, head, u.tk_wait.list) { @@ -557,7 +565,8 @@ void rpc_wake_up_status(struct rpc_wait_ break; head--; } - spin_unlock_bh(&queue->lock); + spin_unlock(&queue->lock); + rcu_read_unlock_bh(); } static void __rpc_atrun(struct rpc_task *task) @@ -817,8 +826,9 @@ rpc_alloc_task(void) return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); } -static void rpc_free_task(struct rpc_task *task) +static void rpc_free_task(struct rcu_head *rcu) { + struct rpc_task *task = container_of(rcu, struct rpc_task, u.tk_rcu); dprintk("RPC: %4d freeing task\n", task->tk_pid); mempool_free(task, rpc_task_mempool); } @@ -872,7 +882,7 @@ void rpc_put_task(struct rpc_task *task) task->tk_client = NULL; } if (task->tk_flags & RPC_TASK_DYNAMIC) - rpc_free_task(task); + call_rcu_bh(&task->u.tk_rcu, rpc_free_task); if (tk_ops->rpc_release) tk_ops->rpc_release(calldata); }