[pnfs] [PATCH 2/2] NFS: Fix a pNFS deadlock due to violations of RPC asynchronous i/o rules
Benny Halevy
bhalevy at panasas.com
Tue Jul 15 05:31:04 EDT 2008
I applied this patch onto my 2.6.26 tree.
This required minor touch-ups since it seems like the version you
sent is bases off an older version of the tree.
Please see commit 4ce44a74aaa8a6a5b02e6d3d8cee9de50879a7d7
Benny
On Jul. 15, 2008, 6:39 +0300, Trond Myklebust <Trond.Myklebust at netapp.com> wrote:
> The current implementation of layoutget for files is a horror that does not
> respect the basic rules for processes that can run under rpciod.
>
> In particular it is quite illegal for an RPC callback function that is
> running under rpciod to make synchronous rpc calls, as is evident in the
> following trace:
>
> INFO: task rpciod/3:2289 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> rpciod/3 D 0000000000000000 0 2289 2
> ffff810129803740 0000000000000046 0000000000000000 ffffffff80389a12
> 0000000000000082 ffff810129c38bb0 ffff81012fb70a70 ffff810129c38f10
> 00000003805707a0 0000000100321d61 ffffffffffffffff ffffffffffffffff
> Call Trace:
> [<ffffffff80389a12>] serial8250_console_putchar+0x0/0xa9
> [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
> [<ffffffff882b3c59>] :sunrpc:rpc_wait_bit_killable+0x2a/0x31
> [<ffffffff80473d82>] __wait_on_bit+0x40/0x6f
> [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
> [<ffffffff80473e1d>] out_of_line_wait_on_bit+0x6c/0x78
> [<ffffffff80244fe2>] wake_bit_function+0x0/0x23
> [<ffffffff80242388>] __queue_work+0x23/0x33
> [<ffffffff882b40ce>] :sunrpc:__rpc_execute+0xee/0x21a
> [<ffffffff882adc62>] :sunrpc:rpc_run_task+0x4f/0x56
> [<ffffffff882adcfe>] :sunrpc:rpc_call_sync+0x3e/0x5b
> [<ffffffff882add5a>] :sunrpc:rpc_ping+0x3f/0x55
> [<ffffffff882aec51>] :sunrpc:rpc_create+0x552/0x5c7
> [<ffffffff882b3a48>] :sunrpc:rpc_put_task+0x6a/0x7e
> [<ffffffff88330c80>] :nfs:nfs_create_rpc_client+0x9c/0xe2
> [<ffffffff883312aa>] :nfs:nfs4_set_client+0x111/0x222
> [<ffffffff885b57f5>]
> :nfslayoutdriver:nfs4_pnfs_device_item_get+0x6cb/0xa5c
> [<ffffffff80474e76>] _spin_lock_bh+0x9/0x1f
> [<ffffffff80286e32>] cache_alloc_refill+0x15a/0x1da
> [<ffffffff885b478b>] :nfslayoutdriver:filelayout_alloc_lseg+0x26e/0x3b6
> [<ffffffff8835e4ca>] :nfs:pnfs_get_layout_done+0x17f/0x43c
> [<ffffffff8834b790>] :nfs:nfs4_pnfs_layoutget_done+0x73/0xa1
> [<ffffffff882b3c8d>] :sunrpc:rpc_exit_task+0x2d/0x5e
> [<ffffffff882b405b>] :sunrpc:__rpc_execute+0x7b/0x21a
> [<ffffffff882b41fa>] :sunrpc:rpc_async_schedule+0x0/0xc
> [<ffffffff80241f79>] run_workqueue+0x7f/0x104
> [<ffffffff802427f2>] worker_thread+0xd5/0xe0
> [<ffffffff80244fb4>] autoremove_wake_function+0x0/0x2e
> [<ffffffff8024271d>] worker_thread+0x0/0xe0
> [<ffffffff80244e8a>] kthread+0x47/0x75
> [<ffffffff8022ef54>] schedule_tail+0x27/0x5c
> [<ffffffff8020ca38>] child_rip+0xa/0x12
> [<ffffffff80244e43>] kthread+0x0/0x75
> [<ffffffff8020ca2e>] child_rip+0x0/0x12
>
> The fix is to move the post-rpc processing out of the asynchronous RPC
> callback function, and into the main nfs4_proc_pnfs_layoutget() (after the
> completion of the RPC call)..
>
> Signed-off-by: Trond Myklebust <Trond.Myklebust at netapp.com>
> ---
>
> fs/nfs/nfs4proc.c | 5 ++++-
> fs/nfs/pnfs.c | 53 +++++++++++++++++++++++++++++++++++------------------
> fs/nfs/pnfs.h | 2 ++
> 3 files changed, 41 insertions(+), 19 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 40f5eef..972ad85 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5186,8 +5186,11 @@ static int nfs4_proc_pnfs_layoutget(struct nfs4_pnfs_layoutget *lgp)
> goto out;
> }
> status = nfs4_wait_for_completion_rpc_task(task);
> - if (status == 0)
> + if (status == 0) {
> status = lgp->status;
> + if (status == 0)
> + status = pnfs_layout_process(NFS_I(ino)->current_layout, lgp);
> + }
> rpc_put_task(task);
> out:
> dprintk("<-- %s\n", __func__);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 79208e6..bb7e5de 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -978,10 +978,6 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
>
> dprintk("-->%s\n", __func__);
>
> - spin_lock(&nfsi->lo_lock);
> -
> - BUG_ON(nfsi->current_layout != lo);
> -
> lgp->status = rpc_status;
> if (rpc_status) {
> dprintk("%s: ERROR retrieving layout %d\n",
> @@ -1023,14 +1019,42 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
> goto get_out;
> }
>
> + lgp->status = 0;
> +
> +get_out:
> + /* remember that get layout failed and don't try again */
> + if (lgp->status < 0)
> + nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
> +
> + dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
> + __func__, lgp->status, nfsi->pnfs_layout_state, lseg);
> + return;
> +}
> +
> +int
> +pnfs_layout_process(struct pnfs_layout_type *lo,
> + struct nfs4_pnfs_layoutget *lgp)
> +{
> + struct nfs4_pnfs_layoutget_res *res = &lgp->res;
> + struct pnfs_layout_segment *lseg;
> + struct nfs_inode *nfsi = NFS_I(lo->inode);
> + int status = 0;
> +
> +
> + /* FIXME: This lock is just so wrong... */
> + spin_lock(&nfsi->lo_lock);
> +
> + /* FIXME: WTF? */
> + BUG_ON(nfsi->current_layout != lo);
> +
> +
> /* Inject layout blob into I/O device driver */
> lseg = pnfs_inject_layout(lo, res, lgp->lsegpp != NULL);
> if (IS_ERR(lseg)) {
> - lgp->status = PTR_ERR(lseg);
> - lseg = NULL;
> + status = PTR_ERR(lseg);
> printk(KERN_ERR "%s: ERROR! Could not inject layout (%d)\n",
> - __func__, lgp->status);
> - goto get_out;
> + __func__, status);
> + goto out;
> }
>
> if (res->return_on_close) {
> @@ -1038,22 +1062,15 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
> if (!lo->roc_iomode)
> lo->roc_iomode = IOMODE_ANY;
> }
> - lgp->status = 0;
> -
> -get_out:
> - /* remember that get layout failed and don't try again */
> - if (lgp->status < 0)
> - nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
> - spin_unlock(&nfsi->lo_lock);
>
> /* Done processing layoutget. Set the layout stateid */
> pnfs_set_layout_stateid(lo, &res->stateid);
>
> - dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
> - __func__, lgp->status, nfsi->pnfs_layout_state, lseg);
> if (lgp->lsegpp)
> *lgp->lsegpp = lseg;
> - return;
> +out:
> + spin_unlock(&nfsi->lo_lock);
> + return status;
> }
>
> size_t
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index e9e9032..77d39c6 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -57,6 +57,8 @@ void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
> ssize_t pnfs_file_write(struct file *, const char __user *, size_t, loff_t *);
> void pnfs_get_layout_done(struct pnfs_layout_type *,
> struct nfs4_pnfs_layoutget *, int);
> +int pnfs_layout_process(struct pnfs_layout_type *lo,
> + struct nfs4_pnfs_layoutget *lgp);
> void pnfs_layout_release(struct pnfs_layout_type *);
> int _pnfs_write_begin(struct inode *inode, struct page *page,
> loff_t pos, unsigned len, void **fsdata);
>
More information about the pNFS
mailing list