[pnfs] [PATCH 2/2] NFS: Fix a pNFS deadlock due to violations of RPC asynchronous i/o rules

Benny Halevy bhalevy at panasas.com
Tue Jul 15 05:31:04 EDT 2008


I applied this patch onto my 2.6.26 tree.
This required minor touch-ups since it seems like the version you
sent is bases off an older version of the tree.

Please see commit 4ce44a74aaa8a6a5b02e6d3d8cee9de50879a7d7

Benny

On Jul. 15, 2008, 6:39 +0300, Trond Myklebust <Trond.Myklebust at netapp.com> wrote:
> The current implementation of layoutget for files is a horror that does not
> respect the basic rules for processes that can run under rpciod.
> 
> In particular it is quite illegal for an RPC callback function that is
> running under rpciod to make synchronous rpc calls, as is evident in the
> following trace:
> 
> INFO: task rpciod/3:2289 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> rpciod/3      D 0000000000000000     0  2289      2
>  ffff810129803740 0000000000000046 0000000000000000 ffffffff80389a12
>  0000000000000082 ffff810129c38bb0 ffff81012fb70a70 ffff810129c38f10
>  00000003805707a0 0000000100321d61 ffffffffffffffff ffffffffffffffff
> Call Trace:
>  [<ffffffff80389a12>] serial8250_console_putchar+0x0/0xa9
>  [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
>  [<ffffffff882b3c59>] :sunrpc:rpc_wait_bit_killable+0x2a/0x31
>  [<ffffffff80473d82>] __wait_on_bit+0x40/0x6f
>  [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
>  [<ffffffff80473e1d>] out_of_line_wait_on_bit+0x6c/0x78
>  [<ffffffff80244fe2>] wake_bit_function+0x0/0x23
>  [<ffffffff80242388>] __queue_work+0x23/0x33
>  [<ffffffff882b40ce>] :sunrpc:__rpc_execute+0xee/0x21a
>  [<ffffffff882adc62>] :sunrpc:rpc_run_task+0x4f/0x56
>  [<ffffffff882adcfe>] :sunrpc:rpc_call_sync+0x3e/0x5b
>  [<ffffffff882add5a>] :sunrpc:rpc_ping+0x3f/0x55
>  [<ffffffff882aec51>] :sunrpc:rpc_create+0x552/0x5c7
>  [<ffffffff882b3a48>] :sunrpc:rpc_put_task+0x6a/0x7e
>  [<ffffffff88330c80>] :nfs:nfs_create_rpc_client+0x9c/0xe2
>  [<ffffffff883312aa>] :nfs:nfs4_set_client+0x111/0x222
>  [<ffffffff885b57f5>]
> :nfslayoutdriver:nfs4_pnfs_device_item_get+0x6cb/0xa5c
>  [<ffffffff80474e76>] _spin_lock_bh+0x9/0x1f
>  [<ffffffff80286e32>] cache_alloc_refill+0x15a/0x1da
>  [<ffffffff885b478b>] :nfslayoutdriver:filelayout_alloc_lseg+0x26e/0x3b6
>  [<ffffffff8835e4ca>] :nfs:pnfs_get_layout_done+0x17f/0x43c
>  [<ffffffff8834b790>] :nfs:nfs4_pnfs_layoutget_done+0x73/0xa1
>  [<ffffffff882b3c8d>] :sunrpc:rpc_exit_task+0x2d/0x5e
>  [<ffffffff882b405b>] :sunrpc:__rpc_execute+0x7b/0x21a
>  [<ffffffff882b41fa>] :sunrpc:rpc_async_schedule+0x0/0xc
>  [<ffffffff80241f79>] run_workqueue+0x7f/0x104
>  [<ffffffff802427f2>] worker_thread+0xd5/0xe0
>  [<ffffffff80244fb4>] autoremove_wake_function+0x0/0x2e
>  [<ffffffff8024271d>] worker_thread+0x0/0xe0
>  [<ffffffff80244e8a>] kthread+0x47/0x75
>  [<ffffffff8022ef54>] schedule_tail+0x27/0x5c
>  [<ffffffff8020ca38>] child_rip+0xa/0x12
>  [<ffffffff80244e43>] kthread+0x0/0x75
>  [<ffffffff8020ca2e>] child_rip+0x0/0x12
> 
> The fix is to move the post-rpc processing out of the asynchronous RPC
> callback function, and into the main nfs4_proc_pnfs_layoutget() (after the
> completion of the RPC call)..
> 
> Signed-off-by: Trond Myklebust <Trond.Myklebust at netapp.com>
> ---
> 
>  fs/nfs/nfs4proc.c |    5 ++++-
>  fs/nfs/pnfs.c     |   53 +++++++++++++++++++++++++++++++++++------------------
>  fs/nfs/pnfs.h     |    2 ++
>  3 files changed, 41 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 40f5eef..972ad85 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5186,8 +5186,11 @@ static int nfs4_proc_pnfs_layoutget(struct nfs4_pnfs_layoutget *lgp)
>  		goto out;
>  	}
>  	status = nfs4_wait_for_completion_rpc_task(task);
> -	if (status == 0)
> +	if (status == 0) {
>  		status = lgp->status;
> +		if (status == 0)
> +			status = pnfs_layout_process(NFS_I(ino)->current_layout, lgp);
> +	}
>  	rpc_put_task(task);
>  out:
>  	dprintk("<-- %s\n", __func__);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 79208e6..bb7e5de 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -978,10 +978,6 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
>  
>  	dprintk("-->%s\n", __func__);
>  
> -	spin_lock(&nfsi->lo_lock);
> -
> -	BUG_ON(nfsi->current_layout != lo);
> -
>  	lgp->status = rpc_status;
>  	if (rpc_status) {
>  		dprintk("%s: ERROR retrieving layout %d\n",
> @@ -1023,14 +1019,42 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
>  		goto get_out;
>  	}
>  
> +	lgp->status = 0;
> +
> +get_out:
> +	/* remember that get layout failed and don't try again */
> +	if (lgp->status < 0)
> +		nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
> +
> +	dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
> +		__func__, lgp->status, nfsi->pnfs_layout_state, lseg);
> +	return;
> +}
> +
> +int
> +pnfs_layout_process(struct pnfs_layout_type *lo,
> +		    struct nfs4_pnfs_layoutget *lgp)
> +{
> +	struct nfs4_pnfs_layoutget_res *res = &lgp->res;
> +	struct pnfs_layout_segment *lseg;
> +	struct nfs_inode *nfsi = NFS_I(lo->inode);
> +	int status = 0;
> +
> +
> +	/* FIXME: This lock is just so wrong... */
> +	spin_lock(&nfsi->lo_lock);
> +
> +	/* FIXME: WTF? */
> +	BUG_ON(nfsi->current_layout != lo);
> +
> +
>  	/* Inject layout blob into I/O device driver */
>  	lseg = pnfs_inject_layout(lo, res, lgp->lsegpp != NULL);
>  	if (IS_ERR(lseg)) {
> -		lgp->status = PTR_ERR(lseg);
> -		lseg = NULL;
> +		status = PTR_ERR(lseg);
>  		printk(KERN_ERR "%s: ERROR!  Could not inject layout (%d)\n",
> -			__func__, lgp->status);
> -		goto get_out;
> +				__func__, status);
> +		goto out;
>  	}
>  
>  	if (res->return_on_close) {
> @@ -1038,22 +1062,15 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
>  		if (!lo->roc_iomode)
>  			lo->roc_iomode = IOMODE_ANY;
>  	}
> -	lgp->status = 0;
> -
> -get_out:
> -	/* remember that get layout failed and don't try again */
> -	if (lgp->status < 0)
> -		nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
> -	spin_unlock(&nfsi->lo_lock);
>  
>  	/* Done processing layoutget. Set the layout stateid */
>  	pnfs_set_layout_stateid(lo, &res->stateid);
>  
> -	dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
> -		__func__, lgp->status, nfsi->pnfs_layout_state, lseg);
>  	if (lgp->lsegpp)
>  		*lgp->lsegpp = lseg;
> -	return;
> +out:
> +	spin_unlock(&nfsi->lo_lock);
> +	return status;
>  }
>  
>  size_t
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index e9e9032..77d39c6 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -57,6 +57,8 @@ void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
>  ssize_t pnfs_file_write(struct file *, const char __user *, size_t, loff_t *);
>  void pnfs_get_layout_done(struct pnfs_layout_type *,
>  			  struct nfs4_pnfs_layoutget *, int);
> +int pnfs_layout_process(struct pnfs_layout_type *lo,
> +		    struct nfs4_pnfs_layoutget *lgp);
>  void pnfs_layout_release(struct pnfs_layout_type *);
>  int _pnfs_write_begin(struct inode *inode, struct page *page,
>  		      loff_t pos, unsigned len, void **fsdata);
> 



More information about the pNFS mailing list