[pnfs] [PATCH 2/2] NFS: Fix a pNFS deadlock due to violations of RPC asynchronous i/o rules

Trond Myklebust Trond.Myklebust at netapp.com
Mon Jul 14 23:39:25 EDT 2008


The current implementation of layoutget for files is a horror that does not
respect the basic rules for processes that can run under rpciod.

In particular it is quite illegal for an RPC callback function that is
running under rpciod to make synchronous rpc calls, as is evident in the
following trace:

INFO: task rpciod/3:2289 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
rpciod/3      D 0000000000000000     0  2289      2
 ffff810129803740 0000000000000046 0000000000000000 ffffffff80389a12
 0000000000000082 ffff810129c38bb0 ffff81012fb70a70 ffff810129c38f10
 00000003805707a0 0000000100321d61 ffffffffffffffff ffffffffffffffff
Call Trace:
 [<ffffffff80389a12>] serial8250_console_putchar+0x0/0xa9
 [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
 [<ffffffff882b3c59>] :sunrpc:rpc_wait_bit_killable+0x2a/0x31
 [<ffffffff80473d82>] __wait_on_bit+0x40/0x6f
 [<ffffffff882b3c2f>] :sunrpc:rpc_wait_bit_killable+0x0/0x31
 [<ffffffff80473e1d>] out_of_line_wait_on_bit+0x6c/0x78
 [<ffffffff80244fe2>] wake_bit_function+0x0/0x23
 [<ffffffff80242388>] __queue_work+0x23/0x33
 [<ffffffff882b40ce>] :sunrpc:__rpc_execute+0xee/0x21a
 [<ffffffff882adc62>] :sunrpc:rpc_run_task+0x4f/0x56
 [<ffffffff882adcfe>] :sunrpc:rpc_call_sync+0x3e/0x5b
 [<ffffffff882add5a>] :sunrpc:rpc_ping+0x3f/0x55
 [<ffffffff882aec51>] :sunrpc:rpc_create+0x552/0x5c7
 [<ffffffff882b3a48>] :sunrpc:rpc_put_task+0x6a/0x7e
 [<ffffffff88330c80>] :nfs:nfs_create_rpc_client+0x9c/0xe2
 [<ffffffff883312aa>] :nfs:nfs4_set_client+0x111/0x222
 [<ffffffff885b57f5>]
:nfslayoutdriver:nfs4_pnfs_device_item_get+0x6cb/0xa5c
 [<ffffffff80474e76>] _spin_lock_bh+0x9/0x1f
 [<ffffffff80286e32>] cache_alloc_refill+0x15a/0x1da
 [<ffffffff885b478b>] :nfslayoutdriver:filelayout_alloc_lseg+0x26e/0x3b6
 [<ffffffff8835e4ca>] :nfs:pnfs_get_layout_done+0x17f/0x43c
 [<ffffffff8834b790>] :nfs:nfs4_pnfs_layoutget_done+0x73/0xa1
 [<ffffffff882b3c8d>] :sunrpc:rpc_exit_task+0x2d/0x5e
 [<ffffffff882b405b>] :sunrpc:__rpc_execute+0x7b/0x21a
 [<ffffffff882b41fa>] :sunrpc:rpc_async_schedule+0x0/0xc
 [<ffffffff80241f79>] run_workqueue+0x7f/0x104
 [<ffffffff802427f2>] worker_thread+0xd5/0xe0
 [<ffffffff80244fb4>] autoremove_wake_function+0x0/0x2e
 [<ffffffff8024271d>] worker_thread+0x0/0xe0
 [<ffffffff80244e8a>] kthread+0x47/0x75
 [<ffffffff8022ef54>] schedule_tail+0x27/0x5c
 [<ffffffff8020ca38>] child_rip+0xa/0x12
 [<ffffffff80244e43>] kthread+0x0/0x75
 [<ffffffff8020ca2e>] child_rip+0x0/0x12

The fix is to move the post-rpc processing out of the asynchronous RPC
callback function, and into the main nfs4_proc_pnfs_layoutget() (after the
completion of the RPC call)..

Signed-off-by: Trond Myklebust <Trond.Myklebust at netapp.com>
---

 fs/nfs/nfs4proc.c |    5 ++++-
 fs/nfs/pnfs.c     |   53 +++++++++++++++++++++++++++++++++++------------------
 fs/nfs/pnfs.h     |    2 ++
 3 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 40f5eef..972ad85 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5186,8 +5186,11 @@ static int nfs4_proc_pnfs_layoutget(struct nfs4_pnfs_layoutget *lgp)
 		goto out;
 	}
 	status = nfs4_wait_for_completion_rpc_task(task);
-	if (status == 0)
+	if (status == 0) {
 		status = lgp->status;
+		if (status == 0)
+			status = pnfs_layout_process(NFS_I(ino)->current_layout, lgp);
+	}
 	rpc_put_task(task);
 out:
 	dprintk("<-- %s\n", __func__);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 79208e6..bb7e5de 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -978,10 +978,6 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
 
 	dprintk("-->%s\n", __func__);
 
-	spin_lock(&nfsi->lo_lock);
-
-	BUG_ON(nfsi->current_layout != lo);
-
 	lgp->status = rpc_status;
 	if (rpc_status) {
 		dprintk("%s: ERROR retrieving layout %d\n",
@@ -1023,14 +1019,42 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
 		goto get_out;
 	}
 
+	lgp->status = 0;
+
+get_out:
+	/* remember that get layout failed and don't try again */
+	if (lgp->status < 0)
+		nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
+
+	dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
+		__func__, lgp->status, nfsi->pnfs_layout_state, lseg);
+	return;
+}
+
+int
+pnfs_layout_process(struct pnfs_layout_type *lo,
+		    struct nfs4_pnfs_layoutget *lgp)
+{
+	struct nfs4_pnfs_layoutget_res *res = &lgp->res;
+	struct pnfs_layout_segment *lseg;
+	struct nfs_inode *nfsi = NFS_I(lo->inode);
+	int status = 0;
+
+
+	/* FIXME: This lock is just so wrong... */
+	spin_lock(&nfsi->lo_lock);
+
+	/* FIXME: WTF? */
+	BUG_ON(nfsi->current_layout != lo);
+
+
 	/* Inject layout blob into I/O device driver */
 	lseg = pnfs_inject_layout(lo, res, lgp->lsegpp != NULL);
 	if (IS_ERR(lseg)) {
-		lgp->status = PTR_ERR(lseg);
-		lseg = NULL;
+		status = PTR_ERR(lseg);
 		printk(KERN_ERR "%s: ERROR!  Could not inject layout (%d)\n",
-			__func__, lgp->status);
-		goto get_out;
+				__func__, status);
+		goto out;
 	}
 
 	if (res->return_on_close) {
@@ -1038,22 +1062,15 @@ pnfs_get_layout_done(struct pnfs_layout_type *lo,
 		if (!lo->roc_iomode)
 			lo->roc_iomode = IOMODE_ANY;
 	}
-	lgp->status = 0;
-
-get_out:
-	/* remember that get layout failed and don't try again */
-	if (lgp->status < 0)
-		nfsi->pnfs_layout_state |= NFS_INO_LAYOUT_FAILED;
-	spin_unlock(&nfsi->lo_lock);
 
 	/* Done processing layoutget. Set the layout stateid */
 	pnfs_set_layout_stateid(lo, &res->stateid);
 
-	dprintk("%s end (err:%d) state 0x%lx lseg %p\n",
-		__func__, lgp->status, nfsi->pnfs_layout_state, lseg);
 	if (lgp->lsegpp)
 		*lgp->lsegpp = lseg;
-	return;
+out:
+	spin_unlock(&nfsi->lo_lock);
+	return status;
 }
 
 size_t
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e9e9032..77d39c6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -57,6 +57,8 @@ void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
 ssize_t pnfs_file_write(struct file *, const char __user *, size_t, loff_t *);
 void pnfs_get_layout_done(struct pnfs_layout_type *,
 			  struct nfs4_pnfs_layoutget *, int);
+int pnfs_layout_process(struct pnfs_layout_type *lo,
+		    struct nfs4_pnfs_layoutget *lgp);
 void pnfs_layout_release(struct pnfs_layout_type *);
 int _pnfs_write_begin(struct inode *inode, struct page *page,
 		      loff_t pos, unsigned len, void **fsdata);



More information about the pNFS mailing list