[pnfs] [PATCH 18/18] pnfs client flush_one io operation
William A. (Andy) Adamson
andros at citi.umich.edu
Tue Jan 8 14:06:43 EST 2008
This patch is incomplete - I neither free nor bump the refcounts on
wb_private (thanks, Fred).
I'll send an update.
-->Andy
On 1/7/08, andros at umich.edu <andros at umich.edu> wrote:
>
> From: Andy Adamson <andros at umich.edu>
>
> Add a flush_one io_ops entry point for pNFS.
> Filelayout uses the flush routine to set up data server I/O
> in the NFS page cache.
>
> Add a kref to struct nfs4_pnfs_dserver, which is passed in
> nfs_page->wb_private
> from filelayout_flush_one to filelayout_write_pagelist
> so that it does not have to be looked up again.
>
> Signed-off-by: Andy Adamson <andros at umich.edu>
> ---
> fs/nfs/nfs4filelayout.c | 226
> ++++++++++++++++++++++++++++++++++++++------
> fs/nfs/nfs4filelayout.h | 3 +-
> fs/nfs/nfs4filelayoutdev.c | 2 +-
> fs/nfs/pnfs.c | 17 ++++
> fs/nfs/pnfs.h | 1 +
> fs/nfs/write.c | 7 +-
> include/linux/nfs4_pnfs.h | 2 +
> include/linux/nfs_page.h | 1 +
> 8 files changed, 225 insertions(+), 34 deletions(-)
>
> diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
> index d8876a9..3a0a6fa 100644
> --- a/fs/nfs/nfs4filelayout.c
> +++ b/fs/nfs/nfs4filelayout.c
> @@ -76,10 +76,14 @@ extern void nfs_initiate_commit(struct nfs_write_data
> *, struct rpc_clnt *, int)
> extern void nfs_read_validate(struct rpc_task *task, void *calldata);
> extern void nfs_readdata_release(void *data);
> extern void nfs_write_validate(struct rpc_task *task, void *calldata);
> +extern int nfs_flush_one(struct inode *, struct list_head *, unsigned
> int, size_t, int);
>
> /* Callback operations to the pNFS client */
> struct pnfs_client_operations *pnfs_callback_ops;
>
> +/* Forward declaration */
> +ssize_t filelayout_get_stripesize(struct pnfs_layout_type*, struct
> inode*);
> +
> /* Initialize a mountpoint by retrieving the list of
> * available devices for it.
> * Return the pnfs_mount_type structure so the
> @@ -340,7 +344,167 @@ ssize_t filelayout_read_pagelist(
> return status;
> }
>
> -/* Perform sync or async writes.
> +void
> +print_ds(struct nfs4_pnfs_ds *ds)
> +{
> + dprintk(" ds->ds_ip_addr %x\n", htonl(ds->ds_ip_addr));
> + dprintk(" ds->ds_port %hu\n", ntohs(ds->ds_port));
> + dprintk(" ds->ds_clp %p\n", ds->ds_clp);
> + dprintk(" ds->ds_count %d\n", atomic_read(&ds->ds_count));
> +}
> +
> +static struct nfs4_pnfs_dserver *
> +filelayout_create_dserver(void)
> +{
> + struct nfs4_pnfs_dserver *local;
> +
> + dprintk("--> %s\n", __func__);
> + local = kzalloc(sizeof(*local), GFP_KERNEL);
> + if (!local) {
> + return NULL;
> + }
> + kref_init(&local->ref);
> + return local;
> +}
> +
> +static void filelayout_free_dserver(struct kref *kref)
> +{
> + struct nfs4_pnfs_dserver *dserver;
> + dserver = container_of(kref, struct nfs4_pnfs_dserver, ref);
> +
> + dprintk("--> %s dserver %p\n", __func__, dserver);
> + kfree(dserver);
> +}
> +
> +static void filelayout_release_dserver(struct nfs4_pnfs_dserver *dserver)
> +{
> + dprintk("--> %s dserver %p\n", __func__, dserver);
> + kref_put(&dserver->ref, filelayout_free_dserver);
> +}
> +
> +static void filelayout_get_dserver(struct nfs4_pnfs_dserver *dserver)
> +{
> + dprintk("--> %s\n", __func__);
> + kref_get(&dserver->ref);
> +}
> +
> +/*
> +* feed nfs_flush_one with per data server pages.
> +*
> +* Assume stripesz >= PAGE_SIZE.
> +* TODO: If stripesz < PAGE_SIZE, use i/o through MDS
> +*
> +*/
> +int filelayout_flush_one(struct inode *inode, struct list_head *head,
> unsigned int npages, size_t count, int how)
> +{
> + struct pnfs_layout_type *ltype = NFS_I(inode)->current_layout;
> + struct nfs4_filelayout *nfslay = ltype->layoutid;
> + struct nfs4_pnfs_dserver *dserver = NULL;
> + struct nfs4_pnfs_ds *ds = NULL; /* current stripe data server */
> + struct nfs_page* req;
> + loff_t dsoffset = 0;
> + size_t stripesz, reqcount, dstotal = 0;
> + struct list_head *dslist;
> + int status = -ENOMEM, use_ds = 0, ndspages = 0;
> +
> + dprintk("--> %s npages %d, count %Zd, ltype %p nfslay %p\n",
> __func__, npages, count, ltype, nfslay);
> +
> + dslist = kmalloc(sizeof(*dslist), GFP_KERNEL);
> + if (!dslist)
> + return status;
> + INIT_LIST_HEAD(dslist);
> +
> + stripesz = filelayout_get_stripesize(NFS_I(inode)->current_layout,
> inode);
> + dprintk("%s stripesize %Zd\n", __func__, stripesz);
> + /* split up the list according to DS */
> + while(!list_empty(head)) {
> +next_ds:
> + req = nfs_list_entry(head->next);
> +
> + if (use_ds)
> + goto use_ds;
> + /* reset for new data server */
> + dstotal = 0;
> + ndspages = 0;
> +
> + status = -ENOMEM;
> + dserver = filelayout_create_dserver();
> + if (!dserver) {
> + dprintk("%s failed to get dserver. status %d\n",
> + __FUNCTION__, status);
> + goto out;
> + }
> +
> + /* get the data server that serves this stripe */
> + status = nfs4_pnfs_dserver_get(inode, nfslay, dsoffset,
> + stripesz, dserver);
> +
> + if (status != 0) {
> + dprintk("%s failed to get dataserver. status
> %d\n",
> + __FUNCTION__, status);
> + status = -EIO;
> + goto out;
> + }
> + /* just try the first multipath data server */
> + ds = dserver->dev->ds_list[0];
> +
> + use_ds = 1;
> +use_ds:
> + filelayout_get_dserver(dserver);
> +
> + reqcount = count < PAGE_SIZE? count: PAGE_SIZE;
> + count -= reqcount;
> + dstotal += reqcount;
> +
> + req->wb_devip = ds->ds_ip_addr;
> + req->wb_devport = ds->ds_port;
> + req->wb_private = dserver;
> +
> + /* move request to dslist */
> + nfs_list_remove_request(req);
> + nfs_list_add_request(req, dslist);
> + ndspages++;
> + npages--;
> +
> + if (dstotal == stripesz)
> + dsoffset += dstotal;
> +
> + if (count == 0 || npages == 0 || dstotal == stripesz) {
> + use_ds = 0;
> + goto send;
> + }
> + }
> + if (!ds) {
> + status = -EIO;
> + goto out;
> + }
> +
> +send:
> + /* XXX should recover to send through MDS */
> + dprintk("%s Send: ndspages %d dstotal %Zd list_empty(head) %d \n",
> + __func__, ndspages, dstotal, list_empty(head));
> + status = nfs_flush_one(inode, dslist, ndspages, dstotal, how);
> + if (status < 0)
> + goto out;
> +
> + /* XXX should be BUG_ON(!list_empty(dslist)); */
> + if (!list_empty(head) && npages > 0) {
> + if (!list_empty(dslist)) {
> + printk("%s ERROR! dslist NOT EMPTY\n", __func__);
> + status = -EIO;
> + goto out;
> + }
> + dprintk("%s next_ds\n", __func__);
> + goto next_ds;
> + }
> +
> +out:
> + kfree(dslist);
> + dprintk("<-- %s npages %d (should be zero!)\n", __func__, npages);
> + return status;
> +}
> +
> +/* Perform async writes.
> *
> * TODO: See filelayout_read_pagelist.
> */
> @@ -356,52 +520,51 @@ ssize_t filelayout_write_pagelist(
> struct nfs_write_data *data)
> {
> struct nfs4_filelayout *nfslay = (struct nfs4_filelayout
> *)layoutid->layoutid;
> - struct nfs4_pnfs_dserver dserver;
> + struct nfs4_pnfs_dserver *dserver = NULL;
> struct nfs4_pnfs_ds *ds;
> - struct nfs_page *req;
> + struct nfs_page* req = NULL;
> struct list_head *h;
> - int status;
>
> - /* Retrieve the correct rpc_client for the byte range */
> - status = nfs4_pnfs_dserver_get(inode,
> - nfslay,
> - offset,
> - count,
> - &dserver);
> - if (status) {
> - dprintk("%s failed to get dataserver\n",
> - __FUNCTION__);
> - data->ds_nfs_client = NULL;
> - return -EIO;
> - } else {
> - /* just try the first data server for the index.. */
> - ds = dserver.dev->ds_list[0];
> - data->pnfs_client = ds->ds_clp->cl_rpcclient;
> - data->ds_nfs_client = ds->ds_clp;
> - data->args.fh = dserver.fh;
> - }
> - dprintk("%s set wb_devip: wb_devport %x:%hu\n", __FUNCTION__,
> - htonl(ds->ds_ip_addr), ntohs(ds->ds_port));
> + dprintk("--> %s nr_pages %d offset:count %Lu:%Zu\n", __func__,
> + nr_pages, offset, count);
>
> + /* Retrieve the correct rpc_client for the byte range */
> list_for_each(h, &data->pages) {
> req = list_entry(h, struct nfs_page, wb_list);
> - req->wb_devip = ds->ds_ip_addr;
> - req->wb_devport = ds->ds_port;
> + break;
> }
> + BUG_ON(!req);
>
> - /* Now get the file offset on the dserver
> - * Set the write offset to this offset, and
> - * save the original offset in orig_offset
> - * the offset will be reset in the call_ops->rpc_call_done()
> routine.
> + dserver = (struct nfs4_pnfs_dserver *)req->wb_private;
> + BUG_ON(!dserver);
> +
> + /* use the first multipath data server */
> + ds = dserver->dev->ds_list[0];
> + dprintk("%s USE DS:\n", __func__);
> + print_ds(ds);
> +
> + data->pnfs_client = ds->ds_clp->cl_rpcclient;
> + data->ds_nfs_client = ds->ds_clp;
> + data->args.fh = dserver->fh;
> +
> + dprintk("%s set wb_devip: wb_devport %x:%hu\n", __FUNCTION__,
> + htonl(ds->ds_ip_addr),ntohs(ds->ds_port));
> +
> + /* Get the file offset on the dserver. Set the write offset to
> + * this offset and save the original offset.
> */
> data->args.offset = filelayout_get_dserver_offset(offset, nfslay);
> data->orig_offset = offset;
>
> - /* Perform an asynchronous write */
> + /* Perform an asynchronous write The offset will be reset in the
> + * call_ops->rpc_call_done() routine
> + */
> BUG_ON(data->pnfsflags & PNFS_ISSYNC);
> nfs_initiate_write(data, data->pnfs_client,
> &filelayout_write_call_ops, sync);
>
> + filelayout_release_dserver(dserver);
> +
> return 0;
> }
>
> @@ -686,6 +849,7 @@ struct layoutdriver_io_operations
> filelayout_io_operations = {
> .commit = filelayout_commit,
> .read_pagelist = filelayout_read_pagelist,
> .write_pagelist = filelayout_write_pagelist,
> + .flush_one = filelayout_flush_one,
> .set_layout = filelayout_set_layout,
> .alloc_layout = filelayout_alloc_layout,
> .free_layout = filelayout_free_layout,
> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> index fd099c7..c3eb2a4 100644
> --- a/fs/nfs/nfs4filelayout.h
> +++ b/fs/nfs/nfs4filelayout.h
> @@ -12,6 +12,7 @@
> #ifndef FS_NFS_NFS4FILELAYOUT_H
> #define FS_NFS_NFS4FILELAYOUT_H
>
> +#include <linux/kref.h>
> #include <linux/nfs4_pnfs.h>
> #include <linux/nfs4_session.h>
> #include <linux/pnfs_xdr.h>
> @@ -77,7 +78,7 @@ struct nfs4_pnfs_devlist {
> struct nfs4_pnfs_dserver {
> struct nfs_fh *fh;
> struct nfs4_pnfs_dev *dev;
> - u32 dev_id;
> + struct kref ref;
> };
>
> struct nfs4_filelayout {
> diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
> index 9d38cc8..cf683a0 100644
> --- a/fs/nfs/nfs4filelayoutdev.c
> +++ b/fs/nfs/nfs4filelayoutdev.c
> @@ -253,7 +253,7 @@ out_put:
> }
>
> /* Assumes lock is held */
> -static int
> +int
> unhash_ds(struct nfs4_pnfs_ds *ds)
> {
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 3f5ee35..574c600 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -63,6 +63,8 @@ extern int nfs4_pnfs_getdeviceinfo(struct inode *inode,
> u32 dev_id,
> struct pnfs_device *res);
> extern void nfs_initiate_commit(struct nfs_write_data *data,
> struct rpc_clnt *clnt, int how);
> +extern int nfs_flush_one(struct inode *inode, struct list_head *head,
> + unsigned int npages, size_t count, int
> how);
>
> struct pnfs_client_operations pnfs_ops;
>
> @@ -957,6 +959,21 @@ pnfs_writeback_done(struct nfs_write_data *data,
> ssize_t status)
> data->call_ops->rpc_release(data);
> }
>
> +int
> +pnfs_flush_one(struct inode *inode, struct list_head *head, unsigned int
> npages, size_t count, int how)
> +{
> + struct nfs_inode* nfsi = NFS_I(inode);
> + struct nfs_server* nfss = NFS_SERVER(inode);
> + struct layoutdriver_io_operations *io_ops;
> +
> + if (nfsi->current_layout != NULL &&
> + (nfss->pnfs_curr_ld->ld_io_ops->flush_one)) {
> + io_ops = nfss->pnfs_curr_ld->ld_io_ops;
> + return io_ops->flush_one(inode, head, npages, count, how);
> + } else
> + return nfs_flush_one(inode, head, npages, count, how);
> +}
> +
> /*
> * Call the appropriate parallel I/O subsystem write function.
> * If no I/O device driver exists, or one does match the returned
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index b314aff..88f88d5 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -53,6 +53,7 @@ void pnfs_commit_done_norpc(struct rpc_task *, void *);
> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
> struct nfs_open_context *, struct list_head *, size_t *);
> void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode
> *);
> void pnfs_update_layout_commit(struct inode *, struct list_head *,
> pgoff_t, unsigned int);
> +int pnfs_flush_one(struct inode *, struct list_head *, unsigned int,
> size_t, int);
>
> #endif /* CONFIG_PNFS */
>
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 4231915..45624d0 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -934,7 +934,7 @@ out_bad:
> * This is the case if nfs_updatepage detects a conflicting request
> * that has been written but not committed.
> */
> -static int nfs_flush_one(struct inode *inode, struct list_head *head,
> unsigned int npages, size_t count, int how)
> +int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned
> int npages, size_t count, int how)
> {
> struct nfs_page *req;
> struct page **pages;
> @@ -986,7 +986,11 @@ static void nfs_pageio_init_write(struct
> nfs_pageio_descriptor *pgio,
> if (wsize < PAGE_CACHE_SIZE)
> nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize,
> ioflags);
> else
> +#ifdef CONFIG_PNFS
> + nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize,
> ioflags);
> +#else
> nfs_pageio_init(pgio, inode, nfs_flush_one, wsize,
> ioflags);
> +#endif /* CONFIG_PNFS */
> }
>
> #ifdef CONFIG_PNFS
> @@ -1678,6 +1682,7 @@ EXPORT_SYMBOL(nfs_execute_write);
> EXPORT_SYMBOL(nfs_write_validate);
> EXPORT_SYMBOL(nfs_writedata_release);
> EXPORT_SYMBOL(nfs_flush_task_priority);
> +EXPORT_SYMBOL(nfs_flush_one);
> EXPORT_SYMBOL(nfs_commit_rpcsetup);
> EXPORT_SYMBOL(nfs_initiate_write);
> EXPORT_SYMBOL(nfs_initiate_commit);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index c5dd321..83aefc5 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -45,6 +45,8 @@ struct layoutdriver_io_operations {
> */
> ssize_t (*read_pagelist) (struct pnfs_layout_type *layoutid,
> struct inode *, struct page **pages, unsigned int pgbase, unsigned nr_pages,
> loff_t offset, size_t count, struct nfs_read_data *nfs_data);
> ssize_t (*write_pagelist) (struct pnfs_layout_type *layoutid,
> struct inode *, struct page **pages, unsigned int pgbase, unsigned nr_pages,
> loff_t offset, size_t count, int sync, struct nfs_write_data *nfs_data);
> + int (*flush_one) (struct inode *inode, struct list_head *head,
> unsigned int npages, size_t count, int how);
> +
>
> /* Functions that do not use the pagecache.
> * If use_pagecache == 0, then these functions must be
> implemented.
> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> index fe82716..cc5f1d0 100644
> --- a/include/linux/nfs_page.h
> +++ b/include/linux/nfs_page.h
> @@ -47,6 +47,7 @@ struct nfs_page {
> #ifdef CONFIG_PNFS
> unsigned int wb_devip; /* pNFS data server IP
> addr */
> unsigned int wb_devport; /* pNFS data server port
> */
> + void *wb_private;
> #endif
> };
>
> --
> 1.5.0.2
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://linux-nfs.org/pipermail/pnfs/attachments/20080108/ebb396e0/attachment-0001.htm
More information about the pNFS
mailing list