[pnfs] [PATCH 18/18] pnfs client flush_one io operation
William A. (Andy) Adamson
andros at citi.umich.edu
Mon Jan 14 16:27:28 EST 2008
Hi Dean,
I did not want to support sending less than a page to each DS, and Trond
agrees that supporting a stripe size of less than 4096 bytes is not worth
the effort. This is why there is no pnfs_flush_multi() function, just a
pnfs_flush_one() to replace nfs_flush_one().
-->Andy
On Jan 14, 2008 3:44 PM, Dean Hildebrand <seattleplus at gmail.com> wrote:
>
>
> >
> >
> >> +/*
> >> +* feed nfs_flush_one with per data server pages.
> >> +*
> >> +* Assume stripesz >= PAGE_SIZE.
> >> +* TODO: If stripesz < PAGE_SIZE, use i/o through MDS
> >>
> > How does this TODO relate to the threshold values? Why not just rely
> > on the threshold? I would like to ensure that there is an option of
> > load balancing I/O less than a page across the data servers.
> >
> Wait, sorry, I think I misunderstood this TODO. Do you just mean that
> basically a stripe size of < PAGE_SIZE is invalid and in that case you
> will just send all I/O to the MDS?
yes!
>
> Dean
> >> +*
> >> +*/
> >> +int filelayout_flush_one(struct inode *inode, struct list_head
> >> *head, unsigned int npages, size_t count, int how)
> >> +{
> >> + struct pnfs_layout_type *ltype = NFS_I(inode)->current_layout;
> >> + struct nfs4_filelayout *nfslay = ltype->layoutid;
> >> + struct nfs4_pnfs_dserver *dserver = NULL;
> >> + struct nfs4_pnfs_ds *ds = NULL; /* current stripe data server */
> >> + struct nfs_page* req;
> >> + loff_t dsoffset = 0;
> >> + size_t stripesz, reqcount, dstotal = 0;
> >> + struct list_head *dslist;
> >> + int status = -ENOMEM, use_ds = 0, ndspages = 0;
> >> +
> >> + dprintk("--> %s npages %d, count %Zd, ltype %p nfslay %p\n",
> >> __func__, npages, count, ltype, nfslay);
> >> +
> >> + dslist = kmalloc(sizeof(*dslist), GFP_KERNEL);
> >> + if (!dslist)
> >> + return status;
> >> + INIT_LIST_HEAD(dslist);
> >> +
> >> + stripesz =
> >> filelayout_get_stripesize(NFS_I(inode)->current_layout, inode);
> >> + dprintk("%s stripesize %Zd\n", __func__, stripesz);
> >> + /* split up the list according to DS */
> >> + while(!list_empty(head)) {
> >> +next_ds:
> >> + req = nfs_list_entry(head->next);
> >> +
> >> + if (use_ds)
> >> + goto use_ds;
> >> + /* reset for new data server */
> >> + dstotal = 0;
> >> + ndspages = 0;
> >> +
> >> + status = -ENOMEM;
> >> + dserver = filelayout_create_dserver();
> >> + if (!dserver) {
> >> + dprintk("%s failed to get dserver. status %d\n",
> >> + __FUNCTION__, status);
> >> + goto out;
> >> + }
> >> +
> >> + /* get the data server that serves this stripe */
> >> + status = nfs4_pnfs_dserver_get(inode, nfslay, dsoffset,
> >> + stripesz, dserver);
> >> +
> >> + if (status != 0) {
> >> + dprintk("%s failed to get dataserver. status %d\n",
> >> + __FUNCTION__, status);
> >> + status = -EIO;
> >> + goto out;
> >> + }
> >> + /* just try the first multipath data server */
> >> + ds = dserver->dev->ds_list[0];
> >> +
> >> + use_ds = 1;
> >> +use_ds:
> >> + filelayout_get_dserver(dserver);
> >> +
> >> + reqcount = count < PAGE_SIZE? count: PAGE_SIZE;
> >> + count -= reqcount;
> >> + dstotal += reqcount;
> >> +
> >> + req->wb_devip = ds->ds_ip_addr;
> >> + req->wb_devport = ds->ds_port;
> >> + req->wb_private = dserver;
> >> +
> >> + /* move request to dslist */
> >> + nfs_list_remove_request(req);
> >> + nfs_list_add_request(req, dslist);
> >> + ndspages++;
> >> + npages--;
> >> +
> >> + if (dstotal == stripesz)
> >> + dsoffset += dstotal;
> >> +
> >> + if (count == 0 || npages == 0 || dstotal == stripesz) {
> >> + use_ds = 0;
> >> + goto send;
> >> + }
> >> + }
> >> + if (!ds) {
> >> + status = -EIO;
> >> + goto out;
> >> + }
> >> +
> >> +send:
> >> + /* XXX should recover to send through MDS */
> >> + dprintk("%s Send: ndspages %d dstotal %Zd list_empty(head) %d \n",
> >> + __func__, ndspages, dstotal, list_empty(head));
> >> + status = nfs_flush_one(inode, dslist, ndspages, dstotal, how);
> >> + if (status < 0)
> >> + goto out;
> >> +
> >> + /* XXX should be BUG_ON(!list_empty(dslist)); */
> >> + if (!list_empty(head) && npages > 0) {
> >> + if (!list_empty(dslist)) {
> >> + printk("%s ERROR! dslist NOT EMPTY\n", __func__);
> >> + status = -EIO;
> >> + goto out;
> >> + }
> >> + dprintk("%s next_ds\n", __func__);
> >> + goto next_ds;
> >> + }
> >> +
> >> +out:
> >> + kfree(dslist);
> >> + dprintk("<-- %s npages %d (should be zero!)\n", __func__, npages);
> >> + return status;
> >> +}
> >> +
> >> +/* Perform async writes.
> >> *
> >> * TODO: See filelayout_read_pagelist.
> >> */
> >> @@ -356,52 +520,51 @@ ssize_t filelayout_write_pagelist(
> >> struct nfs_write_data *data)
> >> {
> >> struct nfs4_filelayout *nfslay = (struct nfs4_filelayout
> >> *)layoutid->layoutid;
> >> - struct nfs4_pnfs_dserver dserver;
> >> + struct nfs4_pnfs_dserver *dserver = NULL;
> >> struct nfs4_pnfs_ds *ds;
> >> - struct nfs_page *req;
> >> + struct nfs_page* req = NULL;
> >> struct list_head *h;
> >> - int status;
> >>
> >> - /* Retrieve the correct rpc_client for the byte range */
> >> - status = nfs4_pnfs_dserver_get(inode,
> >> - nfslay,
> >> - offset,
> >> - count,
> >> - &dserver);
> >> - if (status) {
> >> - dprintk("%s failed to get dataserver\n",
> >> - __FUNCTION__);
> >> - data->ds_nfs_client = NULL;
> >> - return -EIO;
> >> - } else {
> >> - /* just try the first data server for the index.. */
> >> - ds = dserver.dev->ds_list[0];
> >> - data->pnfs_client = ds->ds_clp->cl_rpcclient;
> >> - data->ds_nfs_client = ds->ds_clp;
> >> - data->args.fh = dserver.fh;
> >> - }
> >> - dprintk("%s set wb_devip: wb_devport %x:%hu\n", __FUNCTION__,
> >> - htonl(ds->ds_ip_addr), ntohs(ds->ds_port));
> >> + dprintk("--> %s nr_pages %d offset:count %Lu:%Zu\n", __func__,
> >> + nr_pages, offset, count);
> >>
> >> + /* Retrieve the correct rpc_client for the byte range */
> >> list_for_each(h, &data->pages) {
> >> req = list_entry(h, struct nfs_page, wb_list);
> >> - req->wb_devip = ds->ds_ip_addr;
> >> - req->wb_devport = ds->ds_port;
> >> + break;
> >> }
> >> + BUG_ON(!req);
> >>
> >> - /* Now get the file offset on the dserver
> >> - * Set the write offset to this offset, and
> >> - * save the original offset in orig_offset
> >> - * the offset will be reset in the call_ops->rpc_call_done()
> >> routine.
> >> + dserver = (struct nfs4_pnfs_dserver *)req->wb_private;
> >> + BUG_ON(!dserver);
> >> +
> >> + /* use the first multipath data server */
> >> + ds = dserver->dev->ds_list[0];
> >> + dprintk("%s USE DS:\n", __func__);
> >> + print_ds(ds);
> >> +
> >> + data->pnfs_client = ds->ds_clp->cl_rpcclient;
> >> + data->ds_nfs_client = ds->ds_clp;
> >> + data->args.fh = dserver->fh;
> >> +
> >> + dprintk("%s set wb_devip: wb_devport %x:%hu\n", __FUNCTION__,
> >> + htonl(ds->ds_ip_addr),ntohs(ds->ds_port));
> >> +
> >> + /* Get the file offset on the dserver. Set the write offset to
> >> + * this offset and save the original offset.
> >> */
> >> data->args.offset = filelayout_get_dserver_offset(offset, nfslay);
> >> data->orig_offset = offset;
> >>
> >> - /* Perform an asynchronous write */
> >> + /* Perform an asynchronous write The offset will be reset in the
> >> + * call_ops->rpc_call_done() routine
> >> + */
> >> BUG_ON(data->pnfsflags & PNFS_ISSYNC);
> >> nfs_initiate_write(data, data->pnfs_client,
> >> &filelayout_write_call_ops, sync);
> >>
> >> + filelayout_release_dserver(dserver);
> >> +
> >> return 0;
> >> }
> >>
> >> @@ -686,6 +849,7 @@ struct layoutdriver_io_operations
> >> filelayout_io_operations = {
> >> .commit = filelayout_commit,
> >> .read_pagelist = filelayout_read_pagelist,
> >> .write_pagelist = filelayout_write_pagelist,
> >> + .flush_one = filelayout_flush_one,
> >> .set_layout = filelayout_set_layout,
> >> .alloc_layout = filelayout_alloc_layout,
> >> .free_layout = filelayout_free_layout,
> >> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> >> index fd099c7..c3eb2a4 100644
> >> --- a/fs/nfs/nfs4filelayout.h
> >> +++ b/fs/nfs/nfs4filelayout.h
> >> @@ -12,6 +12,7 @@
> >> #ifndef FS_NFS_NFS4FILELAYOUT_H
> >> #define FS_NFS_NFS4FILELAYOUT_H
> >>
> >> +#include <linux/kref.h>
> >> #include <linux/nfs4_pnfs.h>
> >> #include <linux/nfs4_session.h>
> >> #include <linux/pnfs_xdr.h>
> >> @@ -77,7 +78,7 @@ struct nfs4_pnfs_devlist {
> >> struct nfs4_pnfs_dserver {
> >> struct nfs_fh *fh;
> >> struct nfs4_pnfs_dev *dev;
> >> - u32 dev_id;
> >> + struct kref ref;
> >> };
> >>
> >> struct nfs4_filelayout {
> >> diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
> >> index 9d38cc8..cf683a0 100644
> >> --- a/fs/nfs/nfs4filelayoutdev.c
> >> +++ b/fs/nfs/nfs4filelayoutdev.c
> >> @@ -253,7 +253,7 @@ out_put:
> >> }
> >>
> >> /* Assumes lock is held */
> >> -static int
> >> +int
> >> unhash_ds(struct nfs4_pnfs_ds *ds)
> >> {
> >>
> >> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> >> index 3f5ee35..574c600 100644
> >> --- a/fs/nfs/pnfs.c
> >> +++ b/fs/nfs/pnfs.c
> >> @@ -63,6 +63,8 @@ extern int nfs4_pnfs_getdeviceinfo(struct inode
> >> *inode, u32 dev_id,
> >> struct pnfs_device *res);
> >> extern void nfs_initiate_commit(struct nfs_write_data *data,
> >> struct rpc_clnt *clnt, int how);
> >> +extern int nfs_flush_one(struct inode *inode, struct list_head *head,
> >> + unsigned int npages, size_t count, int how);
> >>
> >> struct pnfs_client_operations pnfs_ops;
> >>
> >> @@ -957,6 +959,21 @@ pnfs_writeback_done(struct nfs_write_data *data,
> >> ssize_t status)
> >> data->call_ops->rpc_release(data);
> >> }
> >>
> >> +int
> >> +pnfs_flush_one(struct inode *inode, struct list_head *head, unsigned
> >> int npages, size_t count, int how)
> >> +{
> >> + struct nfs_inode* nfsi = NFS_I(inode);
> >> + struct nfs_server* nfss = NFS_SERVER(inode);
> >> + struct layoutdriver_io_operations *io_ops;
> >> +
> >> + if (nfsi->current_layout != NULL &&
> >> + (nfss->pnfs_curr_ld->ld_io_ops->flush_one)) {
> >> + io_ops = nfss->pnfs_curr_ld->ld_io_ops;
> >> + return io_ops->flush_one(inode, head, npages, count, how);
> >> + } else
> >> + return nfs_flush_one(inode, head, npages, count, how);
> >> +}
> >> +
> >> /*
> >> * Call the appropriate parallel I/O subsystem write function.
> >> * If no I/O device driver exists, or one does match the returned
> >> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> >> index b314aff..88f88d5 100644
> >> --- a/fs/nfs/pnfs.h
> >> +++ b/fs/nfs/pnfs.h
> >> @@ -53,6 +53,7 @@ void pnfs_commit_done_norpc(struct rpc_task *, void
> >> *);
> >> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct
> >> inode *, struct nfs_open_context *, struct list_head *, size_t *);
> >> void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct
> >> inode *);
> >> void pnfs_update_layout_commit(struct inode *, struct list_head *,
> >> pgoff_t, unsigned int);
> >> +int pnfs_flush_one(struct inode *, struct list_head *, unsigned int,
> >> size_t, int);
> >>
> >> #endif /* CONFIG_PNFS */
> >>
> >> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> >> index 4231915..45624d0 100644
> >> --- a/fs/nfs/write.c
> >> +++ b/fs/nfs/write.c
> >> @@ -934,7 +934,7 @@ out_bad:
> >> * This is the case if nfs_updatepage detects a conflicting request
> >> * that has been written but not committed.
> >> */
> >> -static int nfs_flush_one(struct inode *inode, struct list_head
> >> *head, unsigned int npages, size_t count, int how)
> >> +int nfs_flush_one(struct inode *inode, struct list_head *head,
> >> unsigned int npages, size_t count, int how)
> >> {
> >> struct nfs_page *req;
> >> struct page **pages;
> >> @@ -986,7 +986,11 @@ static void nfs_pageio_init_write(struct
> >> nfs_pageio_descriptor *pgio,
> >> if (wsize < PAGE_CACHE_SIZE)
> >> nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
> >> else
> >> +#ifdef CONFIG_PNFS
> >> + nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize, ioflags);
> >> +#else
> >> nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
> >> +#endif /* CONFIG_PNFS */
> >> }
> >>
> >> #ifdef CONFIG_PNFS
> >> @@ -1678,6 +1682,7 @@ EXPORT_SYMBOL(nfs_execute_write);
> >> EXPORT_SYMBOL(nfs_write_validate);
> >> EXPORT_SYMBOL(nfs_writedata_release);
> >> EXPORT_SYMBOL(nfs_flush_task_priority);
> >> +EXPORT_SYMBOL(nfs_flush_one);
> >> EXPORT_SYMBOL(nfs_commit_rpcsetup);
> >> EXPORT_SYMBOL(nfs_initiate_write);
> >> EXPORT_SYMBOL(nfs_initiate_commit);
> >> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> >> index c5dd321..83aefc5 100644
> >> --- a/include/linux/nfs4_pnfs.h
> >> +++ b/include/linux/nfs4_pnfs.h
> >> @@ -45,6 +45,8 @@ struct layoutdriver_io_operations {
> >> */
> >> ssize_t (*read_pagelist) (struct pnfs_layout_type *layoutid,
> >> struct inode *, struct page **pages, unsigned int pgbase, unsigned
> >> nr_pages, loff_t offset, size_t count, struct nfs_read_data *nfs_data);
> >> ssize_t (*write_pagelist) (struct pnfs_layout_type *layoutid,
> >> struct inode *, struct page **pages, unsigned int pgbase, unsigned
> >> nr_pages, loff_t offset, size_t count, int sync, struct
> >> nfs_write_data *nfs_data);
> >> + int (*flush_one) (struct inode *inode, struct list_head *head,
> >> unsigned int npages, size_t count, int how);
> >> +
> >>
> >> /* Functions that do not use the pagecache.
> >> * If use_pagecache == 0, then these functions must be
> implemented.
> >> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> >> index fe82716..cc5f1d0 100644
> >> --- a/include/linux/nfs_page.h
> >> +++ b/include/linux/nfs_page.h
> >> @@ -47,6 +47,7 @@ struct nfs_page {
> >> #ifdef CONFIG_PNFS
> >> unsigned int wb_devip; /* pNFS data server IP addr */
> >> unsigned int wb_devport; /* pNFS data server port */
> >> + void *wb_private;
> >> #endif
> >> };
> >>
> >>
> _______________________________________________
> pNFS mailing list
> pNFS at linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://linux-nfs.org/pipermail/pnfs/attachments/20080114/ad468be7/attachment-0001.htm
More information about the pNFS mailing list