[pnfs] [PATCH 18/18] pnfs client flush_one io operation
William A. (Andy) Adamson
andros at citi.umich.edu
Tue Jan 15 11:25:30 EST 2008
On Jan 14, 2008 6:00 PM, Dean Hildebrand <seattleplus at gmail.com> wrote:
> William A. (Andy) Adamson wrote:
> > hi dean
> >
> > i did not want to support sending less than a page to each DS, and
> > trond agrees that supporting a stripe size of less than 4096 bytes is
> > not worth the effort. this is why there is not a pnfs_flush_multi
> > function, just a pnfs_flush_one() to replace nfs_flush_one().
>
>
> Hi Andy, could you clarify this a bit? I totally agree that supporting
> a stripe size < 4096 is unnecessary.
and that is all i'm saying. we don't have a pnfs case for wsize <
PAGE_CACHE_SIZE below.
/*
 * Initialise the write pageio descriptor @pgio for @inode, picking the
 * flush routine from the server's wsize.
 *
 * A wsize smaller than PAGE_CACHE_SIZE always selects nfs_flush_multi,
 * whether or not CONFIG_PNFS is set; only the wsize >= PAGE_CACHE_SIZE
 * branch is switched to pnfs_flush_one on pNFS builds.
 */
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
struct inode *inode, int ioflags)
{
/* wsize is sampled before the pNFS hook below is called */
int wsize = NFS_SERVER(inode)->wsize;
#ifdef CONFIG_PNFS
/* pNFS-specific descriptor setup (defined elsewhere; effect on pgio not shown here) */
pnfs_pageio_init_write(pgio, inode);
#endif /* CONFIG_PNFS */
/* sub-page wsize: no pNFS variant exists for this case */
if (wsize < PAGE_CACHE_SIZE)
nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize,
ioflags);
else
/* the preprocessor picks exactly one of the two calls below as the else body */
#ifdef CONFIG_PNFS
nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize,
ioflags);
#else
nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
#endif /* CONFIG_PNFS */
}
> But regarding writes < 4096, it
> seems there are cases where sending less than a page to a data server
> just makes sense.
of course - and it will happen - the first stripe's worth of data gets sent to
the server, subject to the threshold caveat.
> For example, what happens if I write 1,000,100 bytes
> across 11 data servers using a stripe size of 100,000 bytes. The last
> 100 bytes should be written to the "11th" data server, not the MDS.
> Dean
> >
> >
> > -->Andy
> >
> > On Jan 14, 2008 3:44 PM, Dean Hildebrand <seattleplus at gmail.com
> > <mailto:seattleplus at gmail.com>> wrote:
> >
> >
> >
> > >
> > >
> > >> +/*
> > >> +* feed nfs_flush_one with per data server pages.
> > >> +*
> > >> +* Assume stripesz >= PAGE_SIZE.
> > >> +* TODO: If stripesz < PAGE_SIZE, use i/o through MDS
> > >>
> > > How does this TODO relate to the threshold values? Why not just rely
> > > on the threshold? I would like to ensure that there is an option of
> > > load balancing I/O less than a page across the data servers.
> > >
> > Wait, sorry, I think I misunderstood this TODO. Do you just mean that
> > basically a stripe size of < PAGE_SIZE is invalid and in that case you
> > will just send all I/O to the MDS?
> >
> >
> > yes!
> >
> >
> >
> >
> > Dean
> > >> +*
> > >> +*/
> > >> +int filelayout_flush_one(struct inode *inode, struct list_head
> > >> *head, unsigned int npages, size_t count, int how)
> > >> +{
> > >> + struct pnfs_layout_type *ltype =
> > NFS_I(inode)->current_layout;
> > >> + struct nfs4_filelayout *nfslay = ltype->layoutid;
> > >> + struct nfs4_pnfs_dserver *dserver = NULL;
> > >> + struct nfs4_pnfs_ds *ds = NULL; /* current stripe data
> > server */
> > >> + struct nfs_page* req;
> > >> + loff_t dsoffset = 0;
> > >> + size_t stripesz, reqcount, dstotal = 0;
> > >> + struct list_head *dslist;
> > >> + int status = -ENOMEM, use_ds = 0, ndspages = 0;
> > >> +
> > >> + dprintk("--> %s npages %d, count %Zd, ltype %p nfslay %p\n",
> > >> __func__, npages, count, ltype, nfslay);
> > >> +
> > >> + dslist = kmalloc(sizeof(*dslist), GFP_KERNEL);
> > >> + if (!dslist)
> > >> + return status;
> > >> + INIT_LIST_HEAD(dslist);
> > >> +
> > >> + stripesz =
> > >> filelayout_get_stripesize(NFS_I(inode)->current_layout, inode);
> > >> + dprintk("%s stripesize %Zd\n", __func__, stripesz);
> > >> + /* split up the list according to DS */
> > >> + while(!list_empty(head)) {
> > >> +next_ds:
> > >> + req = nfs_list_entry(head->next);
> > >> +
> > >> + if (use_ds)
> > >> + goto use_ds;
> > >> + /* reset for new data server */
> > >> + dstotal = 0;
> > >> + ndspages = 0;
> > >> +
> > >> + status = -ENOMEM;
> > >> + dserver = filelayout_create_dserver();
> > >> + if (!dserver) {
> > >> + dprintk("%s failed to get dserver. status %d\n",
> > >> + __FUNCTION__, status);
> > >> + goto out;
> > >> + }
> > >> +
> > >> + /* get the data server that serves this stripe */
> > >> + status = nfs4_pnfs_dserver_get(inode, nfslay, dsoffset,
> > >> + stripesz, dserver);
> > >> +
> > >> + if (status != 0) {
> > >> + dprintk("%s failed to get dataserver. status %d\n",
> > >> + __FUNCTION__, status);
> > >> + status = -EIO;
> > >> + goto out;
> > >> + }
> > >> + /* just try the first multipath data server */
> > >> + ds = dserver->dev->ds_list[0];
> > >> +
> > >> + use_ds = 1;
> > >> +use_ds:
> > >> + filelayout_get_dserver(dserver);
> > >> +
> > >> + reqcount = count < PAGE_SIZE? count: PAGE_SIZE;
> > >> + count -= reqcount;
> > >> + dstotal += reqcount;
> > >> +
> > >> + req->wb_devip = ds->ds_ip_addr;
> > >> + req->wb_devport = ds->ds_port;
> > >> + req->wb_private = dserver;
> > >> +
> > >> + /* move request to dslist */
> > >> + nfs_list_remove_request(req);
> > >> + nfs_list_add_request(req, dslist);
> > >> + ndspages++;
> > >> + npages--;
> > >> +
> > >> + if (dstotal == stripesz)
> > >> + dsoffset += dstotal;
> > >> +
> > >> + if (count == 0 || npages == 0 || dstotal == stripesz) {
> > >> + use_ds = 0;
> > >> + goto send;
> > >> + }
> > >> + }
> > >> + if (!ds) {
> > >> + status = -EIO;
> > >> + goto out;
> > >> + }
> > >> +
> > >> +send:
> > >> + /* XXX should recover to send through MDS */
> > >> + dprintk("%s Send: ndspages %d dstotal %Zd list_empty(head)
> > %d \n",
> > >> + __func__, ndspages, dstotal, list_empty(head));
> > >> + status = nfs_flush_one(inode, dslist, ndspages, dstotal,
> how);
> > >> + if (status < 0)
> > >> + goto out;
> > >> +
> > >> + /* XXX should be BUG_ON(!list_empty(dslist)); */
> > >> + if (!list_empty(head) && npages > 0) {
> > >> + if (!list_empty(dslist)) {
> > >> + printk("%s ERROR! dslist NOT EMPTY\n", __func__);
> > >> + status = -EIO;
> > >> + goto out;
> > >> + }
> > >> + dprintk("%s next_ds\n", __func__);
> > >> + goto next_ds;
> > >> + }
> > >> +
> > >> +out:
> > >> + kfree(dslist);
> > >> + dprintk("<-- %s npages %d (should be zero!)\n", __func__,
> > npages);
> > >> + return status;
> > >> +}
> > >> +
> > >> +/* Perform async writes.
> > >> *
> > >> * TODO: See filelayout_read_pagelist.
> > >> */
> > >> @@ -356,52 +520,51 @@ ssize_t filelayout_write_pagelist(
> > >> struct nfs_write_data *data)
> > >> {
> > >> struct nfs4_filelayout *nfslay = (struct nfs4_filelayout
> > >> *)layoutid->layoutid;
> > >> - struct nfs4_pnfs_dserver dserver;
> > >> + struct nfs4_pnfs_dserver *dserver = NULL;
> > >> struct nfs4_pnfs_ds *ds;
> > >> - struct nfs_page *req;
> > >> + struct nfs_page* req = NULL;
> > >> struct list_head *h;
> > >> - int status;
> > >>
> > >> - /* Retrieve the correct rpc_client for the byte range */
> > >> - status = nfs4_pnfs_dserver_get(inode,
> > >> - nfslay,
> > >> - offset,
> > >> - count,
> > >> - &dserver);
> > >> - if (status) {
> > >> - dprintk("%s failed to get dataserver\n",
> > >> - __FUNCTION__);
> > >> - data->ds_nfs_client = NULL;
> > >> - return -EIO;
> > >> - } else {
> > >> - /* just try the first data server for the index.. */
> > >> - ds = dserver.dev->ds_list[0];
> > >> - data->pnfs_client = ds->ds_clp->cl_rpcclient;
> > >> - data->ds_nfs_client = ds->ds_clp;
> > >> - data-> args.fh = dserver.fh;
> > >> - }
> > >> - dprintk("%s set wb_devip: wb_devport %x:%hu\n",
> __FUNCTION__,
> > >> - htonl(ds->ds_ip_addr), ntohs(ds->ds_port));
> > >> + dprintk("--> %s nr_pages %d offset:count %Lu:%Zu\n",
> > __func__,
> > >> + nr_pages, offset, count);
> > >>
> > >> + /* Retrieve the correct rpc_client for the byte range */
> > >> list_for_each(h, &data->pages) {
> > >> req = list_entry(h, struct nfs_page, wb_list);
> > >> - req->wb_devip = ds->ds_ip_addr;
> > >> - req->wb_devport = ds->ds_port;
> > >> + break;
> > >> }
> > >> + BUG_ON(!req);
> > >>
> > >> - /* Now get the file offset on the dserver
> > >> - * Set the write offset to this offset, and
> > >> - * save the original offset in orig_offset
> > >> - * the offset will be reset in the call_ops->rpc_call_done()
> > >> routine.
> > >> + dserver = (struct nfs4_pnfs_dserver *)req->wb_private;
> > >> + BUG_ON(!dserver);
> > >> +
> > >> + /* use the first multipath data server */
> > >> + ds = dserver->dev->ds_list[0];
> > >> + dprintk("%s USE DS:\n", __func__);
> > >> + print_ds(ds);
> > >> +
> > >> + data->pnfs_client = ds->ds_clp->cl_rpcclient;
> > >> + data->ds_nfs_client = ds->ds_clp;
> > >> + data->args.fh = dserver->fh;
> > >> +
> > >> + dprintk("%s set wb_devip: wb_devport %x:%hu\n",
> __FUNCTION__,
> > >> + htonl(ds->ds_ip_addr),ntohs(ds->ds_port));
> > >> +
> > >> + /* Get the file offset on the dserver. Set the write offset
> to
> > >> + * this offset and save the original offset.
> > >> */
> > >> data->args.offset = filelayout_get_dserver_offset(offset,
> > nfslay);
> > >> data->orig_offset = offset;
> > >>
> > >> - /* Perform an asynchronous write */
> > >> + /* Perform an asynchronous write The offset will be reset
> > in the
> > >> + * call_ops->rpc_call_done() routine
> > >> + */
> > >> BUG_ON(data->pnfsflags & PNFS_ISSYNC);
> > >> nfs_initiate_write(data, data->pnfs_client,
> > >> &filelayout_write_call_ops, sync);
> > >>
> > >> + filelayout_release_dserver(dserver);
> > >> +
> > >> return 0;
> > >> }
> > >>
> > >> @@ -686,6 +849,7 @@ struct layoutdriver_io_operations
> > >> filelayout_io_operations = {
> > >> .commit = filelayout_commit,
> > >> .read_pagelist = filelayout_read_pagelist,
> > >> .write_pagelist = filelayout_write_pagelist,
> > >> + .flush_one = filelayout_flush_one,
> > >> .set_layout = filelayout_set_layout,
> > >> .alloc_layout = filelayout_alloc_layout,
> > >> .free_layout = filelayout_free_layout,
> > >> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> > >> index fd099c7..c3eb2a4 100644
> > >> --- a/fs/nfs/nfs4filelayout.h
> > >> +++ b/fs/nfs/nfs4filelayout.h
> > >> @@ -12,6 +12,7 @@
> > >> #ifndef FS_NFS_NFS4FILELAYOUT_H
> > >> #define FS_NFS_NFS4FILELAYOUT_H
> > >>
> > >> +#include <linux/kref.h>
> > >> #include <linux/nfs4_pnfs.h>
> > >> #include <linux/nfs4_session.h>
> > >> #include <linux/pnfs_xdr.h>
> > >> @@ -77,7 +78,7 @@ struct nfs4_pnfs_devlist {
> > >> struct nfs4_pnfs_dserver {
> > >> struct nfs_fh *fh;
> > >> struct nfs4_pnfs_dev *dev;
> > >> - u32 dev_id;
> > >> + struct kref ref;
> > >> };
> > >>
> > >> struct nfs4_filelayout {
> > >> diff --git a/fs/nfs/nfs4filelayoutdev.c
> > b/fs/nfs/nfs4filelayoutdev.c
> > >> index 9d38cc8..cf683a0 100644
> > >> --- a/fs/nfs/nfs4filelayoutdev.c
> > >> +++ b/fs/nfs/nfs4filelayoutdev.c
> > >> @@ -253,7 +253,7 @@ out_put:
> > >> }
> > >>
> > >> /* Assumes lock is held */
> > >> -static int
> > >> +int
> > >> unhash_ds(struct nfs4_pnfs_ds *ds)
> > >> {
> > >>
> > >> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> > >> index 3f5ee35..574c600 100644
> > >> --- a/fs/nfs/pnfs.c
> > >> +++ b/fs/nfs/pnfs.c
> > >> @@ -63,6 +63,8 @@ extern int nfs4_pnfs_getdeviceinfo(struct inode
> > >> *inode, u32 dev_id,
> > >> struct pnfs_device *res);
> > >> extern void nfs_initiate_commit(struct nfs_write_data *data,
> > >> struct rpc_clnt *clnt, int how);
> > >> +extern int nfs_flush_one(struct inode *inode, struct list_head
> > *head,
> > >> + unsigned int npages, size_t count, int how);
> > >>
> > >> struct pnfs_client_operations pnfs_ops;
> > >>
> > >> @@ -957,6 +959,21 @@ pnfs_writeback_done(struct nfs_write_data
> > *data,
> > >> ssize_t status)
> > >> data->call_ops->rpc_release(data);
> > >> }
> > >>
> > >> +int
> > >> +pnfs_flush_one(struct inode *inode, struct list_head *head,
> > unsigned
> > >> int npages, size_t count, int how)
> > >> +{
> > >> + struct nfs_inode* nfsi = NFS_I(inode);
> > >> + struct nfs_server* nfss = NFS_SERVER(inode);
> > >> + struct layoutdriver_io_operations *io_ops;
> > >> +
> > >> + if (nfsi->current_layout != NULL &&
> > >> + (nfss->pnfs_curr_ld->ld_io_ops->flush_one)) {
> > >> + io_ops = nfss->pnfs_curr_ld->ld_io_ops;
> > >> + return io_ops->flush_one(inode, head, npages, count,
> how);
> > >> + } else
> > >> + return nfs_flush_one(inode, head, npages, count, how);
> > >> +}
> > >> +
> > >> /*
> > >> * Call the appropriate parallel I/O subsystem write function.
> > >> * If no I/O device driver exists, or one does match the
> returned
> > >> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> > >> index b314aff..88f88d5 100644
> > >> --- a/fs/nfs/pnfs.h
> > >> +++ b/fs/nfs/pnfs.h
> > >> @@ -53,6 +53,7 @@ void pnfs_commit_done_norpc(struct rpc_task
> > *, void
> > >> *);
> > >> void pnfs_pageio_init_read(struct nfs_pageio_descriptor *,
> struct
> > >> inode *, struct nfs_open_context *, struct list_head *, size_t
> *);
> > >> void pnfs_pageio_init_write(struct nfs_pageio_descriptor *,
> struct
> > >> inode *);
> > >> void pnfs_update_layout_commit(struct inode *, struct
> > list_head *,
> > >> pgoff_t, unsigned int);
> > >> +int pnfs_flush_one(struct inode *, struct list_head *,
> > unsigned int,
> > >> size_t, int);
> > >>
> > >> #endif /* CONFIG_PNFS */
> > >>
> > >> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > >> index 4231915..45624d0 100644
> > >> --- a/fs/nfs/write.c
> > >> +++ b/fs/nfs/write.c
> > >> @@ -934,7 +934,7 @@ out_bad:
> > >> * This is the case if nfs_updatepage detects a conflicting
> > request
> > >> * that has been written but not committed.
> > >> */
> > >> -static int nfs_flush_one(struct inode *inode, struct list_head
> > >> *head, unsigned int npages, size_t count, int how)
> > >> +int nfs_flush_one(struct inode *inode, struct list_head *head,
> > >> unsigned int npages, size_t count, int how)
> > >> {
> > >> struct nfs_page *req;
> > >> struct page **pages;
> > >> @@ -986,7 +986,11 @@ static void nfs_pageio_init_write(struct
> > >> nfs_pageio_descriptor *pgio,
> > >> if (wsize < PAGE_CACHE_SIZE)
> > >> nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize,
> > ioflags);
> > >> else
> > >> +#ifdef CONFIG_PNFS
> > >> + nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize,
> > ioflags);
> > >> +#else
> > >> nfs_pageio_init(pgio, inode, nfs_flush_one, wsize,
> > ioflags);
> > >> +#endif /* CONFIG_PNFS */
> > >> }
> > >>
> > >> #ifdef CONFIG_PNFS
> > >> @@ -1678,6 +1682,7 @@ EXPORT_SYMBOL(nfs_execute_write);
> > >> EXPORT_SYMBOL(nfs_write_validate);
> > >> EXPORT_SYMBOL(nfs_writedata_release);
> > >> EXPORT_SYMBOL(nfs_flush_task_priority);
> > >> +EXPORT_SYMBOL(nfs_flush_one);
> > >> EXPORT_SYMBOL(nfs_commit_rpcsetup);
> > >> EXPORT_SYMBOL(nfs_initiate_write);
> > >> EXPORT_SYMBOL(nfs_initiate_commit);
> > >> diff --git a/include/linux/nfs4_pnfs.h
> b/include/linux/nfs4_pnfs.h
> > >> index c5dd321..83aefc5 100644
> > >> --- a/include/linux/nfs4_pnfs.h
> > >> +++ b/include/linux/nfs4_pnfs.h
> > >> @@ -45,6 +45,8 @@ struct layoutdriver_io_operations {
> > >> */
> > >> ssize_t (*read_pagelist) (struct pnfs_layout_type *layoutid,
> > >> struct inode *, struct page **pages, unsigned int pgbase,
> unsigned
> > >> nr_pages, loff_t offset, size_t count, struct nfs_read_data
> > *nfs_data);
> > >> ssize_t (*write_pagelist) (struct pnfs_layout_type
> *layoutid,
> > >> struct inode *, struct page **pages, unsigned int pgbase,
> unsigned
> > >> nr_pages, loff_t offset, size_t count, int sync, struct
> > >> nfs_write_data *nfs_data);
> > >> + int (*flush_one) (struct inode *inode, struct list_head
> *head,
> > >> unsigned int npages, size_t count, int how);
> > >> +
> > >>
> > >> /* Functions that do not use the pagecache.
> > >> * If use_pagecache == 0, then these functions must be
> > implemented.
> > >> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> > >> index fe82716..cc5f1d0 100644
> > >> --- a/include/linux/nfs_page.h
> > >> +++ b/include/linux/nfs_page.h
> > >> @@ -47,6 +47,7 @@ struct nfs_page {
> > >> #ifdef CONFIG_PNFS
> > >> unsigned int wb_devip; /* pNFS data server IP
> > addr */
> > >> unsigned int wb_devport; /* pNFS data server
> > port */
> > >> + void *wb_private;
> > >> #endif
> > >> };
> > >>
> > >>
> > _______________________________________________
> > pNFS mailing list
> > pNFS at linux-nfs.org <mailto:pNFS at linux-nfs.org>
> > http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> >
> >
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://linux-nfs.org/pipermail/pnfs/attachments/20080115/6b8d8486/attachment-0001.htm
More information about the pNFS
mailing list