[pnfs] [PATCH 18/18] pnfs client flush_one io operation

William A. (Andy) Adamson andros at citi.umich.edu
Tue Jan 15 11:25:30 EST 2008


On Jan 14, 2008 6:00 PM, Dean Hildebrand <seattleplus at gmail.com> wrote:

> William A. (Andy) Adamson wrote:
> > hi dean
> >
> > i did not want to support sending less than a page to each DS, and
> > trond agrees that supporting a stripe size of less than 4096 bytes is
> > not worth the effort. this is why there is not a pnfs_flush_multi
> > function, just a pnfs_flush_one() to replace nfs_flush_one().
>



>
> Hi Andy, could you clarify this a bit?  I totally agree that supporting
> a stripe size < 4096 is unnecessary.


and that is all i'm saying. we don't have a pnfs case for wsize <
PAGE_CACHE_SIZE below.

/*
 * Prepare a page I/O descriptor for writing.
 *
 * When CONFIG_PNFS is defined, pnfs_pageio_init_write() is called on the
 * descriptor first.  The flush callback is then chosen from the server's
 * wsize: nfs_flush_multi when wsize is smaller than PAGE_CACHE_SIZE,
 * otherwise pnfs_flush_one (CONFIG_PNFS) or nfs_flush_one (no CONFIG_PNFS).
 * There is no pNFS-specific callback for the wsize < PAGE_CACHE_SIZE case.
 */
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
                                  struct inode *inode, int ioflags)
{
        int wsize = NFS_SERVER(inode)->wsize;

#ifdef CONFIG_PNFS
        pnfs_pageio_init_write(pgio, inode);
#endif /* CONFIG_PNFS */

        /* Sub-page write size: always the generic multi-request flush. */
        if (wsize < PAGE_CACHE_SIZE) {
                nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
                return;
        }

        /* wsize covers at least one page: single-request flush. */
#ifdef CONFIG_PNFS
        nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize, ioflags);
#else
        nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
#endif /* CONFIG_PNFS */
}



> But regarding writes < 4096, it
> seems there are cases where sending less than a page to a data server
> just makes sense.


of course - and it will happen - the first stripe's worth of data gets sent to
the server, with the caveat of the threshold.

> For example, what happens if I write 1,000,100 bytes
> across 11 data servers using a stripe size of 100,000 bytes.  The last
> 100 bytes should be written to the "11th" data server, not the MDS.
> Dean
> >
> >
> > -->Andy
> >
> > On Jan 14, 2008 3:44 PM, Dean Hildebrand <seattleplus at gmail.com
> > <mailto:seattleplus at gmail.com>> wrote:
> >
> >
> >
> >     >
> >     >
> >     >> +/*
> >     >> +* feed nfs_flush_one with per data server pages.
> >     >> +*
> >     >> +* Assume stripesz >= PAGE_SIZE.
> >     >> +* TODO: If stripesz < PAGE_SIZE, use i/o through MDS
> >     >>
> >     > How does this TODO relate to the threshold values?  Why not just
> >     rely
> >     > on the threshold?  I would like to ensure that there is an option
> of
> >     > load balancing I/O less than a page across the data servers.
> >     >
> >     Wait, sorry, I think I misunderstood this TODO.  Do you just mean
> that
> >     basically a stripe size of < PAGE_SIZE is invalid and in that case
> you
> >     will just send all I/O to the MDS?
> >
> >
> > yes!
> >
> >
> >
> >
> >     Dean
> >     >> +*
> >     >> +*/
> >     >> +int filelayout_flush_one(struct inode *inode, struct list_head
> >     >> *head, unsigned int npages, size_t count, int how)
> >     >> +{
> >     >> +    struct pnfs_layout_type *ltype =
> >     NFS_I(inode)->current_layout;
> >     >> +    struct nfs4_filelayout *nfslay = ltype->layoutid;
> >     >> +    struct nfs4_pnfs_dserver *dserver = NULL;
> >     >> +    struct nfs4_pnfs_ds *ds = NULL;  /* current stripe data
> >     server */
> >     >> +    struct nfs_page* req;
> >     >> +    loff_t dsoffset = 0;
> >     >> +    size_t stripesz, reqcount, dstotal = 0;
> >     >> +    struct list_head *dslist;
> >     >> +    int status = -ENOMEM, use_ds = 0, ndspages = 0;
> >     >> +
> >     >> +    dprintk("--> %s npages %d, count %Zd, ltype %p nfslay %p\n",
> >     >> __func__, npages, count, ltype, nfslay);
> >     >> +
> >     >> +    dslist = kmalloc(sizeof(*dslist), GFP_KERNEL);
> >     >> +    if (!dslist)
> >     >> +        return status;
> >     >> +    INIT_LIST_HEAD(dslist);
> >     >> +
> >     >> +    stripesz =
> >     >> filelayout_get_stripesize(NFS_I(inode)->current_layout, inode);
> >     >> +    dprintk("%s stripesize %Zd\n", __func__, stripesz);
> >     >> +    /* split up the list according to DS */
> >     >> +    while(!list_empty(head)) {
> >     >> +next_ds:
> >     >> +        req = nfs_list_entry(head->next);
> >     >> +
> >     >> +        if (use_ds)
> >     >> +            goto use_ds;
> >     >> +        /* reset for new data server */
> >     >> +        dstotal = 0;
> >     >> +        ndspages = 0;
> >     >> +
> >     >> +        status = -ENOMEM;
> >     >> +        dserver = filelayout_create_dserver();
> >     >> +        if (!dserver) {
> >     >> +            dprintk("%s failed to get dserver. status %d\n",
> >     >> +                        __FUNCTION__, status);
> >     >> +            goto out;
> >     >> +        }
> >     >> +
> >     >> +        /* get the data server that serves this stripe */
> >     >> +        status = nfs4_pnfs_dserver_get(inode, nfslay, dsoffset,
> >     >> +                stripesz, dserver);
> >     >> +
> >     >> +        if (status != 0) {
> >     >> +            dprintk("%s failed to get dataserver. status %d\n",
> >     >> +                        __FUNCTION__, status);
> >     >> +            status =  -EIO;
> >     >> +            goto out;
> >     >> +        }
> >     >> +        /* just try the first multipath data server */
> >     >> +        ds = dserver->dev->ds_list[0];
> >     >> +
> >     >> +        use_ds = 1;
> >     >> +use_ds:
> >     >> +        filelayout_get_dserver(dserver);
> >     >> +
> >     >> +        reqcount = count < PAGE_SIZE? count: PAGE_SIZE;
> >     >> +        count -= reqcount;
> >     >> +        dstotal += reqcount;
> >     >> +
> >     >> +        req->wb_devip = ds->ds_ip_addr;
> >     >> +        req->wb_devport = ds->ds_port;
> >     >> +        req->wb_private = dserver;
> >     >> +
> >     >> +        /* move request to dslist */
> >     >> +        nfs_list_remove_request(req);
> >     >> +        nfs_list_add_request(req, dslist);
> >     >> +        ndspages++;
> >     >> +        npages--;
> >     >> +
> >     >> +        if (dstotal == stripesz)
> >     >> +            dsoffset += dstotal;
> >     >> +
> >     >> +        if (count == 0 || npages == 0 || dstotal == stripesz) {
> >     >> +            use_ds = 0;
> >     >> +            goto send;
> >     >> +        }
> >     >> +    }
> >     >> +    if (!ds) {
> >     >> +        status = -EIO;
> >     >> +        goto out;
> >     >> +    }
> >     >> +
> >     >> +send:
> >     >> +    /* XXX should recover to send through MDS */
> >     >> +    dprintk("%s Send: ndspages %d dstotal %Zd list_empty(head)
> >     %d \n",
> >     >> +            __func__, ndspages, dstotal, list_empty(head));
> >     >> +    status = nfs_flush_one(inode, dslist, ndspages, dstotal,
> how);
> >     >> +    if (status < 0)
> >     >> +        goto out;
> >     >> +
> >     >> +    /* XXX should be BUG_ON(!list_empty(dslist)); */
> >     >> +    if (!list_empty(head) && npages > 0) {
> >     >> +        if (!list_empty(dslist)) {
> >     >> +            printk("%s ERROR! dslist NOT EMPTY\n", __func__);
> >     >> +            status = -EIO;
> >     >> +            goto out;
> >     >> +        }
> >     >> +        dprintk("%s next_ds\n", __func__);
> >     >> +        goto next_ds;
> >     >> +    }
> >     >> +
> >     >> +out:
> >     >> +    kfree(dslist);
> >     >> +    dprintk("<-- %s npages %d (should be zero!)\n", __func__,
> >     npages);
> >     >> +    return status;
> >     >> +}
> >     >> +
> >     >> +/* Perform async writes.
> >     >>   *
> >     >>   * TODO: See filelayout_read_pagelist.
> >     >>   */
> >     >> @@ -356,52 +520,51 @@ ssize_t filelayout_write_pagelist(
> >     >>      struct nfs_write_data *data)
> >     >>  {
> >     >>      struct nfs4_filelayout *nfslay = (struct nfs4_filelayout
> >     >> *)layoutid->layoutid;
> >     >> -    struct nfs4_pnfs_dserver dserver;
> >     >> +    struct nfs4_pnfs_dserver *dserver = NULL;
> >     >>      struct nfs4_pnfs_ds *ds;
> >     >> -    struct nfs_page *req;
> >     >> +    struct nfs_page* req = NULL;
> >     >>      struct list_head *h;
> >     >> -    int status;
> >     >>
> >     >> -    /* Retrieve the correct rpc_client for the byte range */
> >     >> -    status = nfs4_pnfs_dserver_get(inode,
> >     >> -                    nfslay,
> >     >> -                    offset,
> >     >> -                    count,
> >     >> -                    &dserver);
> >     >> -    if (status) {
> >     >> -        dprintk("%s failed to get dataserver\n",
> >     >> -                        __FUNCTION__);
> >     >> -        data->ds_nfs_client = NULL;
> >     >> -        return -EIO;
> >     >> -    } else {
> >     >> -        /* just try the first data server for the index.. */
> >     >> -        ds = dserver.dev->ds_list[0];
> >     >> -        data->pnfs_client = ds->ds_clp->cl_rpcclient;
> >     >> -        data->ds_nfs_client = ds->ds_clp;
> >     >> -        data-> args.fh = dserver.fh;
> >     >> -    }
> >     >> -    dprintk("%s set wb_devip: wb_devport %x:%hu\n",
> __FUNCTION__,
> >     >> -            htonl(ds->ds_ip_addr), ntohs(ds->ds_port));
> >     >> +    dprintk("--> %s nr_pages %d offset:count %Lu:%Zu\n",
> >     __func__,
> >     >> +                        nr_pages, offset, count);
> >     >>
> >     >> +    /* Retrieve the correct rpc_client for the byte range */
> >     >>      list_for_each(h, &data->pages) {
> >     >>          req = list_entry(h, struct nfs_page, wb_list);
> >     >> -        req->wb_devip = ds->ds_ip_addr;
> >     >> -        req->wb_devport = ds->ds_port;
> >     >> +        break;
> >     >>      }
> >     >> +    BUG_ON(!req);
> >     >>
> >     >> -    /* Now get the file offset on the dserver
> >     >> -     * Set the write offset to this offset, and
> >     >> -     * save the original offset in orig_offset
> >     >> -     * the offset will be reset in the call_ops->rpc_call_done()
> >     >> routine.
> >     >> +    dserver = (struct nfs4_pnfs_dserver *)req->wb_private;
> >     >> +    BUG_ON(!dserver);
> >     >> +
> >     >> +    /* use the first multipath data server */
> >     >> +    ds = dserver->dev->ds_list[0];
> >     >> +    dprintk("%s USE DS:\n", __func__);
> >     >> +    print_ds(ds);
> >     >> +
> >     >> +    data->pnfs_client = ds->ds_clp->cl_rpcclient;
> >     >> +    data->ds_nfs_client = ds->ds_clp;
> >     >> +    data->args.fh = dserver->fh;
> >     >> +
> >     >> +    dprintk("%s set wb_devip: wb_devport %x:%hu\n",
> __FUNCTION__,
> >     >> +    htonl(ds->ds_ip_addr),ntohs(ds->ds_port));
> >     >> +
> >     >> +    /* Get the file offset on the dserver. Set the write offset
> to
> >     >> +     * this offset and save the original offset.
> >     >>       */
> >     >>      data->args.offset = filelayout_get_dserver_offset(offset,
> >     nfslay);
> >     >>      data->orig_offset = offset;
> >     >>
> >     >> -    /* Perform an asynchronous write */
> >     >> +    /* Perform an asynchronous write The offset will be reset
> >     in the
> >     >> +     * call_ops->rpc_call_done() routine
> >     >> +     */
> >     >>      BUG_ON(data->pnfsflags & PNFS_ISSYNC);
> >     >>      nfs_initiate_write(data, data->pnfs_client,
> >     >>              &filelayout_write_call_ops, sync);
> >     >>
> >     >> +    filelayout_release_dserver(dserver);
> >     >> +
> >     >>      return 0;
> >     >>  }
> >     >>
> >     >> @@ -686,6 +849,7 @@ struct layoutdriver_io_operations
> >     >> filelayout_io_operations = {
> >     >>      .commit                  = filelayout_commit,
> >     >>      .read_pagelist           = filelayout_read_pagelist,
> >     >>      .write_pagelist          = filelayout_write_pagelist,
> >     >> +    .flush_one         = filelayout_flush_one,
> >     >>      .set_layout              = filelayout_set_layout,
> >     >>      .alloc_layout            = filelayout_alloc_layout,
> >     >>      .free_layout             = filelayout_free_layout,
> >     >> diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
> >     >> index fd099c7..c3eb2a4 100644
> >     >> --- a/fs/nfs/nfs4filelayout.h
> >     >> +++ b/fs/nfs/nfs4filelayout.h
> >     >> @@ -12,6 +12,7 @@
> >     >>  #ifndef FS_NFS_NFS4FILELAYOUT_H
> >     >>  #define FS_NFS_NFS4FILELAYOUT_H
> >     >>
> >     >> +#include <linux/kref.h>
> >     >>  #include <linux/nfs4_pnfs.h>
> >     >>  #include <linux/nfs4_session.h>
> >     >>  #include <linux/pnfs_xdr.h>
> >     >> @@ -77,7 +78,7 @@ struct nfs4_pnfs_devlist {
> >     >>  struct nfs4_pnfs_dserver {
> >     >>      struct nfs_fh        *fh;
> >     >>      struct nfs4_pnfs_dev *dev;
> >     >> -    u32 dev_id;
> >     >> +    struct kref           ref;
> >     >>  };
> >     >>
> >     >>  struct nfs4_filelayout {
> >     >> diff --git a/fs/nfs/nfs4filelayoutdev.c
> >     b/fs/nfs/nfs4filelayoutdev.c
> >     >> index 9d38cc8..cf683a0 100644
> >     >> --- a/fs/nfs/nfs4filelayoutdev.c
> >     >> +++ b/fs/nfs/nfs4filelayoutdev.c
> >     >> @@ -253,7 +253,7 @@ out_put:
> >     >>  }
> >     >>
> >     >>  /* Assumes lock is held */
> >     >> -static int
> >     >> +int
> >     >>  unhash_ds(struct nfs4_pnfs_ds *ds)
> >     >>  {
> >     >>
> >     >> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> >     >> index 3f5ee35..574c600 100644
> >     >> --- a/fs/nfs/pnfs.c
> >     >> +++ b/fs/nfs/pnfs.c
> >     >> @@ -63,6 +63,8 @@ extern int nfs4_pnfs_getdeviceinfo(struct inode
> >     >> *inode, u32 dev_id,
> >     >>                     struct pnfs_device *res);
> >     >>  extern void nfs_initiate_commit(struct nfs_write_data *data,
> >     >>                  struct rpc_clnt *clnt, int how);
> >     >> +extern int nfs_flush_one(struct inode *inode, struct list_head
> >     *head,
> >     >> +                unsigned int npages, size_t count, int how);
> >     >>
> >     >>  struct pnfs_client_operations pnfs_ops;
> >     >>
> >     >> @@ -957,6 +959,21 @@ pnfs_writeback_done(struct nfs_write_data
> >     *data,
> >     >> ssize_t status)
> >     >>      data->call_ops->rpc_release(data);
> >     >>  }
> >     >>
> >     >> +int
> >     >> +pnfs_flush_one(struct inode *inode, struct list_head *head,
> >     unsigned
> >     >> int npages, size_t count, int how)
> >     >> +{
> >     >> +    struct nfs_inode* nfsi = NFS_I(inode);
> >     >> +    struct nfs_server* nfss = NFS_SERVER(inode);
> >     >> +    struct layoutdriver_io_operations *io_ops;
> >     >> +
> >     >> +    if (nfsi->current_layout != NULL &&
> >     >> +        (nfss->pnfs_curr_ld->ld_io_ops->flush_one)) {
> >     >> +        io_ops = nfss->pnfs_curr_ld->ld_io_ops;
> >     >> +        return io_ops->flush_one(inode, head, npages, count,
> how);
> >     >> +    } else
> >     >> +        return nfs_flush_one(inode, head, npages, count, how);
> >     >> +}
> >     >> +
> >     >>  /*
> >     >>   * Call the appropriate parallel I/O subsystem write function.
> >     >>   * If no I/O device driver exists, or one does match the
> returned
> >     >> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> >     >> index b314aff..88f88d5 100644
> >     >> --- a/fs/nfs/pnfs.h
> >     >> +++ b/fs/nfs/pnfs.h
> >     >> @@ -53,6 +53,7 @@ void pnfs_commit_done_norpc(struct rpc_task
> >     *, void
> >     >> *);
> >     >>  void pnfs_pageio_init_read(struct nfs_pageio_descriptor *,
> struct
> >     >> inode *, struct nfs_open_context *, struct list_head *, size_t
> *);
> >     >>  void pnfs_pageio_init_write(struct nfs_pageio_descriptor *,
> struct
> >     >> inode *);
> >     >>  void pnfs_update_layout_commit(struct inode *, struct
> >     list_head *,
> >     >> pgoff_t, unsigned int);
> >     >> +int pnfs_flush_one(struct inode *, struct list_head *,
> >     unsigned int,
> >     >> size_t, int);
> >     >>
> >     >>  #endif /* CONFIG_PNFS */
> >     >>
> >     >> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> >     >> index 4231915..45624d0 100644
> >     >> --- a/fs/nfs/write.c
> >     >> +++ b/fs/nfs/write.c
> >     >> @@ -934,7 +934,7 @@ out_bad:
> >     >>   * This is the case if nfs_updatepage detects a conflicting
> >     request
> >     >>   * that has been written but not committed.
> >     >>   */
> >     >> -static int nfs_flush_one(struct inode *inode, struct list_head
> >     >> *head, unsigned int npages, size_t count, int how)
> >     >> +int nfs_flush_one(struct inode *inode, struct list_head *head,
> >     >> unsigned int npages, size_t count, int how)
> >     >>  {
> >     >>      struct nfs_page        *req;
> >     >>      struct page        **pages;
> >     >> @@ -986,7 +986,11 @@ static void nfs_pageio_init_write(struct
> >     >> nfs_pageio_descriptor *pgio,
> >     >>      if (wsize < PAGE_CACHE_SIZE)
> >     >>          nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize,
> >     ioflags);
> >     >>      else
> >     >> +#ifdef CONFIG_PNFS
> >     >> +        nfs_pageio_init(pgio, inode, pnfs_flush_one, wsize,
> >     ioflags);
> >     >> +#else
> >     >>          nfs_pageio_init(pgio, inode, nfs_flush_one, wsize,
> >     ioflags);
> >     >> +#endif /* CONFIG_PNFS */
> >     >>  }
> >     >>
> >     >>  #ifdef CONFIG_PNFS
> >     >> @@ -1678,6 +1682,7 @@ EXPORT_SYMBOL(nfs_execute_write);
> >     >>  EXPORT_SYMBOL(nfs_write_validate);
> >     >>  EXPORT_SYMBOL(nfs_writedata_release);
> >     >>  EXPORT_SYMBOL(nfs_flush_task_priority);
> >     >> +EXPORT_SYMBOL(nfs_flush_one);
> >     >>  EXPORT_SYMBOL(nfs_commit_rpcsetup);
> >     >>  EXPORT_SYMBOL(nfs_initiate_write);
> >     >>  EXPORT_SYMBOL(nfs_initiate_commit);
> >     >> diff --git a/include/linux/nfs4_pnfs.h
> b/include/linux/nfs4_pnfs.h
> >     >> index c5dd321..83aefc5 100644
> >     >> --- a/include/linux/nfs4_pnfs.h
> >     >> +++ b/include/linux/nfs4_pnfs.h
> >     >> @@ -45,6 +45,8 @@ struct layoutdriver_io_operations {
> >     >>       */
> >     >>      ssize_t (*read_pagelist) (struct pnfs_layout_type *layoutid,
> >     >> struct inode *, struct page **pages, unsigned int pgbase,
> unsigned
> >     >> nr_pages, loff_t offset, size_t count, struct nfs_read_data
> >     *nfs_data);
> >     >>      ssize_t (*write_pagelist) (struct pnfs_layout_type
> *layoutid,
> >     >> struct inode *, struct page **pages, unsigned int pgbase,
> unsigned
> >     >> nr_pages, loff_t offset, size_t count, int sync, struct
> >     >> nfs_write_data *nfs_data);
> >     >> +    int (*flush_one) (struct inode *inode, struct list_head
> *head,
> >     >> unsigned int npages, size_t count, int how);
> >     >> +
> >     >>
> >     >>      /* Functions that do not use the pagecache.
> >     >>       * If use_pagecache == 0, then these functions must be
> >     implemented.
> >     >> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
> >     >> index fe82716..cc5f1d0 100644
> >     >> --- a/include/linux/nfs_page.h
> >     >> +++ b/include/linux/nfs_page.h
> >     >> @@ -47,6 +47,7 @@ struct nfs_page {
> >     >>  #ifdef CONFIG_PNFS
> >     >>      unsigned int        wb_devip;    /* pNFS data server IP
> >     addr */
> >     >>      unsigned int        wb_devport;    /* pNFS data server
> >     port */
> >     >> +    void            *wb_private;
> >     >>  #endif
> >     >>  };
> >     >>
> >     >>
> >     _______________________________________________
> >     pNFS mailing list
> >     pNFS at linux-nfs.org <mailto:pNFS at linux-nfs.org>
> >     http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
> >
> >
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://linux-nfs.org/pipermail/pnfs/attachments/20080115/6b8d8486/attachment-0001.htm 


More information about the pNFS mailing list