[pnfs] [PATCH 03/11] pnfs: pnfs_write_begin
Fredric Isaman
iisaman at citi.umich.edu
Mon Apr 14 09:54:26 EDT 2008
On Sun, 13 Apr 2008, Benny Halevy wrote:
> On Apr. 12, 2008, 0:12 +0300, Dean Hildebrand <seattleplus at gmail.com> wrote:
>>
>> Fred Isaman wrote:
>>> From: Fred <iisaman at citi.umich.edu>
>>>
>>> Add hooks in the nfs_write_begin path, giving a driver the potential
>>> to read in page data around the data that is about to be copied to the page.
>>>
>>> Signed-off-by: Fred Isaman <iisaman at citi.umich.edu>
>>> Signed-off-by: Benny Halevy <bhalevy at panasas.com>
>>> ---
>>> fs/nfs/file.c | 16 +++++++--
>>> fs/nfs/pnfs.c | 74 +++++++++++++++++++++++++++++++++++++++++++++
>>> fs/nfs/pnfs.h | 53 ++++++++++++++++++++++++++++++++
>>> fs/nfs/write.c | 2 +-
>>> include/linux/nfs4_pnfs.h | 14 ++++++++-
>>> 5 files changed, 153 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>>> index aa1644e..f4c1d6b 100644
>>> --- a/fs/nfs/file.c
>>> +++ b/fs/nfs/file.c
>>> @@ -350,10 +350,17 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
>>> *pagep = page;
>>>
>>> ret = nfs_flush_incompatible(file, page);
>>> - if (ret) {
>>> - unlock_page(page);
>>> - page_cache_release(page);
>>> - }
>>> + if (ret)
>>> + goto out_err;
>>> + ret = pnfs_write_begin(file, page, pos, len, flags, fsdata);
>>> + if (ret)
>>> + goto out_err;
>>> + return 0;
>>> +
>>> + out_err:
>>> + unlock_page(page);
>>> + page_cache_release(page);
>>> + *pagep = NULL;
>>> return ret;
>>> }
>>>
>>> @@ -370,6 +377,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
>>>
>>> unlock_page(page);
>>> page_cache_release(page);
>>> + pnfs_write_end_cleanup(fsdata);
>>>
>>> if (status < 0)
>>> return status;
>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>> index ee2f10a..7be8bbe 100644
>>> --- a/fs/nfs/pnfs.c
>>> +++ b/fs/nfs/pnfs.c
>>> @@ -807,6 +807,28 @@ pnfs_has_layout(struct pnfs_layout_type *lo,
>>> return ret;
>>> }
>>>
>>> +static struct pnfs_layout_segment *
>>> +pnfs_find_get_lseg(struct inode *inode,
>>> + loff_t pos,
>>> + size_t count,
>>> + enum pnfs_iomode iomode)
>>> +{
>>> + struct nfs_inode *nfsi = NFS_I(inode);
>>> + struct pnfs_layout_segment *lseg;
>>> + struct pnfs_layout_type *lo;
>>> + struct nfs4_pnfs_layout_segment range;
>>> +
>>> + lo = get_lock_current_layout(nfsi);
>>> + if (!lo)
>>> + return NULL;
>>> + range.iomode = iomode;
>>> + range.offset = pos;
>>> + range.length = count;
>>> + lseg = pnfs_has_layout(lo, &range, 1);
>>> + put_unlock_current_layout(nfsi, lo);
>>> + return lseg;
>>> +}
>>> +
>>> /* Update the file's layout for the given range and iomode.
>>> * Layout is retreived from the server if needed.
>>> * If lsegpp is given, the appropriate layout segment is referenced and
>>> @@ -1544,6 +1566,53 @@ int _pnfs_try_to_read_data(struct nfs_read_data *data,
>>> }
>>> }
>>>
>>> +/*
>>> + * This gives the layout driver an opportunity to read in page "around"
>>> + * the data to be written. It returns 0 on success, otherwise an error code
>>> + * which will either be passed up to user, or ignored if
>>> + * some previous part of write succeeded.
>>> + * Note the range [pos, pos+len-1] is entirely within the page.
>>> + */
>>> +/* flags = AOP_FLAG_UNINTERRUPTIBLE or 0 - ??? need this ??? */
>
> If we don't need it (and we currently make no use of it) let's take
> it out of the API. We certainly don't need to start gaining fat.
>
OK
>>> +int _pnfs_write_begin(struct inode *inode, struct nfs_server *nfss,
>
> in the effort to keep parameter count low (as we did in other API calls)
> we can also take nfss out, and use NFS_SERVER(inode) instead.
> Dereferencing the inode pointer costs pretty much the same as accessing
> the extra variable on stack (assuming the cache line holding it is still cached)
> and wastes fewer cycles on pushing the value.
>
>>> + struct page *page, loff_t pos, unsigned len,
>>> + unsigned flags, struct pnfs_fsdata **fsdata)
>>> +{
>>> + struct pnfs_layout_segment *lseg;
>>> + int status = 0;
>>> +
>>> + *fsdata = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
>>> + if (!*fsdata)
>>> + return -ENOMEM;
>>> + lseg = pnfs_find_get_lseg(inode, pos, len, IOMODE_RW);
>>> + if (lseg) {
>>> + status = nfss->pnfs_curr_ld->ld_io_ops->write_begin(lseg, page,
>>> + pos, len,
>>> + *fsdata);
>>>
>> If status != E_OK on this line, will fsdata get freed? (is
>> pnfs_free_data guaranteed to be called?)
>> Dean
>
> Dean's right. write_end isn't called when write_begin returns an error,
> therefore we must clean up locally in the error case.
>
> Fred, what about the lseg == NULL case? (as returned from pnfs_find_get_lseg)
> Although we return success and fsdata will eventually get freed via
> nfs_write_end -> pnfs_write_end_cleanup it has no use and better be freed
> early on.
>
>>> + put_lseg(lseg);
>>> + }
>>> + return status;
>
> How about this:
>
> *fsdata = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
> if (!*fsdata)
> return -ENOMEM;
> lseg = pnfs_find_get_lseg(inode, pos, len, IOMODE_RW);
> if (lseg) {
> status = nfss->pnfs_curr_ld->ld_io_ops->write_begin(lseg, page,
> pos, len,
> *fsdata);
> put_lseg(lseg);
> if (!status)
> return 0;
> }
> kfree(*fsdata);
> *fsdata = NULL;
> return status;
>
That looks good.
>>> +}
>>> +
>>> +/* Given an nfs request, determine if it should be flushed before proceeding.
>>> + * It should default to returning False, returning True only if there is a
>>> + * specific reason to flush.
>>> + */
>>> +int _pnfs_do_flush(struct inode *inode, struct nfs_server *nfss,
>
> Same here regarding nfss (even passing only req should be enough)
>
OK
>>> + struct nfs_page *req, struct pnfs_fsdata *fsdata)
>>> +{
>>> + struct pnfs_layout_segment *lseg;
>>> + loff_t pos = (req->wb_index << PAGE_CACHE_SHIFT) + req->wb_offset;
>>> + int status = 0;
>>> +
>>> + lseg = pnfs_find_get_lseg(inode, pos, req->wb_bytes, IOMODE_RW);
>>> + /* Note that lseg==NULL may be useful info for do_flush */
>>> + status = nfss->pnfs_curr_ld->ld_policy_ops->do_flush(lseg, req,
>>> + fsdata);
>>> + put_lseg(lseg);
>>> + return status;
>>> +}
>>> +
>>> int _pnfs_try_to_write_data(struct nfs_write_data *data,
>>> const struct rpc_call_ops *call_ops, int how)
>>> {
>>> @@ -1864,6 +1933,11 @@ void pnfs_free_request_data(struct nfs_page *req)
>>> lo->free_request_data(req);
>>> }
>>>
>>> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
>>> +{
>>> + kfree(fsdata);
>>> +}
>>> +
>>> /* Callback operations for layout drivers.
>>> */
>>> struct pnfs_client_operations pnfs_ops = {
>>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>>> index 3014f34..0c63fea 100644
>>> --- a/fs/nfs/pnfs.h
>>> +++ b/fs/nfs/pnfs.h
>>> @@ -55,10 +55,16 @@ void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>>> void pnfs_update_layout_commit(struct inode *, struct list_head *, pgoff_t, unsigned int);
>>> int pnfs_flush_one(struct inode *, struct list_head *, unsigned int, size_t, int);
>>> void pnfs_free_request_data(struct nfs_page *req);
>>> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
>>> ssize_t pnfs_file_write(struct file *, const char __user *, size_t, loff_t *);
>>> void pnfs_get_layout_done(struct pnfs_layout_type *,
>>> struct nfs4_pnfs_layoutget *, int);
>>> void pnfs_layout_release(struct pnfs_layout_type *);
>>> +int _pnfs_write_begin(struct inode *inode, struct nfs_server *nfss,
>>> + struct page *page, loff_t pos, unsigned len,
>>> + unsigned flags, struct pnfs_fsdata **fsdata);
>>> +int _pnfs_do_flush(struct inode *inode, struct nfs_server *nfss,
>>> + struct nfs_page *req, struct pnfs_fsdata *fsdata);
>>>
>>> #define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \
>>> (srv)->pnfs_curr_ld->ld_io_ops && \
>>> @@ -109,6 +115,37 @@ static inline int pnfs_try_to_commit(struct nfs_write_data *data)
>>> return 1;
>>> }
>>>
>>> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
>>> + loff_t pos, unsigned len, unsigned flags,
>>> + void **fsdata)
>>> +{
>>> + struct inode *inode = filp->f_dentry->d_inode;
>>> + struct nfs_server *nfss = NFS_SERVER(inode);
>>> + struct pnfs_fsdata *pnfs_fsdata = NULL;
>>> + int status = 0;
>>> +
>>> + if (PNFS_EXISTS_LDIO_OP(nfss, write_begin))
>>> + status = _pnfs_write_begin(inode, nfss, page, pos, len, flags,
>>> + &pnfs_fsdata);
>>> + *fsdata = pnfs_fsdata; /* fsdata should never be NULL */
>
> Why not just pass fsdata on to _pnfs_write_begin as (struct pnfs_fsdata **)fsdata?
> (The type cast isn't even needed as the *fsdata, as a void pointer can
> be passed on to the LD's write_begin method as struct pnfs_fsdata *
> with further casting, so _pnfs_write_begin can just accept void **fsdata)
>
OK
>>> + return status;
>>> +}
>>> +
>>> +static inline int pnfs_do_flush(struct nfs_page *req, void *fsdata)
>>> +{
>>> + struct inode *inode = req->wb_context->path.dentry->d_inode;
>>> + struct nfs_server *nfss = NFS_SERVER(inode);
>
> I'd add an empty line here, between the variable definitions to the
> function code.
>
OK
>>> + if (PNFS_EXISTS_LDPOLICY_OP(nfss, do_flush))
>>> + return _pnfs_do_flush(inode, nfss, req, fsdata);
>
> The nfs_server argument is completely redundant (and not used in _pnfs_do_flush).
> req is enough.
>
I can remove it as an argument as in the prior case, but it is used (to
dereference the LD op).
>>> + else
>>> + return 0;
>>> +}
>>> +
>>> +static inline void pnfs_write_end_cleanup(void *fsdata)
>>> +{
>>> + pnfs_free_fsdata((struct pnfs_fsdata *) fsdata);
>
> No need to cast the void pointer.
>
OK
>>> +}
>>> +
>>> #else /* CONFIG_PNFS */
>>>
>>> static inline int pnfs_try_to_read_data(struct nfs_read_data *data,
>>> @@ -129,6 +166,22 @@ static inline int pnfs_try_to_commit(struct nfs_write_data *data)
>>> return 1;
>>> }
>>>
>>> +static inline int pnfs_do_flush(struct nfs_page *req, void *fsdata)
>>> +{
>>> + return 0;
>>> +}
>>> +
>>> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
>>> + loff_t pos, unsigned len, unsigned flags,
>>> + void **fsdata)
>>> +{
>>> + return 0;
>>> +}
>>> +
>>> +static inline void pnfs_write_end_cleanup(void *fsdata)
>>> +{
>>> +}
>>> +
>>> #endif /* CONFIG_PNFS */
>>>
>>> #endif /* FS_NFS_PNFS_H */
>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>> index 015e035..5616f48 100644
>>> --- a/fs/nfs/write.c
>>> +++ b/fs/nfs/write.c
>>> @@ -699,7 +699,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>>> if (req == NULL)
>>> return 0;
>>> do_flush = req->wb_page != page || req->wb_context != ctx
>>> - || !nfs_dirty_request(req);
>>> + || !nfs_dirty_request(req) || pnfs_do_flush(req, NULL);
>>> nfs_release_request(req);
>>> if (!do_flush)
>>> return 0;
>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>> index 654767a..9ed6819 100644
>>> --- a/include/linux/nfs4_pnfs.h
>>> +++ b/include/linux/nfs4_pnfs.h
>>> @@ -48,6 +48,11 @@ struct pnfs_layout_type {
>>> u8 ld_data[]; /* layout driver private data */
>>> };
>>>
>>> +struct pnfs_fsdata {
>>> + int ok_to_use_pnfs;
>>> +};
>>> +
>>> +
>>> static inline struct inode *
>>> PNFS_INODE(struct pnfs_layout_type *lo)
>>> {
>>> @@ -133,8 +138,11 @@ struct layoutdriver_io_operations {
>>> unsigned nr_pages, loff_t offset, size_t count,
>>> int sync, struct nfs_write_data *nfs_data);
>>> int (*flush_one) (struct pnfs_layout_segment *, struct list_head *head, unsigned int npages, size_t count, int how);
>>> + int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
>>> + loff_t pos, unsigned count,
>>> + struct pnfs_fsdata *fsdata);
>>> void (*free_request_data) (struct nfs_page *);
>>> -
>>> + void (*free_fsdata) (struct pnfs_fsdata *data);
>>>
>>> /* Consistency ops */
>>> /* 2 problems:
>>> @@ -173,6 +181,10 @@ struct layoutdriver_policy_operations {
>>> /* test for nfs page cache coalescing */
>>> int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
>>>
>>> + /* Test for pre-write request flushing */
>>> + int (*do_flush)(struct pnfs_layout_segment *lseg, struct nfs_page *req,
>>> + struct pnfs_fsdata *fsdata);
>>> +
>>> /* Retreive the block size of the file system. If gather_across_stripes == 1,
>>> * then the file system will gather requests into the block size.
>>> * TODO: Where will the layout driver get this info? It is hard coded in PVFS2.
>>>
>> _______________________________________________
>> pNFS mailing list
>> pNFS at linux-nfs.org
>> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs
>
>
More information about the pNFS
mailing list