[pnfs] [PATCH 03/11] pnfs: pnfs_write_begin

Benny Halevy bhalevy at panasas.com
Sun Apr 13 09:48:48 EDT 2008


On Apr. 12, 2008, 0:12 +0300, Dean Hildebrand <seattleplus at gmail.com> wrote:
> 
> Fred Isaman wrote:
>> From: Fred <iisaman at citi.umich.edu>
>>
>> Add hooks in the nfs_write_begin path, giving a driver the potential
>> to read in page data around the data that is about to be copied to the page.
>>
>> Signed-off-by: Fred Isaman <iisaman at citi.umich.edu>
>> Signed-off-by: Benny Halevy <bhalevy at panasas.com>
>> ---
>>  fs/nfs/file.c             |   16 +++++++--
>>  fs/nfs/pnfs.c             |   74 +++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfs/pnfs.h             |   53 ++++++++++++++++++++++++++++++++
>>  fs/nfs/write.c            |    2 +-
>>  include/linux/nfs4_pnfs.h |   14 ++++++++-
>>  5 files changed, 153 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index aa1644e..f4c1d6b 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -350,10 +350,17 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
>>  	*pagep = page;
>>  
>>  	ret = nfs_flush_incompatible(file, page);
>> -	if (ret) {
>> -		unlock_page(page);
>> -		page_cache_release(page);
>> -	}
>> +	if (ret)
>> +		goto out_err;
>> +	ret = pnfs_write_begin(file, page, pos, len, flags, fsdata);
>> +	if (ret)
>> +		goto out_err;
>> +	return 0;
>> +
>> + out_err:
>> +	unlock_page(page);
>> +	page_cache_release(page);
>> +	*pagep = NULL;
>>  	return ret;
>>  }
>>  
>> @@ -370,6 +377,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
>>  
>>  	unlock_page(page);
>>  	page_cache_release(page);
>> +	pnfs_write_end_cleanup(fsdata);
>>  
>>  	if (status < 0)
>>  		return status;
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index ee2f10a..7be8bbe 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -807,6 +807,28 @@ pnfs_has_layout(struct pnfs_layout_type *lo,
>>  	return ret;
>>  }
>>  
>> +static struct pnfs_layout_segment *
>> +pnfs_find_get_lseg(struct inode *inode,
>> +		   loff_t pos,
>> +		   size_t count,
>> +		   enum pnfs_iomode iomode)
>> +{
>> +	struct nfs_inode *nfsi = NFS_I(inode);
>> +	struct pnfs_layout_segment *lseg;
>> +	struct pnfs_layout_type *lo;
>> +	struct nfs4_pnfs_layout_segment range;
>> +
>> +	lo = get_lock_current_layout(nfsi);
>> +	if (!lo)
>> +		return NULL;
>> +	range.iomode = iomode;
>> +	range.offset = pos;
>> +	range.length = count;
>> +	lseg = pnfs_has_layout(lo, &range, 1);
>> +	put_unlock_current_layout(nfsi, lo);
>> +	return lseg;
>> +}
>> +
>>  /* Update the file's layout for the given range and iomode.
>>   * Layout is retreived from the server if needed.
>>   * If lsegpp is given, the appropriate layout segment is referenced and
>> @@ -1544,6 +1566,53 @@ int _pnfs_try_to_read_data(struct nfs_read_data *data,
>>  	}
>>  }
>>  
>> +/*
>> + * This gives the layout driver an opportunity to read in page "around"
>> + * the data to be written.  It returns 0 on success, otherwise an error code
>> + * which will either be passed up to user, or ignored if
>> + * some previous part of write succeeded.
>> + * Note the range [pos, pos+len-1] is entirely within the page.
>> + */
>> +/* flags = AOP_FLAG_UNINTERRUPTIBLE or 0 - ??? need this ??? */

If we don't need it (and we currently make no use of it) let's take
it out of the API.  We certainly don't need to start gaining fat.

>> +int _pnfs_write_begin(struct inode *inode, struct nfs_server *nfss,

in the effort to keep parameter count low (as we did in other API calls)
we can also take nfss out, and use NFS_SERVER(inode) instead.
Dereferencing the inode pointer costs pretty much the same as accessing
the extra variable on stack (assuming the cache line holding it is still cached)
and wastes fewer cycles on pushing the value.

>> +		      struct page *page, loff_t pos, unsigned len,
>> +		      unsigned flags, struct pnfs_fsdata **fsdata)
>> +{
>> +	struct pnfs_layout_segment *lseg;
>> +	int status = 0;
>> +
>> +	*fsdata = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
>> +	if (!*fsdata)
>> +		return -ENOMEM;
>> +	lseg = pnfs_find_get_lseg(inode, pos, len, IOMODE_RW);
>> +	if (lseg) {
>> +		status = nfss->pnfs_curr_ld->ld_io_ops->write_begin(lseg, page,
>> +								    pos, len,
>> +								    *fsdata);
>>   
> If status != E_OK on this line, will fsdata get freed?  (is 
> pnfs_free_data guaranteed to be called?)
> Dean

Dean's right. write_end isn't called when write_begin returns an error,
therefore we must clean up locally in the error case.

Fred, what about the lseg == NULL case? (as returned from pnfs_find_get_lseg)
Although we return success and fsdata will eventually get freed via
nfs_write_end -> pnfs_write_end_cleanup it has no use and better be freed
early on.

>> +		put_lseg(lseg);
>> +	}
>> +	return status;

How about this:

	*fsdata = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
	if (!*fsdata)
		return -ENOMEM;
	lseg = pnfs_find_get_lseg(inode, pos, len, IOMODE_RW);
	if (lseg) {
		status = nfss->pnfs_curr_ld->ld_io_ops->write_begin(lseg, page,
								    pos, len,
								    *fsdata);
		put_lseg(lseg);
		if (!status)
			return 0;
	}
	kfree(*fsdata);
	*fsdata = NULL;
	return status;

>> +}
>> +
>> +/* Given an nfs request, determine if it should be flushed before proceeding.
>> + * It should default to returning False, returning True only if there is a
>> + * specific reason to flush.
>> + */
>> +int _pnfs_do_flush(struct inode *inode, struct nfs_server *nfss,

Same here regarding nfss (even passing only req should be enough)

>> +		   struct nfs_page *req, struct pnfs_fsdata *fsdata)
>> +{
>> +	struct pnfs_layout_segment *lseg;
>> +	loff_t pos = (req->wb_index << PAGE_CACHE_SHIFT) + req->wb_offset;
>> +	int status = 0;
>> +
>> +	lseg = pnfs_find_get_lseg(inode, pos, req->wb_bytes, IOMODE_RW);
>> +	/* Note that lseg==NULL may be useful info for do_flush */
>> +	status = nfss->pnfs_curr_ld->ld_policy_ops->do_flush(lseg, req,
>> +							     fsdata);
>> +	put_lseg(lseg);
>> +	return status;
>> +}
>> +
>>  int _pnfs_try_to_write_data(struct nfs_write_data *data,
>>  			    const struct rpc_call_ops *call_ops, int how)
>>  {
>> @@ -1864,6 +1933,11 @@ void pnfs_free_request_data(struct nfs_page *req)
>>  		lo->free_request_data(req);
>>  }
>>  
>> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
>> +{
>> +	kfree(fsdata);
>> +}
>> +
>>  /* Callback operations for layout drivers.
>>   */
>>  struct pnfs_client_operations pnfs_ops = {
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 3014f34..0c63fea 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -55,10 +55,16 @@ void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
>>  void pnfs_update_layout_commit(struct inode *, struct list_head *, pgoff_t, unsigned int);
>>  int pnfs_flush_one(struct inode *, struct list_head *, unsigned int, size_t, int);
>>  void pnfs_free_request_data(struct nfs_page *req);
>> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
>>  ssize_t pnfs_file_write(struct file *, const char __user *, size_t, loff_t *);
>>  void pnfs_get_layout_done(struct pnfs_layout_type *,
>>  			  struct nfs4_pnfs_layoutget *, int);
>>  void pnfs_layout_release(struct pnfs_layout_type *);
>> +int _pnfs_write_begin(struct inode *inode, struct nfs_server *nfss,
>> +		      struct page *page, loff_t pos, unsigned len,
>> +		      unsigned flags, struct pnfs_fsdata **fsdata);
>> +int _pnfs_do_flush(struct inode *inode, struct nfs_server *nfss,
>> +		   struct nfs_page *req, struct pnfs_fsdata *fsdata);
>>  
>>  #define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld &&	\
>>  				     (srv)->pnfs_curr_ld->ld_io_ops &&	\
>> @@ -109,6 +115,37 @@ static inline int pnfs_try_to_commit(struct nfs_write_data *data)
>>  	return 1;
>>  }
>>  
>> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
>> +				   loff_t pos, unsigned len, unsigned flags,
>> +				   void **fsdata)
>> +{
>> +	struct inode *inode = filp->f_dentry->d_inode;
>> +	struct nfs_server *nfss = NFS_SERVER(inode);
>> +	struct pnfs_fsdata *pnfs_fsdata = NULL;
>> +	int status = 0;
>> +
>> +	if (PNFS_EXISTS_LDIO_OP(nfss, write_begin))
>> +		status = _pnfs_write_begin(inode, nfss, page, pos, len, flags,
>> +					   &pnfs_fsdata);
>> +	*fsdata = pnfs_fsdata; /* fsdata should never be NULL */

Why not just pass fsdata on to _pnfs_write_begin as (struct pnfs_fsdata **)fsdata?
(The type cast isn't even needed as the *fsdata, as a void pointer can
be passed on to the LD's write_begin method as struct pnfs_fsdata *
with further casting, so _pnfs_write_begin can just accept void **fsdata)

>> +	return status;
>> +}
>> +
>> +static inline int pnfs_do_flush(struct nfs_page *req, void *fsdata)
>> +{
>> +	struct inode *inode = req->wb_context->path.dentry->d_inode;
>> +	struct nfs_server *nfss = NFS_SERVER(inode);

I'd add an empty line here, between the variable definitions to the
function code.

>> +	if (PNFS_EXISTS_LDPOLICY_OP(nfss, do_flush))
>> +		return _pnfs_do_flush(inode, nfss, req, fsdata);

The nfs_server argument is completely redundant (and not used in _pnfs_do_flush).
req is enough.

>> +	else
>> +		return 0;
>> +}
>> +
>> +static inline void pnfs_write_end_cleanup(void *fsdata)
>> +{
>> +	pnfs_free_fsdata((struct pnfs_fsdata *) fsdata);

No need to cast the void pointer.

>> +}
>> +
>>  #else  /* CONFIG_PNFS */
>>  
>>  static inline int pnfs_try_to_read_data(struct nfs_read_data *data,
>> @@ -129,6 +166,22 @@ static inline int pnfs_try_to_commit(struct nfs_write_data *data)
>>  	return 1;
>>  }
>>  
>> +static inline int pnfs_do_flush(struct nfs_page *req, void *fsdata)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
>> +				   loff_t pos, unsigned len, unsigned flags,
>> +				   void **fsdata)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void pnfs_write_end_cleanup(void *fsdata)
>> +{
>> +}
>> +
>>  #endif /* CONFIG_PNFS */
>>  
>>  #endif /* FS_NFS_PNFS_H */
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index 015e035..5616f48 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -699,7 +699,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>>  		if (req == NULL)
>>  			return 0;
>>  		do_flush = req->wb_page != page || req->wb_context != ctx
>> -			|| !nfs_dirty_request(req);
>> +			|| !nfs_dirty_request(req) || pnfs_do_flush(req, NULL);
>>  		nfs_release_request(req);
>>  		if (!do_flush)
>>  			return 0;
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 654767a..9ed6819 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -48,6 +48,11 @@ struct pnfs_layout_type {
>>  	u8 ld_data[];			/* layout driver private data */
>>  };
>>  
>> +struct pnfs_fsdata {
>> +	int ok_to_use_pnfs;
>> +};
>> +
>> +
>>  static inline struct inode *
>>  PNFS_INODE(struct pnfs_layout_type *lo)
>>  {
>> @@ -133,8 +138,11 @@ struct layoutdriver_io_operations {
>>  			       unsigned nr_pages, loff_t offset, size_t count,
>>  			       int sync, struct nfs_write_data *nfs_data);
>>  	int (*flush_one) (struct pnfs_layout_segment *, struct list_head *head, unsigned int npages, size_t count, int how);
>> +	int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
>> +			    loff_t pos, unsigned count,
>> +			    struct pnfs_fsdata *fsdata);
>>  	void (*free_request_data) (struct nfs_page *);
>> -
>> +	void (*free_fsdata) (struct pnfs_fsdata *data);
>>  
>>  	/* Consistency ops */
>>  	/* 2 problems:
>> @@ -173,6 +181,10 @@ struct layoutdriver_policy_operations {
>>  	/* test for nfs page cache coalescing */
>>  	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
>>  
>> +	/* Test for pre-write request flushing */
>> +	int (*do_flush)(struct pnfs_layout_segment *lseg, struct nfs_page *req,
>> +			struct pnfs_fsdata *fsdata);
>> +
>>  	/* Retreive the block size of the file system.  If gather_across_stripes == 1,
>>  	 * then the file system will gather requests into the block size.
>>  	 * TODO: Where will the layout driver get this info?  It is hard coded in PVFS2.
>>   
> _______________________________________________
> pNFS mailing list
> pNFS at linux-nfs.org
> http://linux-nfs.org/cgi-bin/mailman/listinfo/pnfs



More information about the pNFS mailing list