[pnfs] [PATCH 07/10] objlayout: encode_layoutreturn support

Boaz Harrosh bharrosh at panasas.com
Tue Aug 11 10:46:14 EDT 2009


On 08/11/2009 03:00 PM, Benny Halevy wrote:
> On Aug. 10, 2009, 21:20 +0300, Boaz Harrosh <bharrosh at panasas.com> wrote:
>> An IO state preallocates an error information
>> structure for each posible osd-device that might error
>> during io. When I/O is done if all was well the I/O state
>> is de-allocated. (as today). If the I/O has ended with an
>> error, the io_state is queued on a per-layout err_list.
>> When eventually encode_layoutreturn() is called, each error
>> is properlly encoded on the XDR buffer and only then the
>> io_state is removed from err_list and de-allocated.
>>
>> It is up to the io_engin to fill in the segment that fault
>> and the type of osd_error that accured. By calling new
>> objlayout_io_set_result().
>>
>> [This patch only includes the generic parts, an io_engin
>>  specific patch will follow]
>>
>> Signed-off-by: Boaz Harrosh <bharrosh at panasas.com>
>> ---
>>  fs/nfs/objlayout/objlayout.c    |  107 ++++++++++++++++++++++++++++++++++++--
>>  fs/nfs/objlayout/objlayout.h    |   18 +++++++
>>  fs/nfs/objlayout/pnfs_osd_xdr.c |   60 ++++++++++++++++++++++
>>  fs/nfs/objlayout/pnfs_osd_xdr.h |   24 +++++++--
>>  4 files changed, 198 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
>> index b0de7a9..ff9c8e4 100644
>> --- a/fs/nfs/objlayout/objlayout.c
>> +++ b/fs/nfs/objlayout/objlayout.c
>> @@ -61,6 +61,9 @@ objlayout_alloc_layout(struct pnfs_mount_type *mountid, struct inode *inode)
>>  			  sizeof(struct objlayout), GFP_KERNEL);
>>  	if (pnfslay) {
>>  		struct objlayout *objlay = PNFS_LD_DATA(pnfslay);
>> +
>> +		spin_lock_init(&objlay->lock);
>> +		INIT_LIST_HEAD(&objlay->err_list);
>>  		objlayout_atomic64_init(&objlay->delta_space_used);
>>  	}
>>  	dprintk("%s: Return %p\n", __func__, pnfslay);
>> @@ -73,7 +76,11 @@ objlayout_alloc_layout(struct pnfs_mount_type *mountid, struct inode *inode)
>>  static void
>>  objlayout_free_layout(struct pnfs_layout_type *pnfslay)
>>  {
>> +	struct objlayout *objlay = PNFS_LD_DATA(pnfslay);
>> +
>>  	dprintk("%s: pnfslay %p\n", __func__, pnfslay);
>> +
>> +	WARN_ON(!list_empty(&objlay->err_list));
>>  	kfree(pnfslay);
>>  }
>>  
>> @@ -197,6 +204,7 @@ objlayout_alloc_io_state(struct pnfs_layout_type *pnfs_layout_type,
>>  	if (nr_pages > size_nr_pages)
>>  		nr_pages = size_nr_pages;
>>  
>> +	INIT_LIST_HEAD(&state->err_list);
>>  	state->lseg = lseg;
>>  	state->rpcdata = rpcdata;
>>  	state->pages = pages;
>> @@ -220,16 +228,53 @@ objlayout_free_io_state(struct objlayout_io_state *state)
>>  }
>>  
>>  /*
>> - * I/O done
>> - *
>> - * Dereference a layout segment and decrement io in-progress counter
>> - * Free layout segment is ref count reached zero
>> + * I/O done common code
>>   */
>>  static void
>>  objlayout_iodone(struct objlayout_io_state *state)
>>  {
>> -	dprintk("%s: state %p\n", __func__, state);
>> -	objlayout_free_io_state(state);
>> +	dprintk("%s: state %p status\n", __func__, state);
>> +
>> +	if (likely(state->status >= 0)) {
>> +		objlayout_free_io_state(state);
>> +	} else {
>> +		struct objlayout *objlay = PNFS_LD_DATA(state->lseg->layout);
>> +
>> +		spin_lock(&objlay->lock);
>> +		list_add(&objlay->err_list, &state->err_list);
>> +		spin_unlock(&objlay->lock);
>> +	}
>> +}
>> +
>> +void objlayout_io_set_result(struct objlayout_io_state *state,
>> +	unsigned index, int status, int osd_error,
>> +	u64 offset, u64 length, bool is_write)
>> +{
>> +	struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
>> +
>> +	BUG_ON(index >= state->num_comps);
>> +	state->status = status;
>> +	if (unlikely(status <= 0)) {
>> +		struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg);
>> +		struct pnfs_osd_layout *layout =
>> +				(typeof(layout))objlseg->pnfs_osd_layout;
>> +
>> +		ioerr->oer_component = layout->olo_comps[index].oc_object_id;
>> +		ioerr->oer_comp_offset = offset;
>> +		ioerr->oer_comp_length = length;
>> +		ioerr->oer_iswrite = is_write;
>> +		ioerr->oer_errno = osd_error;
>> +
>> +		dprintk("%s: err[%d]: errno=%d is_write=%d obj=0x%llx "
>> +			"offset=0x%llx length=0x%llx\n",
>> +			__func__, index, ioerr->oer_errno,
>> +			ioerr->oer_iswrite,
>> +			ioerr->oer_component.oid_object_id,
>> +			ioerr->oer_comp_offset,
>> +			ioerr->oer_comp_length);
>> +	} else {
>> +		ioerr->oer_errno = 0;
>> +	}
>>  }
>>  
>>  /*
>> @@ -472,6 +517,55 @@ out:
>>  	dprintk("%s: Return\n", __func__);
>>  }
>>  
>> +void
>> +objlayout_encode_layoutreturn(struct xdr_stream *xdr,
>> +		    const struct nfs4_pnfs_layoutreturn_arg *args)
>> +{
>> +	struct objlayout *objlay =
>> +			PNFS_LD_DATA(NFS_I(args->inode)->current_layout);
>> +	struct objlayout_io_state *state, *tmp;
>> +	u32 *start;
>> +
>> +	dprintk("%s: Begin\n", __func__);
>> +	start = xdr_reserve_space(xdr, 4);
>> +	if (WARN_ON(!start))
>> +		return;
>> +
>> +	spin_lock(&objlay->lock);
>> +
>> +	list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
>> +		unsigned i;
>> +		int res = 0;
>> +
>> +		for (i = 0; i < state->num_comps && !res; i++) {
>> +			if (!state->ioerrs[i].oer_errno)
>> +				continue;
>> +
>> +			dprintk("%s: err[%d]: errno=%d is_write=%d obj=0x%llx "
>> +				"offset=0x%llx length=0x%llx\n",
>> +				__func__, i, state->ioerrs[i].oer_errno,
>> +				state->ioerrs[i].oer_iswrite,
>> +				state->ioerrs[i].oer_component.oid_object_id,
>> +				state->ioerrs[i].oer_comp_offset,
>> +				state->ioerrs[i].oer_comp_length);
>> +
>> +			res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
>> +		}
>> +
>> +		list_del(&state->err_list);
>> +		objlayout_free_io_state(state);
>> +		state = NULL; /* FIXME: for debugging */
>> +		if (res) {
>> +			dprintk("Error! pnfs_osd_xdr_encode_ioerr\n");
>> +			break;
>> +		}
>> +	}
>> +	spin_unlock(&objlay->lock);
>> +
>> +	*start = htonl((xdr->p - start - 1) * 4);
>> +	dprintk("%s: Return\n", __func__);
>> +}
>> +
>>  struct objlayout_deviceinfo {
>>  	struct page *page;
>>  	struct pnfs_osd_deviceaddr da; /* This must be last */
>> @@ -588,6 +682,7 @@ struct layoutdriver_io_operations objlayout_io_operations = {
>>  	.free_lseg               = objlayout_free_lseg,
>>  	.setup_layoutcommit      = objlayout_setup_layoutcommit,
>>  	.cleanup_layoutcommit    = objlayout_cleanup_layoutcommit,
>> +	.encode_layoutreturn     = objlayout_encode_layoutreturn,
>>  	.initialize_mountpoint   = objlayout_initialize_mountpoint,
>>  	.uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
>>  };
>> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
>> index d6a5e36..ebf2862 100644
>> --- a/fs/nfs/objlayout/objlayout.h
>> +++ b/fs/nfs/objlayout/objlayout.h
>> @@ -138,7 +138,11 @@ static inline s64 objlayout_atomic64_xchg(struct objlayout_atomic64 *p, s64 val)
>>   * per-inode layout
>>   */
>>  struct objlayout {
>> +	 /* for layout_commit */
>>  	struct objlayout_atomic64 delta_space_used;  /* consumed by write ops */
>> +	 /* for layout_return */
>> +	spinlock_t lock;
>> +	struct list_head err_list;
>>  };
>>  
>>  /*
>> @@ -160,6 +164,16 @@ struct objlayout_io_state {
>>  	int eof;                /* res */
>>  	int committed;          /* res */
>>  	s64 delta_space_used;   /* res */
>> +
>> +	/* Error reporting (layout_return) */
>> +	struct list_head err_list;
>> +	unsigned num_comps;
>> +	/* Variable array of error descriptors per device.
>> +	 * It should contain as meany entries as devices in the osd_layout
>> +	 * that participate in the I/O. It is upto the io_engine to allocate
>> +	 * needed space. (This member must be kept last).
>> +	 */
>> +	struct pnfs_osd_ioerr ioerrs[1];
>>  };
>>  
>>  /*
>> @@ -184,6 +198,10 @@ extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
>>  /*
>>   * callback API
>>   */
>> +extern void objlayout_io_set_result(struct objlayout_io_state *state,
>> +	unsigned index, int status, int osd_error,
>> +	u64 offset, u64 length, bool is_write);
>> +
>>  extern void objlayout_read_done(struct objlayout_io_state *state, bool sync);
>>  extern void objlayout_write_done(struct objlayout_io_state *state, bool sync);
>>  
>> diff --git a/fs/nfs/objlayout/pnfs_osd_xdr.c b/fs/nfs/objlayout/pnfs_osd_xdr.c
>> index 5f69737..5fdc794 100644
>> --- a/fs/nfs/objlayout/pnfs_osd_xdr.c
>> +++ b/fs/nfs/objlayout/pnfs_osd_xdr.c
>> @@ -342,3 +342,63 @@ void pnfs_osd_xdr_decode_deviceaddr(
>>  
>>  	__xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
>>  }
>> +
>> +#define WRITE32(n)               *p++ = htonl(n)
>> +#define WRITE64(n)               do {		\
>> +	*p++ = htonl((uint32_t)((n) >> 32));	\
>> +	*p++ = htonl((uint32_t)(n));		\
>> +} while (0)
> 
> Note that we want to move away from using these macros in
> the future, but let's do that once we have the full api
> in place and we've converted the main xdr code.
> 

Sure, am waiting for the simple stuff to be available.

>> +
>> +/* draft-ietf-nfsv4-pnfs-obj-12
>> + * struct pnfs_osd_objid4 {
>> + *     deviceid4       oid_device_id;
>> + *     uint64_t        oid_partition_id;
>> + *     uint64_t        oid_object_id;
>> + * };
>> + */
>> +static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
>> +					    struct pnfs_osd_objid *object_id)
>> +{
>> +	u32 *p;
> 
> This should be __be32 *p;
> 

The all file and that other one should be fixed at once, I'll send a SQUASHME
to the complete code.

>> +
>> +	p = xdr_reserve_space(xdr, 32);
>> +	if (!p)
>> +		return -E2BIG;
>> +
>> +	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id,
>> +				    sizeof(object_id->oid_device_id));
> 
> Please use object_id->oid_device_id.data rather than the struct itself
> as we do in all other places (e.g. encode_getdeviceinfo).
> 

Yes

>> +	WRITE64(object_id->oid_partition_id);
>> +	WRITE64(object_id->oid_object_id);
>> +
>> +	return 0;
>> +}
>> +
> 
> let's keep a note that this comes from draft-ietf-nfsv4-pnfs-obj-12
> 

I have a patch in queue that fixes all these comments.
* It states once at head of file the "from draft-ietf-nfsv4-pnfs-obj-12"
* It converts all comments to the .c-structures and not the .xdr ones
* It makes all spaces/tabs consistent. So this will also be fixed there

>> +/*   struct pnfs_osd_ioerr4 {
>> + *       pnfs_osd_objid4     oer_component;
>> + *       length4             oer_comp_offset;
>> + *       length4             oer_comp_length;
>> + *       bool                oer_iswrite;
>> + *       pnfs_osd_errno4     oer_errno;
>> + *   };
>> + */
>> +int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
>> +			      struct pnfs_osd_ioerr *ioerr)
>> +{
>> +	u32 *p;
> 
> __be32 *p
> 

One patch to convert all the osd_xdr files, coming next

>> +	int ret;
>> +
>> +	ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
>> +	if (ret)
>> +		return ret;
>> +
>> +	p = xdr_reserve_space(xdr, 24);
>> +	if (!p)
>> +		return -E2BIG;
>> +
>> +	WRITE64(ioerr->oer_comp_offset);
>> +	WRITE64(ioerr->oer_comp_length);
>> +	WRITE32(ioerr->oer_iswrite);
>> +	WRITE32(ioerr->oer_errno);
>> +
>> +	return 0;
>> +}
>> diff --git a/fs/nfs/objlayout/pnfs_osd_xdr.h b/fs/nfs/objlayout/pnfs_osd_xdr.h
>> index 5ef16f3..66ee112 100644
>> --- a/fs/nfs/objlayout/pnfs_osd_xdr.h
>> +++ b/fs/nfs/objlayout/pnfs_osd_xdr.h
>> @@ -374,13 +374,19 @@ enum pnfs_osd_errno {
>>   *   };
>>   */
>>  struct pnfs_osd_ioerr {
>> -    struct pnfs_osd_objid		oer_component;
>> -    u64					oer_comp_offset;
>> -    u64					oer_comp_length;
>> -    u32					oer_iswrite;
>> -    u32					oer_errno;
>> +	struct pnfs_osd_objid	oer_component;
>> +	u64			oer_comp_offset;
>> +	u64			oer_comp_length;
>> +	u32			oer_iswrite;
>> +	u32			oer_errno;
>>  };
>>  
>> +static inline unsigned
>> +pnfs_osd_ioerr_xdr_sz(void)
> 
> for symmetry reasons please get a void *
> and pass it on to pnfs_osd_objid_xdr_sz
> 

I have also fixed all xx_sz that are constant to not receive
a pointer so pnfs_osd_objid_xdr_sz will change, not this one.

>> +{
>> +	return pnfs_osd_objid_xdr_sz(NULL) + 2 + 2 + 1 + 1;
>> +}
>> +
>>  /* OSD XDR API */
>>  
>>  /* Layout helpers */
>> @@ -409,4 +415,12 @@ extern void pnfs_osd_xdr_decode_deviceaddr(
>>  extern int pnfs_osd_xdr_encode_deviceaddr(
>>  	u32 **pp, u32 *end, struct pnfs_osd_deviceaddr *devaddr);
>>  
>> +/* osd_ioerr Info helpers (layout_return) */
> 
> Looks like "Info" got cut'n'pasted from "Device Info"
> 
> /* Client xdr encoding of osd_ioerror */ would be enough
> 
>> +
>> +extern int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
>> +				     struct pnfs_osd_ioerr *ioerr);
>> +
>> +/* For Servers */
> 
> How about /* Server xdr decoding of osd_ioerror */?
> 
> Benny
> 
>> +extern u32 *pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, u32 *p);
>> +
>>  #endif /* _PANLAYOUT_PNFS_OSD_XDR_H */

Boaz


More information about the pNFS mailing list