[pnfs] [PATCH 07/10] objlayout: encode_layoutreturn support
Boaz Harrosh
bharrosh at panasas.com
Tue Aug 11 10:46:14 EDT 2009
On 08/11/2009 03:00 PM, Benny Halevy wrote:
> On Aug. 10, 2009, 21:20 +0300, Boaz Harrosh <bharrosh at panasas.com> wrote:
>> An IO state preallocates an error information
>> structure for each posible osd-device that might error
>> during io. When I/O is done if all was well the I/O state
>> is de-allocated. (as today). If the I/O has ended with an
>> error, the io_state is queued on a per-layout err_list.
>> When eventually encode_layoutreturn() is called, each error
>> is properlly encoded on the XDR buffer and only then the
>> io_state is removed from err_list and de-allocated.
>>
>> It is up to the io_engin to fill in the segment that fault
>> and the type of osd_error that accured. By calling new
>> objlayout_io_set_result().
>>
>> [This patch only includes the generic parts, an io_engin
>> specific patch will follow]
>>
>> Signed-off-by: Boaz Harrosh <bharrosh at panasas.com>
>> ---
>> fs/nfs/objlayout/objlayout.c | 107 ++++++++++++++++++++++++++++++++++++--
>> fs/nfs/objlayout/objlayout.h | 18 +++++++
>> fs/nfs/objlayout/pnfs_osd_xdr.c | 60 ++++++++++++++++++++++
>> fs/nfs/objlayout/pnfs_osd_xdr.h | 24 +++++++--
>> 4 files changed, 198 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
>> index b0de7a9..ff9c8e4 100644
>> --- a/fs/nfs/objlayout/objlayout.c
>> +++ b/fs/nfs/objlayout/objlayout.c
>> @@ -61,6 +61,9 @@ objlayout_alloc_layout(struct pnfs_mount_type *mountid, struct inode *inode)
>> sizeof(struct objlayout), GFP_KERNEL);
>> if (pnfslay) {
>> struct objlayout *objlay = PNFS_LD_DATA(pnfslay);
>> +
>> + spin_lock_init(&objlay->lock);
>> + INIT_LIST_HEAD(&objlay->err_list);
>> objlayout_atomic64_init(&objlay->delta_space_used);
>> }
>> dprintk("%s: Return %p\n", __func__, pnfslay);
>> @@ -73,7 +76,11 @@ objlayout_alloc_layout(struct pnfs_mount_type *mountid, struct inode *inode)
>> static void
>> objlayout_free_layout(struct pnfs_layout_type *pnfslay)
>> {
>> + struct objlayout *objlay = PNFS_LD_DATA(pnfslay);
>> +
>> dprintk("%s: pnfslay %p\n", __func__, pnfslay);
>> +
>> + WARN_ON(!list_empty(&objlay->err_list));
>> kfree(pnfslay);
>> }
>>
>> @@ -197,6 +204,7 @@ objlayout_alloc_io_state(struct pnfs_layout_type *pnfs_layout_type,
>> if (nr_pages > size_nr_pages)
>> nr_pages = size_nr_pages;
>>
>> + INIT_LIST_HEAD(&state->err_list);
>> state->lseg = lseg;
>> state->rpcdata = rpcdata;
>> state->pages = pages;
>> @@ -220,16 +228,53 @@ objlayout_free_io_state(struct objlayout_io_state *state)
>> }
>>
>> /*
>> - * I/O done
>> - *
>> - * Dereference a layout segment and decrement io in-progress counter
>> - * Free layout segment is ref count reached zero
>> + * I/O done common code
>> */
>> static void
>> objlayout_iodone(struct objlayout_io_state *state)
>> {
>> - dprintk("%s: state %p\n", __func__, state);
>> - objlayout_free_io_state(state);
>> + dprintk("%s: state %p status\n", __func__, state);
>> +
>> + if (likely(state->status >= 0)) {
>> + objlayout_free_io_state(state);
>> + } else {
>> + struct objlayout *objlay = PNFS_LD_DATA(state->lseg->layout);
>> +
>> + spin_lock(&objlay->lock);
>> + list_add(&objlay->err_list, &state->err_list);
>> + spin_unlock(&objlay->lock);
>> + }
>> +}
>> +
>> +void objlayout_io_set_result(struct objlayout_io_state *state,
>> + unsigned index, int status, int osd_error,
>> + u64 offset, u64 length, bool is_write)
>> +{
>> + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
>> +
>> + BUG_ON(index >= state->num_comps);
>> + state->status = status;
>> + if (unlikely(status <= 0)) {
>> + struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg);
>> + struct pnfs_osd_layout *layout =
>> + (typeof(layout))objlseg->pnfs_osd_layout;
>> +
>> + ioerr->oer_component = layout->olo_comps[index].oc_object_id;
>> + ioerr->oer_comp_offset = offset;
>> + ioerr->oer_comp_length = length;
>> + ioerr->oer_iswrite = is_write;
>> + ioerr->oer_errno = osd_error;
>> +
>> + dprintk("%s: err[%d]: errno=%d is_write=%d obj=0x%llx "
>> + "offset=0x%llx length=0x%llx\n",
>> + __func__, index, ioerr->oer_errno,
>> + ioerr->oer_iswrite,
>> + ioerr->oer_component.oid_object_id,
>> + ioerr->oer_comp_offset,
>> + ioerr->oer_comp_length);
>> + } else {
>> + ioerr->oer_errno = 0;
>> + }
>> }
>>
>> /*
>> @@ -472,6 +517,55 @@ out:
>> dprintk("%s: Return\n", __func__);
>> }
>>
>> +void
>> +objlayout_encode_layoutreturn(struct xdr_stream *xdr,
>> + const struct nfs4_pnfs_layoutreturn_arg *args)
>> +{
>> + struct objlayout *objlay =
>> + PNFS_LD_DATA(NFS_I(args->inode)->current_layout);
>> + struct objlayout_io_state *state, *tmp;
>> + u32 *start;
>> +
>> + dprintk("%s: Begin\n", __func__);
>> + start = xdr_reserve_space(xdr, 4);
>> + if (WARN_ON(!start))
>> + return;
>> +
>> + spin_lock(&objlay->lock);
>> +
>> + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
>> + unsigned i;
>> + int res = 0;
>> +
>> + for (i = 0; i < state->num_comps && !res; i++) {
>> + if (!state->ioerrs[i].oer_errno)
>> + continue;
>> +
>> + dprintk("%s: err[%d]: errno=%d is_write=%d obj=0x%llx "
>> + "offset=0x%llx length=0x%llx\n",
>> + __func__, i, state->ioerrs[i].oer_errno,
>> + state->ioerrs[i].oer_iswrite,
>> + state->ioerrs[i].oer_component.oid_object_id,
>> + state->ioerrs[i].oer_comp_offset,
>> + state->ioerrs[i].oer_comp_length);
>> +
>> + res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]);
>> + }
>> +
>> + list_del(&state->err_list);
>> + objlayout_free_io_state(state);
>> + state = NULL; /* FIXME: for debugging */
>> + if (res) {
>> + dprintk("Error! pnfs_osd_xdr_encode_ioerr\n");
>> + break;
>> + }
>> + }
>> + spin_unlock(&objlay->lock);
>> +
>> + *start = htonl((xdr->p - start - 1) * 4);
>> + dprintk("%s: Return\n", __func__);
>> +}
>> +
>> struct objlayout_deviceinfo {
>> struct page *page;
>> struct pnfs_osd_deviceaddr da; /* This must be last */
>> @@ -588,6 +682,7 @@ struct layoutdriver_io_operations objlayout_io_operations = {
>> .free_lseg = objlayout_free_lseg,
>> .setup_layoutcommit = objlayout_setup_layoutcommit,
>> .cleanup_layoutcommit = objlayout_cleanup_layoutcommit,
>> + .encode_layoutreturn = objlayout_encode_layoutreturn,
>> .initialize_mountpoint = objlayout_initialize_mountpoint,
>> .uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
>> };
>> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
>> index d6a5e36..ebf2862 100644
>> --- a/fs/nfs/objlayout/objlayout.h
>> +++ b/fs/nfs/objlayout/objlayout.h
>> @@ -138,7 +138,11 @@ static inline s64 objlayout_atomic64_xchg(struct objlayout_atomic64 *p, s64 val)
>> * per-inode layout
>> */
>> struct objlayout {
>> + /* for layout_commit */
>> struct objlayout_atomic64 delta_space_used; /* consumed by write ops */
>> + /* for layout_return */
>> + spinlock_t lock;
>> + struct list_head err_list;
>> };
>>
>> /*
>> @@ -160,6 +164,16 @@ struct objlayout_io_state {
>> int eof; /* res */
>> int committed; /* res */
>> s64 delta_space_used; /* res */
>> +
>> + /* Error reporting (layout_return) */
>> + struct list_head err_list;
>> + unsigned num_comps;
>> + /* Variable array of error descriptors per device.
>> + * It should contain as meany entries as devices in the osd_layout
>> + * that participate in the I/O. It is upto the io_engine to allocate
>> + * needed space. (This member must be kept last).
>> + */
>> + struct pnfs_osd_ioerr ioerrs[1];
>> };
>>
>> /*
>> @@ -184,6 +198,10 @@ extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
>> /*
>> * callback API
>> */
>> +extern void objlayout_io_set_result(struct objlayout_io_state *state,
>> + unsigned index, int status, int osd_error,
>> + u64 offset, u64 length, bool is_write);
>> +
>> extern void objlayout_read_done(struct objlayout_io_state *state, bool sync);
>> extern void objlayout_write_done(struct objlayout_io_state *state, bool sync);
>>
>> diff --git a/fs/nfs/objlayout/pnfs_osd_xdr.c b/fs/nfs/objlayout/pnfs_osd_xdr.c
>> index 5f69737..5fdc794 100644
>> --- a/fs/nfs/objlayout/pnfs_osd_xdr.c
>> +++ b/fs/nfs/objlayout/pnfs_osd_xdr.c
>> @@ -342,3 +342,63 @@ void pnfs_osd_xdr_decode_deviceaddr(
>>
>> __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace);
>> }
>> +
>> +#define WRITE32(n) *p++ = htonl(n)
>> +#define WRITE64(n) do { \
>> + *p++ = htonl((uint32_t)((n) >> 32)); \
>> + *p++ = htonl((uint32_t)(n)); \
>> +} while (0)
>
> Note that we want to move away from using these macros in
> the future, but let's do that once we have the full api
> in place and we've converted the main xdr code.
>
Sure, am waiting for the simple stuff to be available.
>> +
>> +/* draft-ietf-nfsv4-pnfs-obj-12
>> + * struct pnfs_osd_objid4 {
>> + * deviceid4 oid_device_id;
>> + * uint64_t oid_partition_id;
>> + * uint64_t oid_object_id;
>> + * };
>> + */
>> +static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr,
>> + struct pnfs_osd_objid *object_id)
>> +{
>> + u32 *p;
>
> This should be __be32 *p;
>
The all file and that other one should be fixed at once, I'll send a SQUASHME
to the complete code.
>> +
>> + p = xdr_reserve_space(xdr, 32);
>> + if (!p)
>> + return -E2BIG;
>> +
>> + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id,
>> + sizeof(object_id->oid_device_id));
>
> Please use object_id->oid_device_id.data rather than the struct itself
> as we do in all other places (e.g. encode_getdeviceinfo).
>
Yes
>> + WRITE64(object_id->oid_partition_id);
>> + WRITE64(object_id->oid_object_id);
>> +
>> + return 0;
>> +}
>> +
>
> let's keep a note that this comes from draft-ietf-nfsv4-pnfs-obj-12
>
I have a patch in queue that fixes all these comments.
* It states once at head of file the "from draft-ietf-nfsv4-pnfs-obj-12"
* It converts all comments to the .c-structures and not the .xdr ones
* It makes all spaces/tabs consistent. So this will also be fixed there
>> +/* struct pnfs_osd_ioerr4 {
>> + * pnfs_osd_objid4 oer_component;
>> + * length4 oer_comp_offset;
>> + * length4 oer_comp_length;
>> + * bool oer_iswrite;
>> + * pnfs_osd_errno4 oer_errno;
>> + * };
>> + */
>> +int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
>> + struct pnfs_osd_ioerr *ioerr)
>> +{
>> + u32 *p;
>
> __be32 *p
>
One patch to convert all the osd_xdr files, coming next
>> + int ret;
>> +
>> + ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component);
>> + if (ret)
>> + return ret;
>> +
>> + p = xdr_reserve_space(xdr, 24);
>> + if (!p)
>> + return -E2BIG;
>> +
>> + WRITE64(ioerr->oer_comp_offset);
>> + WRITE64(ioerr->oer_comp_length);
>> + WRITE32(ioerr->oer_iswrite);
>> + WRITE32(ioerr->oer_errno);
>> +
>> + return 0;
>> +}
>> diff --git a/fs/nfs/objlayout/pnfs_osd_xdr.h b/fs/nfs/objlayout/pnfs_osd_xdr.h
>> index 5ef16f3..66ee112 100644
>> --- a/fs/nfs/objlayout/pnfs_osd_xdr.h
>> +++ b/fs/nfs/objlayout/pnfs_osd_xdr.h
>> @@ -374,13 +374,19 @@ enum pnfs_osd_errno {
>> * };
>> */
>> struct pnfs_osd_ioerr {
>> - struct pnfs_osd_objid oer_component;
>> - u64 oer_comp_offset;
>> - u64 oer_comp_length;
>> - u32 oer_iswrite;
>> - u32 oer_errno;
>> + struct pnfs_osd_objid oer_component;
>> + u64 oer_comp_offset;
>> + u64 oer_comp_length;
>> + u32 oer_iswrite;
>> + u32 oer_errno;
>> };
>>
>> +static inline unsigned
>> +pnfs_osd_ioerr_xdr_sz(void)
>
> for symmetry reasons please get a void *
> and pass it on to pnfs_osd_objid_xdr_sz
>
I have also fixed all xx_sz that are constant to not receive
a pointer so pnfs_osd_objid_xdr_sz will change, not this one.
>> +{
>> + return pnfs_osd_objid_xdr_sz(NULL) + 2 + 2 + 1 + 1;
>> +}
>> +
>> /* OSD XDR API */
>>
>> /* Layout helpers */
>> @@ -409,4 +415,12 @@ extern void pnfs_osd_xdr_decode_deviceaddr(
>> extern int pnfs_osd_xdr_encode_deviceaddr(
>> u32 **pp, u32 *end, struct pnfs_osd_deviceaddr *devaddr);
>>
>> +/* osd_ioerr Info helpers (layout_return) */
>
> Looks like "Info" got cut'n'pasted from "Device Info"
>
> /* Client xdr encoding of osd_ioerror */ would be enough
>
>> +
>> +extern int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr,
>> + struct pnfs_osd_ioerr *ioerr);
>> +
>> +/* For Servers */
>
> How about /* Server xdr decoding of osd_ioerror */?
>
> Benny
>
>> +extern u32 *pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, u32 *p);
>> +
>> #endif /* _PANLAYOUT_PNFS_OSD_XDR_H */
Boaz
More information about the pNFS
mailing list