All of the above --- fs/Kconfig | 8 fs/lockd/mon.c | 3 fs/lockd/svclock.c | 29 - fs/lockd/xdr.c | 8 fs/lockd/xdr4.c | 8 fs/nfs/Makefile | 1 fs/nfs/client.c | 78 +- fs/nfs/delegation.c | 2 fs/nfs/dir.c | 83 +- fs/nfs/direct.c | 4 fs/nfs/file.c | 105 ++- fs/nfs/getroot.c | 3 fs/nfs/inode.c | 29 - fs/nfs/internal.h | 50 + fs/nfs/nfs2xdr.c | 20 fs/nfs/nfs3xdr.c | 25 - fs/nfs/nfs4proc.c | 58 - fs/nfs/nfs4state.c | 2 fs/nfs/nfs4xdr.c | 72 +- fs/nfs/nfsroot.c | 3 fs/nfs/read.c | 6 fs/nfs/super.c | 393 ++++----- fs/nfs/write.c | 197 ++--- fs/nfsd/nfs4xdr.c | 16 include/linux/jiffies.h | 4 include/linux/nfs_fs.h | 24 - include/linux/nfs_page.h | 1 include/linux/nfs_xdr.h | 3 include/linux/sunrpc/clnt.h | 2 include/linux/sunrpc/debug.h | 5 include/linux/sunrpc/msg_prot.h | 13 include/linux/sunrpc/rpc_rdma.h | 116 +++ include/linux/sunrpc/xdr.h | 5 include/linux/sunrpc/xprt.h | 42 - include/linux/sunrpc/xprtrdma.h | 85 ++ include/linux/sunrpc/xprtsock.h | 51 + include/linux/writeback.h | 2 kernel/auditsc.c | 1 net/sunrpc/Makefile | 1 net/sunrpc/clnt.c | 44 + net/sunrpc/rpc_pipe.c | 7 net/sunrpc/rpcb_clnt.c | 147 ++-- net/sunrpc/sched.c | 2 net/sunrpc/socklib.c | 3 net/sunrpc/sunrpc_syms.c | 2 net/sunrpc/timer.c | 4 net/sunrpc/xprt.c | 116 ++- net/sunrpc/xprtrdma/Makefile | 3 net/sunrpc/xprtrdma/rpc_rdma.c | 868 +++++++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 800 +++++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 1626 +++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 330 ++++++++ net/sunrpc/xprtsock.c | 557 +++++++++++-- 53 files changed, 5248 insertions(+), 819 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index f9eed6d..860ea8d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1728,6 +1728,14 @@ config SUNRPC config SUNRPC_GSS tristate +config SUNRPC_XPRT_RDMA + tristate "RDMA transport for sunrpc (EXPERIMENTAL)" + depends on SUNRPC && INFINIBAND && EXPERIMENTAL + default m + help + Adds a client RPC transport for supporting kernel NFS over RDMA + mounts, including Infiniband and iWARP. Experimental. + config SUNRPC_BIND34 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3353ed8..908b23f 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -132,7 +133,7 @@ nsm_create(void) .sin_port = 0, }; struct rpc_create_args args = { - .protocol = IPPROTO_UDP, + .protocol = XPRT_TRANSPORT_UDP, .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), .servername = "localhost", diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index a21e4bc..d098c7a 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -171,19 +171,14 @@ found: * GRANTED_RES message by cookie, without having to rely on the client's IP * address. --okir */ -static inline struct nlm_block * -nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, - struct nlm_lock *lock, struct nlm_cookie *cookie) +static struct nlm_block * +nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, + struct nlm_file *file, struct nlm_lock *lock, + struct nlm_cookie *cookie) { struct nlm_block *block; - struct nlm_host *host; struct nlm_rqst *call = NULL; - /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); - if (host == NULL) - return NULL; - call = nlm_alloc_call(host); if (call == NULL) return NULL; @@ -366,6 +361,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) { struct nlm_block *block = NULL; + struct nlm_host *host; int error; __be32 ret; @@ -377,6 +373,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); + /* Create host handle for callback */ + host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); + if (host == NULL) + return nlm_lck_denied_nolocks; /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); @@ -385,7 +385,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ block = nlmsvc_lookup_block(file, lock); if (block == NULL) { - block = nlmsvc_create_block(rqstp, file, lock, cookie); + block = nlmsvc_create_block(rqstp, nlm_get_host(host), file, + lock, cookie); ret = nlm_lck_denied_nolocks; if (block == NULL) goto out; @@ -449,6 +450,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); + nlm_release_host(host); dprintk("lockd: nlmsvc_lock returned %u\n", ret); return ret; } @@ -477,10 +479,15 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, if (block == NULL) { struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + struct nlm_host *host; if (conf == NULL) return nlm_granted; - block = nlmsvc_create_block(rqstp, file, lock, cookie); + /* Create host handle for callback */ + host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); + if (host == NULL) + return nlm_lck_denied_nolocks; + block = nlmsvc_create_block(rqstp, host, file, lock, cookie); if (block == NULL) { kfree(conf); return nlm_granted; diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 5316e30..633653b 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -62,8 +62,9 @@ static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) } else { - printk(KERN_NOTICE - "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); return NULL; } return p; @@ -84,8 +85,7 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f) unsigned int len; if ((len = ntohl(*p++)) != NFS2_FHSIZE) { - printk(KERN_NOTICE - "lockd: bad fhandle size %d (should be %d)\n", + dprintk("lockd: bad fhandle size %d (should be %d)\n", len, NFS2_FHSIZE); return NULL; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 846fc1d..43ff939 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -64,8 +64,9 @@ nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) } else { - printk(KERN_NOTICE - "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); + dprintk("lockd: bad cookie size %d (only cookies under " + "%d bytes are supported.)\n", + len, NLM_MAXCOOKIELEN); return NULL; } return p; @@ -86,8 +87,7 @@ nlm4_decode_fh(__be32 *p, struct nfs_fh *f) memset(f->data, 0, sizeof(f->data)); f->size = ntohl(*p++); if (f->size > NFS_MAXFHSIZE) { - printk(KERN_NOTICE - "lockd: bad fhandle size %d (should be <=%d)\n", + dprintk("lockd: bad fhandle size %d (should be <=%d)\n", f->size, NFS_MAXFHSIZE); return NULL; } diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index b55cb23..df0f41e 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -16,4 +16,3 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ nfs4namespace.o nfs-$(CONFIG_NFS_DIRECTIO) += direct.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-objs := $(nfs-y) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a49f9fe..a532ee1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include #include @@ -340,7 +342,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_retries = 2; switch (proto) { - case IPPROTO_TCP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: if (!to->to_initval) to->to_initval = 60 * HZ; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) @@ -349,7 +352,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); to->to_exponential = 0; break; - case IPPROTO_UDP: + case XPRT_TRANSPORT_UDP: default: if (!to->to_initval) to->to_initval = 11 * HZ / 10; @@ -501,9 +504,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t /* * Initialise an NFS2 or NFS3 client */ -static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data) +static int nfs_init_client(struct nfs_client *clp, + const struct nfs_parsed_mount_data *data) { - int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; int error; if (clp->cl_cons_state == NFS_CS_READY) { @@ -522,8 +525,8 @@ static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data * * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans, - RPC_AUTH_UNIX, 0); + error = nfs_create_rpc_client(clp, data->nfs_server.protocol, + data->timeo, data->retrans, RPC_AUTH_UNIX, 0); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -538,7 +541,8 @@ error: /* * Create a version 2 or 3 client */ -static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data) +static int nfs_init_server(struct nfs_server *server, + const struct nfs_parsed_mount_data *data) { struct nfs_client *clp; int error, nfsvers = 2; @@ -551,7 +555,8 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat #endif /* Allocate or find a client reference we can use */ - clp = nfs_get_client(data->hostname, &data->addr, nfsvers); + clp = nfs_get_client(data->nfs_server.hostname, + &data->nfs_server.address, nfsvers); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); @@ -581,23 +586,13 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_dat if (error < 0) goto error; - error = nfs_init_server_rpcclient(server, data->pseudoflavor); + error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); if (error < 0) goto error; server->namelen = data->namlen; /* Create a client RPC handle for the NFSv3 ACL management interface */ nfs_init_server_aclclient(server); - if (clp->cl_nfsversion == 3) { - if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) - server->namelen = NFS3_MAXNAMLEN; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) - server->caps |= NFS_CAP_READDIRPLUS; - } else { - if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) - server->namelen = NFS2_MAXNAMLEN; - } - dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; @@ -770,7 +765,7 @@ void nfs_free_server(struct nfs_server *server) * Create a version 2 or 3 volume record * - keyed on server and FSID */ -struct nfs_server *nfs_create_server(const struct nfs_mount_data *data, +struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { struct nfs_server *server; @@ -794,6 +789,16 @@ struct nfs_server *nfs_create_server(const struct nfs_mount_data *data, error = nfs_probe_fsinfo(server, mntfh, &fattr); if (error < 0) goto error; + if (server->nfs_client->rpc_ops->version == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + if (!(fattr.valid & NFS_ATTR_FATTR)) { error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); if (error < 0) { @@ -906,7 +911,7 @@ error: * Create a version 4 volume record */ static int nfs4_init_server(struct nfs_server *server, - const struct nfs4_mount_data *data, rpc_authflavor_t authflavour) + const struct nfs_parsed_mount_data *data) { int error; @@ -926,7 +931,7 @@ static int nfs4_init_server(struct nfs_server *server, server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; - error = nfs_init_server_rpcclient(server, authflavour); + error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); /* Done */ dprintk("<-- nfs4_init_server() = %d\n", error); @@ -937,12 +942,7 @@ static int nfs4_init_server(struct nfs_server *server, * Create a version 4 volume record * - keyed on server and FSID */ -struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, - const char *hostname, - const struct sockaddr_in *addr, - const char *mntpath, - const char *ip_addr, - rpc_authflavor_t authflavour, +struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { struct nfs_fattr fattr; @@ -956,13 +956,18 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, return ERR_PTR(-ENOMEM); /* Get a client record */ - error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour, - data->proto, data->timeo, data->retrans); + error = nfs4_set_client(server, + data->nfs_server.hostname, + &data->nfs_server.address, + data->client_address, + data->auth_flavors[0], + data->nfs_server.protocol, + data->timeo, data->retrans); if (error < 0) goto error; /* set up the general RPC client */ - error = nfs4_init_server(server, data, authflavour); + error = nfs4_init_server(server, data); if (error < 0) goto error; @@ -971,7 +976,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); /* Probe the root fh to retrieve its FSID */ - error = nfs4_path_walk(server, mntfh, mntpath); + error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); if (error < 0) goto error; @@ -984,6 +989,9 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, if (error < 0) goto error; + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + BUG_ON(!server->nfs_client); BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); @@ -1056,6 +1064,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + dprintk("Referral FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); @@ -1115,6 +1126,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + dprintk("Cloned FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index c55a761..7a1b6e8 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; - if ((struct nfs_open_context *)fl->fl_file->private_data != ctx) + if (nfs_file_open_context(fl->fl_file) != ctx) continue; status = nfs4_lock_delegation_recall(state, fl); if (status >= 0) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ea97408..2b5e611 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -407,7 +407,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct file *file = desc->file; struct nfs_entry *entry = desc->entry; struct dentry *dentry = NULL; - unsigned long fileid; + u64 fileid; int loop_count = 0, res; @@ -418,7 +418,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, unsigned d_type = DT_UNKNOWN; /* Note: entry->prev_cookie contains the cookie for * retrieving the current dirent on the server */ - fileid = nfs_fileid_to_ino_t(entry->ino); + fileid = entry->ino; /* Get a dentry if we have one */ if (dentry != NULL) @@ -428,7 +428,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, /* Use readdirplus info */ if (dentry != NULL && dentry->d_inode != NULL) { d_type = dt_type(dentry->d_inode); - fileid = dentry->d_inode->i_ino; + fileid = NFS_FILEID(dentry->d_inode); } res = filldir(dirent, entry->name, entry->len, @@ -558,7 +558,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) memset(desc, 0, sizeof(*desc)); desc->file = filp; - desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie; + desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); @@ -623,7 +623,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) } if (offset != filp->f_pos) { filp->f_pos = offset; - ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; + nfs_file_open_context(filp)->dir_cookie = 0; } out: mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); @@ -666,20 +666,6 @@ static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) dentry->d_time = verf; } -static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf) -{ - nfs_set_verifier(dentry, verf); -} - -/* - * Whenever an NFS operation succeeds, we know that the dentry - * is valid, so we update the revalidation timestamp. - */ -static inline void nfs_renew_times(struct dentry * dentry) -{ - dentry->d_time = jiffies; -} - /* * Return the intent data that applies to this particular path component * @@ -803,8 +789,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; - nfs_renew_times(dentry); - nfs_refresh_verifier(dentry, verifier); + nfs_set_verifier(dentry, verifier); out_valid: unlock_kernel(); dput(parent); @@ -872,8 +857,6 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) nfs_complete_unlink(dentry, inode); unlock_kernel(); } - /* When creating a negative dentry, we want to renew d_time */ - nfs_renew_times(dentry); iput(inode); } @@ -968,7 +951,6 @@ no_entry: dput(parent); dentry = res; } - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unlock: unlock_kernel(); @@ -1063,7 +1045,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } } else if (res != NULL) dentry = res; - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out: return res; @@ -1107,7 +1088,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) verifier = nfs_save_change_attribute(dir); ret = nfs4_open_revalidate(dir, dentry, openflags, nd); if (!ret) - nfs_refresh_verifier(dentry, verifier); + nfs_set_verifier(dentry, verifier); unlock_kernel(); out: dput(parent); @@ -1162,6 +1143,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) } if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) return NULL; + if (name.len > NFS_SERVER(dir)->namelen) + return NULL; /* Note: caller is already holding the dir->i_mutex! */ dentry = d_alloc(parent, &name); if (dentry == NULL) @@ -1181,12 +1164,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) dentry = alias; } - nfs_renew_times(dentry); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - return dentry; out_renew: - nfs_renew_times(dentry); - nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return dentry; } @@ -1252,7 +1231,6 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, nfs_end_data_update(dir); if (error != 0) goto out_err; - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); unlock_kernel(); return 0; @@ -1286,7 +1264,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) nfs_end_data_update(dir); if (status != 0) goto out_err; - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); unlock_kernel(); return 0; @@ -1316,7 +1293,6 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) nfs_end_data_update(dir); if (error != 0) goto out_err; - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); unlock_kernel(); return 0; @@ -1348,9 +1324,9 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry) static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) { static unsigned int sillycounter; - const int i_inosize = sizeof(dir->i_ino)*2; + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; const int countersize = sizeof(sillycounter)*2; - const int slen = sizeof(".nfs") + i_inosize + countersize - 1; + const int slen = sizeof(".nfs")+fileidsize+countersize-1; char silly[slen+1]; struct qstr qsilly; struct dentry *sdentry; @@ -1368,8 +1344,9 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - sprintf(silly, ".nfs%*.*lx", - i_inosize, i_inosize, dentry->d_inode->i_ino); + sprintf(silly, ".nfs%*.*Lx", + fileidsize, fileidsize, + (unsigned long long)NFS_FILEID(dentry->d_inode)); /* Return delegation in anticipation of the rename */ nfs_inode_return_delegation(dentry->d_inode); @@ -1408,7 +1385,6 @@ static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) dir, &qsilly); nfs_end_data_update(dir); if (!error) { - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); d_move(dentry, sdentry); error = nfs_async_unlink(dir, dentry); @@ -1491,7 +1467,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error) { - nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); @@ -1713,8 +1688,8 @@ out: d_rehash(rehash); if (!error) { d_move(old_dentry, new_dentry); - nfs_renew_times(new_dentry); - nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir)); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); } /* new dentry created? */ @@ -1840,7 +1815,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; @@ -1852,7 +1827,7 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs cache = nfs_access_search_rbtree(inode, cred); if (cache == NULL) goto out; - if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) + if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) goto out_stale; res->jiffies = cache->jiffies; res->cred = cache->cred; @@ -1907,7 +1882,7 @@ found: nfs_access_free_entry(entry); } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) @@ -1955,6 +1930,24 @@ out: return -EACCES; } +static int nfs_open_permission_mask(int openflags) +{ + int mask = 0; + + if (openflags & FMODE_READ) + mask |= MAY_READ; + if (openflags & FMODE_WRITE) + mask |= MAY_WRITE; + if (openflags & FMODE_EXEC) + mask |= MAY_EXEC; + return mask; +} + +int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +{ + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); +} + int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) { struct rpc_cred *cred; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index fcf4d38..28c8e1b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -368,7 +368,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size return -ENOMEM; dreq->inode = inode; - dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; @@ -718,7 +718,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz sync = FLUSH_STABLE; dreq->inode = inode; - dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 579cf8a..c664bb9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -33,6 +33,7 @@ #include #include "delegation.h" +#include "internal.h" #include "iostat.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -55,6 +56,8 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); +static struct vm_operations_struct nfs_file_vm_ops; + const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read = do_sync_read, @@ -174,13 +177,38 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* + * Helper for nfs_file_flush() and nfs_fsync() + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occured, and hence cause it to + * fall back to doing a synchronous write. + */ +static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) +{ + int have_error, status; + int ret = 0; + + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_wb_all(inode); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret) + ret = status; + return ret; +} + +/* * Flush all dirty pages, and check for write errors. * */ static int nfs_file_flush(struct file *file, fl_owner_t id) { - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file->f_path.dentry->d_inode; int status; @@ -189,16 +217,11 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - lock_kernel(); + /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_wb_all(inode); - if (!status) { - status = ctx->error; - ctx->error = 0; - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - } - unlock_kernel(); + status = nfs_do_fsync(ctx, inode); + if (!status) + nfs_revalidate_inode(NFS_SERVER(inode), inode); return status; } @@ -257,8 +280,11 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dentry->d_parent->d_name.name, dentry->d_name.name); status = nfs_revalidate_mapping(inode, file->f_mapping); - if (!status) - status = generic_file_mmap(file, vma); + if (!status) { + vma->vm_ops = &nfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + file_accessed(file); + } return status; } @@ -270,21 +296,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) static int nfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - int status; dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - lock_kernel(); - status = nfs_wb_all(inode); - if (!status) { - status = ctx->error; - ctx->error = 0; - } - unlock_kernel(); - return status; + return nfs_do_fsync(ctx, inode); } /* @@ -333,7 +351,7 @@ static int nfs_launder_page(struct page *page) const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, - .set_page_dirty = nfs_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .writepage = nfs_writepage, .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, @@ -346,6 +364,43 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, }; +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct file *filp = vma->vm_file; + unsigned pagelen; + int ret = -EINVAL; + + lock_page(page); + if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) + goto out_unlock; + pagelen = nfs_page_length(page); + if (pagelen == 0) + goto out_unlock; + ret = nfs_prepare_write(filp, page, 0, pagelen); + if (!ret) + ret = nfs_commit_write(filp, page, 0, pagelen); +out_unlock: + unlock_page(page); + return ret; +} + +static struct vm_operations_struct nfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nfs_vm_page_mkwrite, +}; + +static int nfs_need_sync_write(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx; + + if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) + return 1; + ctx = nfs_file_open_context(filp); + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + return 1; + return 0; +} + static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -382,8 +437,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, iov, nr_segs, pos); /* Return error values for O_SYNC and IS_SYNC() */ - if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) { - int err = nfs_fsync(iocb->ki_filp, dentry, 1); + if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { + int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); if (err < 0) result = err; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index d1cbf0a..522e5ad 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -175,6 +175,9 @@ next_component: path++; name.len = path - (const char *) name.name; + if (name.len > NFS4_MAXNAMLEN) + return -ENAMETOOLONG; + eat_dot_dir: while (*path == '/') path++; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 71a49c3..b335048 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -431,7 +431,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) /* Flush out writes to the server in order to update c/mtime */ if (S_ISREG(inode->i_mode)) - nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT); + nfs_wb_nocommit(inode); /* * We may force a getattr if the user cares about atime. @@ -450,8 +450,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); else err = nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (!err) + if (!err) { generic_fillattr(inode, stat); + stat->ino = NFS_FILEID(inode); + } return err; } @@ -536,7 +538,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { struct inode *inode = filp->f_path.dentry->d_inode; - struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data; + struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { filp->private_data = NULL; @@ -654,7 +656,7 @@ int nfs_attribute_timeout(struct inode *inode) if (nfs_have_delegation(inode, FMODE_READ)) return 0; - return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); + return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } /** @@ -666,6 +668,7 @@ int nfs_attribute_timeout(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { + smp_rmb(); if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) && !nfs_attribute_timeout(inode)) return NFS_STALE(inode) ? -ESTALE : 0; @@ -719,6 +722,7 @@ int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *map struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; + smp_rmb(); if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); @@ -744,6 +748,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; + smp_rmb(); if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); @@ -784,6 +789,7 @@ void nfs_end_data_update(struct inode *inode) spin_unlock(&inode->i_lock); } nfsi->cache_change_attribute = jiffies; + smp_wmb(); atomic_dec(&nfsi->data_updates); } @@ -913,14 +919,14 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct nfs_inode *nfsi = NFS_I(inode); int status = 0; - spin_lock(&inode->i_lock); if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { + spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + spin_unlock(&inode->i_lock); goto out; } - status = nfs_update_inode(inode, fattr); + status = nfs_refresh_inode(inode, fattr); out: - spin_unlock(&inode->i_lock); return status; } @@ -976,8 +982,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Are we racing with known updates of the metadata on the server? */ data_stable = nfs_verify_change_attribute(inode, fattr->time_start); - if (data_stable) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME); + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1053,17 +1059,16 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - } else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { + } else if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } + invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; - if (data_stable) - invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); if (!nfs_have_delegation(inode, FMODE_READ) || (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 76cf55d..f3acf48 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,8 +5,6 @@ #include struct nfs_string; -struct nfs_mount_data; -struct nfs4_mount_data; /* Maximum number of readahead requests * FIXME: this should really be a sysctl so that users may tune it to suit @@ -27,20 +25,50 @@ struct nfs_clone_mount { rpc_authflavor_t authflavor; }; +/* + * In-kernel mount arguments + */ +struct nfs_parsed_mount_data { + int flags; + int rsize, wsize; + int timeo, retrans; + int acregmin, acregmax, + acdirmin, acdirmax; + int namlen; + unsigned int bsize; + unsigned int auth_flavor_len; + rpc_authflavor_t auth_flavors[1]; + char *client_address; + + struct { + struct sockaddr_in address; + char *hostname; + unsigned int program; + unsigned int version; + unsigned short port; + int protocol; + } mount_server; + + struct { + struct sockaddr_in address; + char *hostname; + char *export_path; + unsigned int program; + int protocol; + } nfs_server; +}; + /* client.c */ extern struct rpc_program nfs_program; extern void nfs_put_client(struct nfs_client *); extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); -extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *, - struct nfs_fh *); -extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *, - const char *, - const struct sockaddr_in *, - const char *, - const char *, - rpc_authflavor_t, - struct nfs_fh *); +extern struct nfs_server *nfs_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_server( + const struct nfs_parsed_mount_data *, + struct nfs_fh *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); extern void nfs_free_server(struct nfs_server *server); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index c5fce75..668ab96 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -251,6 +251,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, count); + req->rq_rcv_buf.flags |= XDRBUF_READ; return 0; } @@ -271,7 +272,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) res->eof = 0; hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READ reply header overflowed:" + dprintk("NFS: READ reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { @@ -281,7 +282,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { - printk(KERN_WARNING "NFS: server cheating in read reply: " + dprintk("NFS: server cheating in read reply: " "count %d > recvd %d\n", count, recvd); count = recvd; } @@ -313,6 +314,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) /* Copy the page array */ xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); + sndbuf->flags |= XDRBUF_WRITE; return 0; } @@ -431,7 +433,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READDIR reply header overflowed:" + dprintk("NFS: READDIR reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { @@ -454,7 +456,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) len = ntohl(*p++); p += XDR_QUADLEN(len) + 1; /* name plus cookie */ if (len > NFS2_MAXNAMLEN) { - printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n", + dprintk("NFS: giant filename in readdir (len 0x%x)!\n", len); goto err_unmap; } @@ -471,7 +473,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) entry[0] = entry[1] = 0; /* truncate listing ? */ if (!nr) { - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); + dprintk("NFS: readdir reply truncated!\n"); entry[1] = 1; } goto out; @@ -583,12 +585,12 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) /* Convert length of symlink */ len = ntohl(*p++); if (len >= rcvbuf->page_len || len <= 0) { - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); + dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; } hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READLINK reply header overflowed:" + dprintk("NFS: READLINK reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { @@ -597,7 +599,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) } recvd = req->rq_rcv_buf.len - hdrlen; if (recvd < len) { - printk(KERN_WARNING "NFS: server cheating in readlink reply: " + dprintk("NFS: server cheating in readlink reply: " "count %u > recvd %u\n", len, recvd); return -EIO; } @@ -695,7 +697,7 @@ nfs_stat_to_errno(int stat) if (nfs_errtbl[i].stat == stat) return nfs_errtbl[i].errno; } - printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat); + dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); return nfs_errtbl[i].errno; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9e08f0..616d326 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -346,6 +346,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, count); + req->rq_rcv_buf.flags |= XDRBUF_READ; return 0; } @@ -367,6 +368,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) /* Copy the page array */ xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); + sndbuf->flags |= XDRBUF_WRITE; return 0; } @@ -524,7 +526,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READDIR reply header overflowed:" + dprintk("NFS: READDIR reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { @@ -547,7 +549,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res len = ntohl(*p++); /* string length */ p += XDR_QUADLEN(len) + 2; /* name + cookie */ if (len > NFS3_MAXNAMLEN) { - printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n", + dprintk("NFS: giant filename in readdir (len %x)!\n", len); goto err_unmap; } @@ -567,7 +569,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res goto short_pkt; len = ntohl(*p++); if (len > NFS3_FHSIZE) { - printk(KERN_WARNING "NFS: giant filehandle in " + dprintk("NFS: giant filehandle in " "readdir (len %x)!\n", len); goto err_unmap; } @@ -588,7 +590,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res entry[0] = entry[1] = 0; /* truncate listing ? */ if (!nr) { - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); + dprintk("NFS: readdir reply truncated!\n"); entry[1] = 1; } goto out; @@ -826,22 +828,23 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) /* Convert length of symlink */ len = ntohl(*p++); if (len >= rcvbuf->page_len || len <= 0) { - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); + dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; } hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READLINK reply header overflowed:" + dprintk("NFS: READLINK reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); + dprintk("NFS: READLINK header is short. " + "iovec will be shifted.\n"); xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); } recvd = req->rq_rcv_buf.len - hdrlen; if (recvd < len) { - printk(KERN_WARNING "NFS: server cheating in readlink reply: " + dprintk("NFS: server cheating in readlink reply: " "count %u > recvd %u\n", len, recvd); return -EIO; } @@ -876,13 +879,13 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) ocount = ntohl(*p++); if (ocount != count) { - printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n"); + dprintk("NFS: READ count doesn't match RPC opaque count.\n"); return -errno_NFSERR_IO; } hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { - printk(KERN_WARNING "NFS: READ reply header overflowed:" + dprintk("NFS: READ reply header overflowed:" "length %d > %Zu\n", hdrlen, iov->iov_len); return -errno_NFSERR_IO; } else if (iov->iov_len != hdrlen) { @@ -892,7 +895,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { - printk(KERN_WARNING "NFS: server cheating in read reply: " + dprintk("NFS: server cheating in read reply: " "count %d > recvd %d\n", count, recvd); count = recvd; res->eof = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b90e17..0e366a3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -62,10 +62,8 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); -static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); -static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); @@ -177,7 +175,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, dentry->d_inode->i_ino); + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); } *p++ = xdr_one; /* next */ @@ -189,7 +187,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino); + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -454,7 +452,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); rcu_read_unlock(); lock_kernel(); - ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); unlock_kernel(); if (ret != 0) goto out; @@ -948,36 +946,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags) -{ - struct nfs_access_entry cache; - int mask = 0; - int status; - - if (openflags & FMODE_READ) - mask |= MAY_READ; - if (openflags & FMODE_WRITE) - mask |= MAY_WRITE; - if (openflags & FMODE_EXEC) - mask |= MAY_EXEC; - status = nfs_access_get_cached(inode, cred, &cache); - if (status == 0) - goto out; - - /* Be clever: ask server to check for all possible rights */ - cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; - cache.cred = cred; - cache.jiffies = jiffies; - status = _nfs4_proc_access(inode, &cache); - if (status != 0) - return status; - nfs_access_add_cache(inode, &cache); -out: - if ((cache.mask & mask) == mask) - return 0; - return -EACCES; -} - static int nfs4_recover_expired_lease(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; @@ -1381,7 +1349,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct /* If the open_intent is for execute, we have an extra check to make */ if (nd->intent.open.flags & FMODE_EXEC) { - ret = _nfs4_do_access(state->inode, + ret = nfs_may_open(state->inode, state->owner->so_cred, nd->intent.open.flags); if (ret < 0) @@ -1390,7 +1358,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct filp = lookup_instantiate_filp(nd, path->dentry, NULL); if (!IS_ERR(filp)) { struct nfs_open_context *ctx; - ctx = (struct nfs_open_context *)filp->private_data; + ctx = nfs_file_open_context(filp); ctx->state = state; return 0; } @@ -1757,10 +1725,16 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) { + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; struct nfs4_accessargs args = { .fh = NFS_FH(inode), + .bitmask = server->attr_bitmask, + }; + struct nfs4_accessres res = { + .server = server, + .fattr = &fattr, }; - struct nfs4_accessres res = { 0 }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], .rpc_argp = &args, @@ -1786,6 +1760,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (mode & MAY_EXEC) args.access |= NFS4_ACCESS_EXECUTE; } + nfs_fattr_init(&fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (!status) { entry->mask = 0; @@ -1795,6 +1770,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry entry->mask |= MAY_WRITE; if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) entry->mask |= MAY_EXEC; + nfs_refresh_inode(inode, &fattr); } return status; } @@ -3303,7 +3279,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = -ENOMEM; if (seqid == NULL) goto out; - task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); + task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) goto out; @@ -3447,7 +3423,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f int ret; dprintk("%s: begin!\n", __FUNCTION__); - data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data, + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), fl->fl_u.nfs4_fl.owner); if (data == NULL) return -ENOMEM; @@ -3573,7 +3549,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) int status; /* verify open state */ - ctx = (struct nfs_open_context *)filp->private_data; + ctx = nfs_file_open_context(filp); state = ctx->state; if (request->fl_start < 0 || request->fl_end < 0) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3e4adf8..bfb3626 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -774,7 +774,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; - if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state) + if (nfs_file_open_context(fl->fl_file)->state != state) continue; status = ops->recover_lock(state, fl); if (status >= 0) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index badd73b..51dd380 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -376,10 +376,12 @@ static int nfs4_stat_to_errno(int); decode_locku_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_access_maxsz) + encode_access_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_access_maxsz) + decode_access_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz) @@ -562,7 +564,6 @@ struct compound_hdr { #define RESERVE_SPACE(nbytes) do { \ p = xdr_reserve_space(xdr, nbytes); \ - if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \ BUG_ON(!p); \ } while (0) @@ -628,8 +629,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s if (iap->ia_valid & ATTR_UID) { owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); if (owner_namelen < 0) { - printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n", - iap->ia_uid); + dprintk("nfs: couldn't resolve uid %d to string\n", + iap->ia_uid); /* XXX */ strcpy(owner_name, "nobody"); owner_namelen = sizeof("nobody") - 1; @@ -640,8 +641,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s if (iap->ia_valid & ATTR_GID) { owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); if (owner_grouplen < 0) { - printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n", - iap->ia_gid); + dprintk("nfs: couldn't resolve gid %d to string\n", + iap->ia_gid); strcpy(owner_group, "nobody"); owner_grouplen = sizeof("nobody") - 1; /* goto out; */ @@ -711,7 +712,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s * Now we backfill the bitmap and the attribute buffer length. */ if (len != ((char *)p - (char *)q) + 4) { - printk ("encode_attr: Attr length calculation error! %u != %Zu\n", + printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", len, ((char *)p - (char *)q) + 4); BUG(); } @@ -1376,14 +1377,20 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) == 0) - status = encode_access(&xdr, args->access); + status = encode_putfh(&xdr, args->fh); + if (status != 0) + goto out; + status = encode_access(&xdr, args->access); + if (status != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); +out: return status; } @@ -1857,6 +1864,7 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->count); + req->rq_rcv_buf.flags |= XDRBUF_READ; out: return status; } @@ -1933,6 +1941,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea status = encode_write(&xdr, args); if (status) goto out; + req->rq_snd_buf.flags |= XDRBUF_WRITE; status = encode_getfattr(&xdr, args->bitmask); out: return status; @@ -2180,9 +2189,9 @@ out: #define READ_BUF(nbytes) do { \ p = xdr_inline_decode(xdr, nbytes); \ if (unlikely(!p)) { \ - printk(KERN_INFO "%s: prematurely hit end of receive" \ + dprintk("nfs: %s: prematurely hit end of receive" \ " buffer\n", __FUNCTION__); \ - printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ + dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ __FUNCTION__, xdr->p, nbytes, xdr->end); \ return -EIO; \ } \ @@ -2223,9 +2232,8 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) READ_BUF(8); READ32(opnum); if (opnum != expected) { - printk(KERN_NOTICE - "nfs4_decode_op_hdr: Server returned operation" - " %d but we issued a request for %d\n", + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", opnum, expected); return -EIO; } @@ -2758,7 +2766,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf dprintk("%s: nfs_map_name_to_uid failed!\n", __FUNCTION__); } else - printk(KERN_WARNING "%s: name too long (%u)!\n", + dprintk("%s: name too long (%u)!\n", __FUNCTION__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER; } @@ -2783,7 +2791,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf dprintk("%s: nfs_map_group_to_gid failed!\n", __FUNCTION__); } else - printk(KERN_WARNING "%s: name too long (%u)!\n", + dprintk("%s: name too long (%u)!\n", __FUNCTION__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } @@ -2950,7 +2958,8 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl unsigned int nwords = xdr->p - savep; if (unlikely(attrwords != nwords)) { - printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n", + dprintk("%s: server returned incorrect attribute length: " + "%u %c %u\n", __FUNCTION__, attrwords << 2, (attrwords < nwords) ? '<' : '>', @@ -3451,7 +3460,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { - printk(KERN_WARNING "NFS: server cheating in read reply: " + dprintk("NFS: server cheating in read reply: " "count %u > recvd %u\n", count, recvd); count = recvd; eof = 0; @@ -3500,7 +3509,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n p += 2; /* cookie */ len = ntohl(*p++); /* filename length */ if (len > NFS4_MAXNAMLEN) { - printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); + dprintk("NFS: giant filename in readdir (len 0x%x)\n", + len); goto err_unmap; } xlen = XDR_QUADLEN(len); @@ -3528,7 +3538,7 @@ short_pkt: entry[0] = entry[1] = 0; /* truncate listing ? */ if (!nr) { - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); + dprintk("NFS: readdir reply truncated!\n"); entry[1] = 1; } goto out; @@ -3554,13 +3564,13 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) READ_BUF(4); READ32(len); if (len >= rcvbuf->page_len || len <= 0) { - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); + dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; } hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (recvd < len) { - printk(KERN_WARNING "NFS: server cheating in readlink reply: " + dprintk("NFS: server cheating in readlink reply: " "count %u > recvd %u\n", len, recvd); return -EIO; } @@ -3643,7 +3653,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (attrlen > recvd) { - printk(KERN_WARNING "NFS: server cheating in getattr" + dprintk("NFS: server cheating in getattr" " acl reply: attrlen %u > recvd %u\n", attrlen, recvd); return -EINVAL; @@ -3688,8 +3698,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) READ_BUF(8); READ32(opnum); if (opnum != OP_SETCLIENTID) { - printk(KERN_NOTICE - "nfs4_decode_setclientid: Server returned operation" + dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } @@ -3783,8 +3792,13 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; - if ((status = decode_putfh(&xdr)) == 0) - status = decode_access(&xdr, res); + status = decode_putfh(&xdr); + if (status != 0) + goto out; + status = decode_access(&xdr, res); + if (status != 0) + goto out; + decode_getfattr(&xdr, res->fattr, res->server); out: return status; } diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 3490322..e87b44e 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -491,7 +492,7 @@ static int __init root_nfs_get_handle(void) struct sockaddr_in sin; int status; int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? - IPPROTO_TCP : IPPROTO_UDP; + XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP; int version = (nfs_data.flags & NFS_MOUNT_VER3) ? NFS_MNT3_VERSION : NFS_MNT_VERSION; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 19e0563..d6e62d7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -497,8 +497,7 @@ int nfs_readpage(struct file *file, struct page *page) if (ctx == NULL) goto out_unlock; } else - ctx = get_nfs_open_context((struct nfs_open_context *) - file->private_data); + ctx = get_nfs_open_context(nfs_file_open_context(file)); error = nfs_readpage_async(ctx, inode, page); @@ -576,8 +575,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (desc.ctx == NULL) return -EBADF; } else - desc.ctx = get_nfs_open_context((struct nfs_open_context *) - filp->private_data); + desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b878528..fa517ae 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include #include @@ -58,36 +60,6 @@ #define NFSDBG_FACILITY NFSDBG_VFS - -struct nfs_parsed_mount_data { - int flags; - int rsize, wsize; - int timeo, retrans; - int acregmin, acregmax, - acdirmin, acdirmax; - int namlen; - unsigned int bsize; - unsigned int auth_flavor_len; - rpc_authflavor_t auth_flavors[1]; - char *client_address; - - struct { - struct sockaddr_in address; - unsigned int program; - unsigned int version; - unsigned short port; - int protocol; - } mount_server; - - struct { - struct sockaddr_in address; - char *hostname; - char *export_path; - unsigned int program; - int protocol; - } nfs_server; -}; - enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, @@ -97,7 +69,7 @@ enum { Opt_ac, Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, - Opt_udp, Opt_tcp, + Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, Opt_sharecache, Opt_nosharecache, @@ -116,7 +88,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, - Opt_addr, Opt_mounthost, Opt_clientaddr, + Opt_addr, Opt_mountaddr, Opt_clientaddr, /* Mount options that are ignored */ Opt_userspace, Opt_deprecated, @@ -143,6 +115,7 @@ static match_table_t nfs_mount_option_tokens = { { Opt_v3, "v3" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, + { Opt_rdma, "rdma" }, { Opt_acl, "acl" }, { Opt_noacl, "noacl" }, { Opt_rdirplus, "rdirplus" }, @@ -175,13 +148,14 @@ static match_table_t nfs_mount_option_tokens = { { Opt_mountproto, "mountproto=%s" }, { Opt_addr, "addr=%s" }, { Opt_clientaddr, "clientaddr=%s" }, - { Opt_mounthost, "mounthost=%s" }, + { Opt_userspace, "mounthost=%s" }, + { Opt_mountaddr, "mountaddr=%s" }, { Opt_err, NULL } }; enum { - Opt_xprt_udp, Opt_xprt_tcp, + Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, Opt_xprt_err }; @@ -189,6 +163,7 @@ enum { static match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_rdma, "rdma" }, { Opt_xprt_err, NULL } }; @@ -449,7 +424,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, - { NFS_MOUNT_INTR, ",intr", "" }, + { NFS_MOUNT_INTR, ",intr", ",nointr" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", "" }, @@ -460,8 +435,6 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; - char buf[12]; - const char *proto; seq_printf(m, ",vers=%d", clp->rpc_ops->version); seq_printf(m, ",rsize=%d", nfss->rsize); @@ -480,18 +453,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else seq_puts(m, nfs_infop->nostr); } - switch (nfss->client->cl_xprt->prot) { - case IPPROTO_TCP: - proto = "tcp"; - break; - case IPPROTO_UDP: - proto = "udp"; - break; - default: - snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot); - proto = buf; - } - seq_printf(m, ",proto=%s", proto); + seq_printf(m, ",proto=%s", + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ); seq_printf(m, ",retrans=%u", clp->retrans_count); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); @@ -506,8 +469,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) nfs_show_mount_options(m, nfss, 0); - seq_puts(m, ",addr="); - seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\"); + seq_printf(m, ",addr="NIPQUAD_FMT, + NIPQUAD(nfss->nfs_client->cl_addr.sin_addr)); return 0; } @@ -698,13 +661,19 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; - mnt->nfs_server.protocol = IPPROTO_UDP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; mnt->timeo = 7; mnt->retrans = 5; break; case Opt_tcp: mnt->flags |= NFS_MOUNT_TCP; - mnt->nfs_server.protocol = IPPROTO_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + mnt->timeo = 600; + mnt->retrans = 2; + break; + case Opt_rdma: + mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; mnt->timeo = 600; mnt->retrans = 2; break; @@ -913,13 +882,20 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; - mnt->nfs_server.protocol = IPPROTO_UDP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; mnt->timeo = 7; mnt->retrans = 5; break; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; - mnt->nfs_server.protocol = IPPROTO_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + mnt->timeo = 600; + mnt->retrans = 2; + break; + case Opt_xprt_rdma: + /* vector side protocols to TCP */ + mnt->flags |= NFS_MOUNT_TCP; + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; mnt->timeo = 600; mnt->retrans = 2; break; @@ -937,11 +913,12 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp: - mnt->mount_server.protocol = IPPROTO_UDP; + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp: - mnt->mount_server.protocol = IPPROTO_TCP; + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; + case Opt_xprt_rdma: /* not used for side protocols */ default: goto out_unrec_xprt; } @@ -961,7 +938,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; mnt->client_address = string; break; - case Opt_mounthost: + case Opt_mountaddr: string = match_strdup(args); if (string == NULL) goto out_nomem; @@ -1027,16 +1004,10 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, sin = args->mount_server.address; else sin = args->nfs_server.address; - if (args->mount_server.port == 0) { - status = rpcb_getport_sync(&sin, - args->mount_server.program, - args->mount_server.version, - args->mount_server.protocol); - if (status < 0) - goto out_err; - sin.sin_port = htons(status); - } else - sin.sin_port = htons(args->mount_server.port); + /* + * autobind will be used if mount_server.port == 0 + */ + sin.sin_port = htons(args->mount_server.port); /* * Now ask the mount server to map our export path @@ -1049,14 +1020,11 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.version, args->mount_server.protocol, root_fh); - if (status < 0) - goto out_err; - - return status; + if (status == 0) + return 0; -out_err: - dfprintk(MOUNT, "NFS: unable to contact server on host " - NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr)); + dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT + ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status); return status; } @@ -1079,15 +1047,31 @@ out_err: * XXX: as far as I can tell, changing the NFS program number is not * supported in the NFS client. */ -static int nfs_validate_mount_data(struct nfs_mount_data **options, +static int nfs_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, struct nfs_fh *mntfh, const char *dev_name) { - struct nfs_mount_data *data = *options; + struct nfs_mount_data *data = (struct nfs_mount_data *)options; if (data == NULL) goto out_no_data; + memset(args, 0, sizeof(*args)); + args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + args->rsize = NFS_MAX_FILE_IO_SIZE; + args->wsize = NFS_MAX_FILE_IO_SIZE; + args->timeo = 600; + args->retrans = 2; + args->acregmin = 3; + args->acregmax = 60; + args->acdirmin = 30; + args->acdirmax = 60; + args->mount_server.protocol = XPRT_TRANSPORT_UDP; + args->mount_server.program = NFS_MNT_PROGRAM; + args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->nfs_server.program = NFS_PROGRAM; + switch (data->version) { case 1: data->namlen = 0; @@ -1116,92 +1100,73 @@ static int nfs_validate_mount_data(struct nfs_mount_data **options, if (mntfh->size < sizeof(mntfh->data)) memset(mntfh->data + mntfh->size, 0, sizeof(mntfh->data) - mntfh->size); + + if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) + goto out_no_address; + + /* + * Translate to nfs_parsed_mount_data, which nfs_fill_super + * can deal with. + */ + args->flags = data->flags; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->flags = data->flags; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->nfs_server.address = data->addr; + if (!(data->flags & NFS_MOUNT_TCP)) + args->nfs_server.protocol = XPRT_TRANSPORT_UDP; + /* N.B. caller will free nfs_server.hostname in all cases */ + args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); + args->namlen = data->namlen; + args->bsize = data->bsize; + args->auth_flavors[0] = data->pseudoflavor; break; default: { unsigned int len; char *c; int status; - struct nfs_parsed_mount_data args = { - .flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP), - .rsize = NFS_MAX_FILE_IO_SIZE, - .wsize = NFS_MAX_FILE_IO_SIZE, - .timeo = 600, - .retrans = 2, - .acregmin = 3, - .acregmax = 60, - .acdirmin = 30, - .acdirmax = 60, - .mount_server.protocol = IPPROTO_UDP, - .mount_server.program = NFS_MNT_PROGRAM, - .nfs_server.protocol = IPPROTO_TCP, - .nfs_server.program = NFS_PROGRAM, - }; - - if (nfs_parse_mount_options((char *) *options, &args) == 0) - return -EINVAL; - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return -ENOMEM; + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; - /* - * NB: after this point, caller will free "data" - * if we return an error - */ - *options = data; + if (!nfs_verify_server_address((struct sockaddr *) + &args->nfs_server.address)) + goto out_no_address; c = strchr(dev_name, ':'); if (c == NULL) return -EINVAL; len = c - dev_name; - if (len > sizeof(data->hostname)) - return -ENAMETOOLONG; - strncpy(data->hostname, dev_name, len); - args.nfs_server.hostname = data->hostname; + /* N.B. caller will free nfs_server.hostname in all cases */ + args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); c++; if (strlen(c) > NFS_MAXPATHLEN) return -ENAMETOOLONG; - args.nfs_server.export_path = c; + args->nfs_server.export_path = c; - status = nfs_try_mount(&args, mntfh); + status = nfs_try_mount(args, mntfh); if (status) return status; - /* - * Translate to nfs_mount_data, which nfs_fill_super - * can deal with. - */ - data->version = 6; - data->flags = args.flags; - data->rsize = args.rsize; - data->wsize = args.wsize; - data->timeo = args.timeo; - data->retrans = args.retrans; - data->acregmin = args.acregmin; - data->acregmax = args.acregmax; - data->acdirmin = args.acdirmin; - data->acdirmax = args.acdirmax; - data->addr = args.nfs_server.address; - data->namlen = args.namlen; - data->bsize = args.bsize; - data->pseudoflavor = args.auth_flavors[0]; - break; } } - if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) - data->pseudoflavor = RPC_AUTH_UNIX; + if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) + args->auth_flavors[0] = RPC_AUTH_UNIX; #ifndef CONFIG_NFS_V3 - if (data->flags & NFS_MOUNT_VER3) + if (args->flags & NFS_MOUNT_VER3) goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ - if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) - goto out_no_address; - return 0; out_no_data: @@ -1258,7 +1223,8 @@ static inline void nfs_initialise_sb(struct super_block *sb) /* * Finish setting up an NFS2/3 superblock */ -static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data) +static void nfs_fill_super(struct super_block *sb, + struct nfs_parsed_mount_data *data) { struct nfs_server *server = NFS_SB(sb); @@ -1379,7 +1345,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, struct nfs_server *server = NULL; struct super_block *s; struct nfs_fh mntfh; - struct nfs_mount_data *data = raw_data; + struct nfs_parsed_mount_data data; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { @@ -1388,12 +1354,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, int error; /* Validate the mount data */ - error = nfs_validate_mount_data(&data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(data, &mntfh); + server = nfs_create_server(&data, &mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1417,7 +1383,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, data); + nfs_fill_super(s, &data); } mntroot = nfs_get_root(s, &mntfh); @@ -1432,8 +1398,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - if (data != raw_data) - kfree(data); + kfree(data.nfs_server.hostname); return error; out_err_nosb: @@ -1559,38 +1524,49 @@ static void nfs4_fill_super(struct super_block *sb) /* * Validate NFSv4 mount options */ -static int nfs4_validate_mount_data(struct nfs4_mount_data **options, - const char *dev_name, - struct sockaddr_in *addr, - rpc_authflavor_t *authflavour, - char **hostname, - char **mntpath, - char **ip_addr) +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) { - struct nfs4_mount_data *data = *options; + struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; if (data == NULL) goto out_no_data; + memset(args, 0, sizeof(*args)); + args->rsize = NFS_MAX_FILE_IO_SIZE; + args->wsize = NFS_MAX_FILE_IO_SIZE; + args->timeo = 600; + args->retrans = 2; + args->acregmin = 3; + args->acregmax = 60; + args->acdirmin = 30; + args->acdirmax = 60; + args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + switch (data->version) { case 1: - if (data->host_addrlen != sizeof(*addr)) + if (data->host_addrlen != sizeof(args->nfs_server.address)) goto out_no_address; - if (copy_from_user(addr, data->host_addr, sizeof(*addr))) + if (copy_from_user(&args->nfs_server.address, + data->host_addr, + sizeof(args->nfs_server.address))) return -EFAULT; - if (addr->sin_port == 0) - addr->sin_port = htons(NFS_PORT); - if (!nfs_verify_server_address((struct sockaddr *) addr)) + if (args->nfs_server.address.sin_port == 0) + args->nfs_server.address.sin_port = htons(NFS_PORT); + if (!nfs_verify_server_address((struct sockaddr *) + &args->nfs_server.address)) goto out_no_address; switch (data->auth_flavourlen) { case 0: - *authflavour = RPC_AUTH_UNIX; + args->auth_flavors[0] = RPC_AUTH_UNIX; break; case 1: - if (copy_from_user(authflavour, data->auth_flavours, - sizeof(*authflavour))) + if (copy_from_user(&args->auth_flavors[0], + data->auth_flavours, + sizeof(args->auth_flavors[0]))) return -EFAULT; break; default: @@ -1600,75 +1576,57 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) return PTR_ERR(c); - *hostname = c; + args->nfs_server.hostname = c; c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); if (IS_ERR(c)) return PTR_ERR(c); - *mntpath = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath); + args->nfs_server.export_path = c; + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) return PTR_ERR(c); - *ip_addr = c; + args->client_address = c; + + /* + * Translate to nfs_parsed_mount_data, which nfs4_fill_super + * can deal with. + */ + + args->flags = data->flags & NFS4_MOUNT_FLAGMASK; + args->rsize = data->rsize; + args->wsize = data->wsize; + args->timeo = data->timeo; + args->retrans = data->retrans; + args->acregmin = data->acregmin; + args->acregmax = data->acregmax; + args->acdirmin = data->acdirmin; + args->acdirmax = data->acdirmax; + args->nfs_server.protocol = data->proto; break; default: { unsigned int len; - struct nfs_parsed_mount_data args = { - .rsize = NFS_MAX_FILE_IO_SIZE, - .wsize = NFS_MAX_FILE_IO_SIZE, - .timeo = 600, - .retrans = 2, - .acregmin = 3, - .acregmax = 60, - .acdirmin = 30, - .acdirmax = 60, - .nfs_server.protocol = IPPROTO_TCP, - }; - - if (nfs_parse_mount_options((char *) *options, &args) == 0) + + if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; if (!nfs_verify_server_address((struct sockaddr *) - &args.nfs_server.address)) + &args->nfs_server.address)) return -EINVAL; - *addr = args.nfs_server.address; - switch (args.auth_flavor_len) { + switch (args->auth_flavor_len) { case 0: - *authflavour = RPC_AUTH_UNIX; + args->auth_flavors[0] = RPC_AUTH_UNIX; break; case 1: - *authflavour = (rpc_authflavor_t) args.auth_flavors[0]; break; default: goto out_inval_auth; } /* - * Translate to nfs4_mount_data, which nfs4_fill_super - * can deal with. - */ - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return -ENOMEM; - *options = data; - - data->version = 1; - data->flags = args.flags & NFS4_MOUNT_FLAGMASK; - data->rsize = args.rsize; - data->wsize = args.wsize; - data->timeo = args.timeo; - data->retrans = args.retrans; - data->acregmin = args.acregmin; - data->acregmax = args.acregmax; - data->acdirmin = args.acdirmin; - data->acdirmax = args.acdirmax; - data->proto = args.nfs_server.protocol; - - /* * Split "dev_name" into "hostname:mntpath". */ c = strchr(dev_name, ':'); @@ -1678,27 +1636,25 @@ static int nfs4_validate_mount_data(struct nfs4_mount_data **options, len = c - dev_name; if (len > NFS4_MAXNAMLEN) return -ENAMETOOLONG; - *hostname = kzalloc(len, GFP_KERNEL); - if (*hostname == NULL) + args->nfs_server.hostname = kzalloc(len, GFP_KERNEL); + if (args->nfs_server.hostname == NULL) return -ENOMEM; - strncpy(*hostname, dev_name, len - 1); + strncpy(args->nfs_server.hostname, dev_name, len - 1); c++; /* step over the ':' */ len = strlen(c); if (len > NFS4_MAXPATHLEN) return -ENAMETOOLONG; - *mntpath = kzalloc(len + 1, GFP_KERNEL); - if (*mntpath == NULL) + args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL); + if (args->nfs_server.export_path == NULL) return -ENOMEM; - strncpy(*mntpath, c, len); + strncpy(args->nfs_server.export_path, c, len); - dprintk("MNTPATH: %s\n", *mntpath); + dprintk("MNTPATH: %s\n", args->nfs_server.export_path); - if (args.client_address == NULL) + if (args->client_address == NULL) goto out_no_client_address; - *ip_addr = args.client_address; - break; } } @@ -1729,14 +1685,11 @@ out_no_client_address: static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs4_mount_data *data = raw_data; + struct nfs_parsed_mount_data data; struct super_block *s; struct nfs_server *server; - struct sockaddr_in addr; - rpc_authflavor_t authflavour; struct nfs_fh mntfh; struct dentry *mntroot; - char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, @@ -1744,14 +1697,12 @@ static int nfs4_get_sb(struct file_system_type *fs_type, int error; /* Validate the mount data */ - error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour, - &hostname, &mntpath, &ip_addr); + error = nfs4_validate_mount_data(raw_data, &data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, - authflavour, &mntfh); + server = nfs4_create_server(&data, &mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1790,9 +1741,9 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(ip_addr); - kfree(mntpath); - kfree(hostname); + kfree(data.client_address); + kfree(data.nfs_server.export_path); + kfree(data.nfs_server.hostname); return error; out_free: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0d7a77c..3e9e268 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -110,6 +110,13 @@ void nfs_writedata_release(void *wdata) nfs_writedata_free(wdata); } +static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +{ + ctx->error = error; + smp_wmb(); + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); +} + static struct nfs_page *nfs_page_find_request_locked(struct page *page) { struct nfs_page *req = NULL; @@ -243,10 +250,7 @@ static void nfs_end_page_writeback(struct page *page) /* * Find an associated nfs write request, and prepare to flush it out - * Returns 1 if there was no write request, or if the request was - * already tagged by nfs_set_page_dirty.Returns 0 if the request - * was not tagged. - * May also return an error if the user signalled nfs_wait_on_request(). + * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct page *page) @@ -261,7 +265,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, req = nfs_page_find_request_locked(page); if (req == NULL) { spin_unlock(&inode->i_lock); - return 1; + return 0; } if (nfs_lock_request_dontget(req)) break; @@ -282,7 +286,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, spin_unlock(&inode->i_lock); nfs_unlock_request(req); nfs_pageio_complete(pgio); - return 1; + return 0; } if (nfs_set_page_writeback(page) != 0) { spin_unlock(&inode->i_lock); @@ -290,70 +294,56 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, } radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); spin_unlock(&inode->i_lock); nfs_pageio_add_request(pgio, req); - return ret; + return 0; } -/* - * Write an mmapped page to the server. - */ -static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - struct nfs_pageio_descriptor mypgio, *pgio; - struct nfs_open_context *ctx; struct inode *inode = page->mapping->host; - unsigned offset; - int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - if (wbc->for_writepages) - pgio = wbc->fs_private; - else { - nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc)); - pgio = &mypgio; - } - nfs_pageio_cond_complete(pgio, page->index); + return nfs_page_async_flush(pgio, page); +} - err = nfs_page_async_flush(pgio, page); - if (err <= 0) - goto out; - err = 0; - offset = nfs_page_length(page); - if (!offset) - goto out; - - nfs_pageio_cond_complete(pgio, page->index); +/* + * Write an mmapped page to the server. + */ +static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +{ + struct nfs_pageio_descriptor pgio; + int err; - ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE); - if (ctx == NULL) { - err = -EBADF; - goto out; - } - err = nfs_writepage_setup(ctx, page, 0, offset); - put_nfs_open_context(ctx); - if (err != 0) - goto out; - err = nfs_page_async_flush(pgio, page); - if (err > 0) - err = 0; -out: - if (!wbc->for_writepages) - nfs_pageio_complete(pgio); - return err; + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); + err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_complete(&pgio); + if (err < 0) + return err; + if (pgio.pg_error < 0) + return pgio.pg_error; + return 0; } int nfs_writepage(struct page *page, struct writeback_control *wbc) { - int err; + int ret; + + ret = nfs_writepage_locked(page, wbc); + unlock_page(page); + return ret; +} + +static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +{ + int ret; - err = nfs_writepage_locked(page, wbc); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); - return err; + return ret; } int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -365,12 +355,11 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); - wbc->fs_private = &pgio; - err = generic_writepages(mapping, wbc); + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - if (err) + if (err < 0) return err; - if (pgio.pg_error) + if (pgio.pg_error < 0) return pgio.pg_error; return 0; } @@ -395,8 +384,6 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) } SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); - if (PageDirty(req->wb_page)) - set_bit(PG_NEED_FLUSH, &req->wb_flags); nfsi->npages++; kref_get(&req->wb_kref); return 0; @@ -416,8 +403,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); - if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags)) - __set_page_dirty_nobuffers(req->wb_page); nfsi->npages--; if (!nfsi->npages) { spin_unlock(&inode->i_lock); @@ -682,7 +667,7 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, int nfs_flush_incompatible(struct file *file, struct page *page) { - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_page *req; int do_flush, status; /* @@ -716,7 +701,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = page->mapping->host; int status = 0; @@ -967,7 +952,7 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { nfs_set_pageerror(page); - req->wb_context->error = task->tk_status; + nfs_context_set_write_error(req->wb_context, task->tk_status); dprintk(", error = %d\n", task->tk_status); goto out; } @@ -1030,7 +1015,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { nfs_set_pageerror(page); - req->wb_context->error = task->tk_status; + nfs_context_set_write_error(req->wb_context, task->tk_status); dprintk(", error = %d\n", task->tk_status); goto remove_request; } @@ -1244,7 +1229,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) req->wb_bytes, (long long)req_offset(req)); if (task->tk_status < 0) { - req->wb_context->error = task->tk_status; + nfs_context_set_write_error(req->wb_context, task->tk_status); nfs_inode_remove_request(req); dprintk(", error = %d\n", task->tk_status); goto next; @@ -1347,53 +1332,52 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr return ret; } -/* - * flush the inode to disk. - */ -int nfs_wb_all(struct inode *inode) +static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) { - struct address_space *mapping = inode->i_mapping; - struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_writepages = 1, - .range_cyclic = 1, - }; int ret; - ret = nfs_writepages(mapping, &wbc); + ret = nfs_writepages(mapping, wbc); if (ret < 0) goto out; - ret = nfs_sync_mapping_wait(mapping, &wbc, 0); - if (ret >= 0) - return 0; + ret = nfs_sync_mapping_wait(mapping, wbc, how); + if (ret < 0) + goto out; + return 0; out: __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return ret; } -int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how) +/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ +static int nfs_write_mapping(struct address_space *mapping, int how) { struct writeback_control wbc = { .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, - .range_start = range_start, - .range_end = range_end, .for_writepages = 1, + .range_cyclic = 1, }; int ret; - ret = nfs_writepages(mapping, &wbc); + ret = __nfs_write_mapping(mapping, &wbc, how); if (ret < 0) - goto out; - ret = nfs_sync_mapping_wait(mapping, &wbc, how); - if (ret >= 0) - return 0; -out: - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return ret; + return ret; + wbc.sync_mode = WB_SYNC_ALL; + return __nfs_write_mapping(mapping, &wbc, how); +} + +/* + * flush the inode to disk. + */ +int nfs_wb_all(struct inode *inode) +{ + return nfs_write_mapping(inode->i_mapping, 0); +} + +int nfs_wb_nocommit(struct inode *inode) +{ + return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT); } int nfs_wb_page_cancel(struct inode *inode, struct page *page) @@ -1477,35 +1461,6 @@ int nfs_wb_page(struct inode *inode, struct page* page) return nfs_wb_page_priority(inode, page, FLUSH_STABLE); } -int nfs_set_page_dirty(struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode; - struct nfs_page *req; - int ret; - - if (!mapping) - goto out_raced; - inode = mapping->host; - if (!inode) - goto out_raced; - spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); - if (req != NULL) { - /* Mark any existing write requests for flushing */ - ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags); - spin_unlock(&inode->i_lock); - nfs_release_request(req); - return ret; - } - ret = __set_page_dirty_nobuffers(page); - spin_unlock(&inode->i_lock); - return ret; -out_raced: - return !TestSetPageDirty(page); -} - - int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8ef0964..163a76c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -102,7 +102,8 @@ check_filename(char *str, int len, __be32 err) out: \ return status; \ xdr_error: \ - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ status = nfserr_bad_xdr; \ goto out @@ -124,7 +125,8 @@ xdr_error: \ if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ savemem(argp, p, nbytes) : \ (char *)p)) { \ - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ goto xdr_error; \ } \ p += XDR_QUADLEN(nbytes); \ @@ -140,7 +142,8 @@ xdr_error: \ p = argp->p; \ argp->p += XDR_QUADLEN(nbytes); \ } else if (!(p = read_buf(argp, nbytes))) { \ - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ + dprintk("NFSD: xdr error (%s:%d)\n", \ + __FILE__, __LINE__); \ goto xdr_error; \ } \ } while (0) @@ -948,7 +951,8 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) */ avail = (char*)argp->end - (char*)argp->p; if (avail + argp->pagelen < write->wr_buflen) { - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); goto xdr_error; } argp->rqstp->rq_vec[0].iov_base = p; @@ -1019,7 +1023,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); if (!argp->ops) { argp->ops = argp->iops; - printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n"); + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); goto xdr_error; } } @@ -1326,7 +1330,7 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 * path = exp->ex_path; if (strncmp(path, rootpath, strlen(rootpath))) { - printk("nfsd: fs_locations failed;" + dprintk("nfsd: fs_locations failed;" "%s is not contained in %s\n", path, rootpath); *stat = nfserr_notsupp; return NULL; diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index c080f61..f1c87ad 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -115,6 +115,10 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +#define time_in_range(a,b,c) \ + (time_after_eq(a,b) && \ + time_before_eq(a,c)) + /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64() */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 7250eea..5b42fef 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -77,6 +77,9 @@ struct nfs_open_context { struct nfs4_state *state; fl_owner_t lockowner; int mode; + + unsigned long flags; +#define NFS_CONTEXT_ERROR_WRITE (0) int error; struct list_head list; @@ -289,9 +292,6 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int nfs_permission(struct inode *, int, struct nameidata *); -extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); -extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); -extern void nfs_access_zap_cache(struct inode *inode); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); @@ -328,14 +328,15 @@ extern const struct inode_operations nfs3_file_inode_operations; extern const struct file_operations nfs_file_operations; extern const struct address_space_operations nfs_file_aops; -static inline struct rpc_cred *nfs_file_cred(struct file *file) +static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) { - if (file != NULL) { - struct nfs_open_context *ctx; + return filp->private_data; +} - ctx = (struct nfs_open_context*)file->private_data; - return ctx->cred; - } +static inline struct rpc_cred *nfs_file_cred(struct file *file) +{ + if (file != NULL) + return nfs_file_open_context(file)->cred; return NULL; } @@ -378,6 +379,8 @@ extern const struct file_operations nfs_dir_operations; extern struct dentry_operations nfs_dentry_operations; extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); +extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags); +extern void nfs_access_zap_cache(struct inode *inode); /* * linux/fs/nfs/symlink.c @@ -420,15 +423,14 @@ extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); extern void nfs_writedata_release(void *); -extern int nfs_set_page_dirty(struct page *); /* * Try to write back everything synchronously (but check the * return value!) */ extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int); -extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int); extern int nfs_wb_all(struct inode *inode); +extern int nfs_wb_nocommit(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 78e6079..30dbcc1 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -30,7 +30,6 @@ #define PG_BUSY 0 #define PG_NEED_COMMIT 1 #define PG_NEED_RESCHED 2 -#define PG_NEED_FLUSH 3 struct nfs_inode; struct nfs_page { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cf74a4d..0303201 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -538,10 +538,13 @@ typedef u64 clientid4; struct nfs4_accessargs { const struct nfs_fh * fh; + const u32 * bitmask; u32 access; }; struct nfs4_accessres { + const struct nfs_server * server; + struct nfs_fattr * fattr; u32 supported; u32 access; }; diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index c0d9d14..d9d5c5a 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -117,7 +117,7 @@ struct rpc_create_args { struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, - struct rpc_program *, int); + struct rpc_program *, u32); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 3912cf1..3347c72 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -88,6 +88,11 @@ enum { CTL_SLOTTABLE_TCP, CTL_MIN_RESVPORT, CTL_MAX_RESVPORT, + CTL_SLOTTABLE_RDMA, + CTL_RDMA_MAXINLINEREAD, + CTL_RDMA_MAXINLINEWRITE, + CTL_RDMA_WRITEPADDING, + CTL_RDMA_MEMREG, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 784d4c3..c4beb57 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -138,6 +138,19 @@ typedef __be32 rpc_fraghdr; #define RPC_MAX_HEADER_WITH_AUTH \ (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4)) +/* + * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. + */ +#define RPCBIND_NETID_UDP "udp" +#define RPCBIND_NETID_TCP "tcp" +#define RPCBIND_NETID_UDP6 "udp6" +#define RPCBIND_NETID_TCP6 "tcp6" + +/* + * Note that RFC 1833 does not put any size restrictions on the + * netid string, but all currently defined netid's fit in 4 bytes. + */ +#define RPCBIND_MAXNETIDLEN (4u) #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_MSGPROT_H_ */ diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h new file mode 100644 index 0000000..0013a0d --- /dev/null +++ b/include/linux/sunrpc/rpc_rdma.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_RPC_RDMA_H +#define _LINUX_SUNRPC_RPC_RDMA_H + +struct rpcrdma_segment { + uint32_t rs_handle; /* Registered memory handle */ + uint32_t rs_length; /* Length of the chunk in bytes */ + uint64_t rs_offset; /* Chunk virtual address or offset */ +}; + +/* + * read chunk(s), encoded as a linked list. + */ +struct rpcrdma_read_chunk { + uint32_t rc_discrim; /* 1 indicates presence */ + uint32_t rc_position; /* Position in XDR stream */ + struct rpcrdma_segment rc_target; +}; + +/* + * write chunk, and reply chunk. + */ +struct rpcrdma_write_chunk { + struct rpcrdma_segment wc_target; +}; + +/* + * write chunk(s), encoded as a counted array. + */ +struct rpcrdma_write_array { + uint32_t wc_discrim; /* 1 indicates presence */ + uint32_t wc_nchunks; /* Array count */ + struct rpcrdma_write_chunk wc_array[0]; +}; + +struct rpcrdma_msg { + uint32_t rm_xid; /* Mirrors the RPC header xid */ + uint32_t rm_vers; /* Version of this protocol */ + uint32_t rm_credit; /* Buffers requested/granted */ + uint32_t rm_type; /* Type of message (enum rpcrdma_proc) */ + union { + + struct { /* no chunks */ + uint32_t rm_empty[3]; /* 3 empty chunk lists */ + } rm_nochunks; + + struct { /* no chunks and padded */ + uint32_t rm_align; /* Padding alignment */ + uint32_t rm_thresh; /* Padding threshold */ + uint32_t rm_pempty[3]; /* 3 empty chunk lists */ + } rm_padded; + + uint32_t rm_chunks[0]; /* read, write and reply chunks */ + + } rm_body; +}; + +#define RPCRDMA_HDRLEN_MIN 28 + +enum rpcrdma_errcode { + ERR_VERS = 1, + ERR_CHUNK = 2 +}; + +struct rpcrdma_err_vers { + uint32_t rdma_vers_low; /* Version range supported by peer */ + uint32_t rdma_vers_high; +}; + +enum rpcrdma_proc { + RDMA_MSG = 0, /* An RPC call or reply msg */ + RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ + RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ + RDMA_DONE = 3, /* Client signals reply completion */ + RDMA_ERROR = 4 /* An RPC RDMA encoding error */ +}; + +#endif /* _LINUX_SUNRPC_RPC_RDMA_H */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index c6b53d1..0751c94 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -70,7 +70,10 @@ struct xdr_buf { struct page ** pages; /* Array of contiguous pages */ unsigned int page_base, /* Start of page data */ - page_len; /* Length of page data */ + page_len, /* Length of page data */ + flags; /* Flags for data disposition */ +#define XDRBUF_READ 0x01 /* target of file read */ +#define XDRBUF_WRITE 0x02 /* source of file write */ unsigned int buflen, /* Total length of storage buffer */ len; /* Length of XDR encoded message */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d11cedd..30b17b3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -19,25 +19,11 @@ #ifdef __KERNEL__ -extern unsigned int xprt_udp_slot_table_entries; -extern unsigned int xprt_tcp_slot_table_entries; - #define RPC_MIN_SLOT_TABLE (2U) #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE (128U) /* - * Parameters for choosing a free port - */ -extern unsigned int xprt_min_resvport; -extern unsigned int xprt_max_resvport; - -#define RPC_MIN_RESVPORT (1U) -#define RPC_MAX_RESVPORT (65535U) -#define RPC_DEF_MIN_RESVPORT (665U) -#define RPC_DEF_MAX_RESVPORT (1023U) - -/* * This describes a timeout strategy */ struct rpc_timeout { @@ -53,6 +39,10 @@ enum rpc_display_format_t { RPC_DISPLAY_PORT, RPC_DISPLAY_PROTO, RPC_DISPLAY_ALL, + RPC_DISPLAY_HEX_ADDR, + RPC_DISPLAY_HEX_PORT, + RPC_DISPLAY_UNIVERSAL_ADDR, + RPC_DISPLAY_NETID, RPC_DISPLAY_MAX, }; @@ -196,14 +186,22 @@ struct rpc_xprt { char * address_strings[RPC_DISPLAY_MAX]; }; -struct rpc_xprtsock_create { - int proto; /* IPPROTO_UDP or IPPROTO_TCP */ +struct xprt_create { + int ident; /* XPRT_TRANSPORT identifier */ struct sockaddr * srcaddr; /* optional local address */ struct sockaddr * dstaddr; /* remote peer address */ size_t addrlen; struct rpc_timeout * timeout; /* optional timeout parameters */ }; +struct xprt_class { + struct list_head list; + int ident; /* XPRT_TRANSPORT identifier */ + struct rpc_xprt * (*setup)(struct xprt_create *); + struct module *owner; + char name[32]; +}; + /* * Transport operations used by ULPs */ @@ -212,7 +210,7 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long /* * Generic internal transport functions */ -struct rpc_xprt * xprt_create_transport(struct rpc_xprtsock_create *args); +struct rpc_xprt *xprt_create_transport(struct xprt_create *args); void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_task *task); @@ -235,6 +233,8 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 * /* * Transport switch helper functions */ +int xprt_register_transport(struct xprt_class *type); +int xprt_unregister_transport(struct xprt_class *type); void xprt_set_retrans_timeout_def(struct rpc_task *task); void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); @@ -248,14 +248,6 @@ void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect(struct rpc_xprt *xprt); /* - * Socket transport setup operations - */ -struct rpc_xprt * xs_setup_udp(struct rpc_xprtsock_create *args); -struct rpc_xprt * xs_setup_tcp(struct rpc_xprtsock_create *args); -int init_socket_xprt(void); -void cleanup_socket_xprt(void); - -/* * Reserved bit positions in xprt->state */ #define XPRT_LOCKED (0) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h new file mode 100644 index 0000000..4de56b1 --- /dev/null +++ b/include/linux/sunrpc/xprtrdma.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRTRDMA_H +#define _LINUX_SUNRPC_XPRTRDMA_H + +/* + * RPC transport identifier for RDMA + */ +#define XPRT_TRANSPORT_RDMA 256 + +/* + * rpcbind (v3+) RDMA netid. + */ +#define RPCBIND_NETID_RDMA "rdma" + +/* + * Constants. Max RPC/NFS header is big enough to account for + * additional marshaling buffers passed down by Linux client. + * + * RDMA header is currently fixed max size, and is big enough for a + * fully-chunked NFS message (read chunks are the largest). Note only + * a single chunk type per message is supported currently. + */ +#define RPCRDMA_MIN_SLOT_TABLE (2U) +#define RPCRDMA_DEF_SLOT_TABLE (32U) +#define RPCRDMA_MAX_SLOT_TABLE (256U) + +#define RPCRDMA_DEF_INLINE (1024) /* default inline max */ + +#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ + +#define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + +/* memory registration strategies */ +#define RPCRDMA_PERSISTENT_REGISTRATION (1) + +enum rpcrdma_memreg { + RPCRDMA_BOUNCEBUFFERS = 0, + RPCRDMA_REGISTER, + RPCRDMA_MEMWINDOWS, + RPCRDMA_MEMWINDOWS_ASYNC, + RPCRDMA_MTHCAFMR, + RPCRDMA_ALLPHYSICAL, + RPCRDMA_LAST +}; + +#endif /* _LINUX_SUNRPC_XPRTRDMA_H */ diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h new file mode 100644 index 0000000..2c6c2c2 --- /dev/null +++ b/include/linux/sunrpc/xprtsock.h @@ -0,0 +1,51 @@ +/* + * linux/include/linux/sunrpc/xprtsock.h + * + * Declarations for the RPC transport socket provider. + */ + +#ifndef _LINUX_SUNRPC_XPRTSOCK_H +#define _LINUX_SUNRPC_XPRTSOCK_H + +#ifdef __KERNEL__ + +/* + * Socket transport setup operations + */ +struct rpc_xprt *xs_setup_udp(struct xprt_create *args); +struct rpc_xprt *xs_setup_tcp(struct xprt_create *args); + +int init_socket_xprt(void); +void cleanup_socket_xprt(void); + +/* + * RPC transport identifiers for UDP, TCP + * + * To preserve compatibility with the historical use of raw IP protocol + * id's for transport selection, these are specified with the previous + * values. No such restriction exists for new transports, except that + * they may not collide with these values (17 and 6, respectively). + */ +#define XPRT_TRANSPORT_UDP IPPROTO_UDP +#define XPRT_TRANSPORT_TCP IPPROTO_TCP + +/* + * RPC slot table sizes for UDP, TCP transports + */ +extern unsigned int xprt_udp_slot_table_entries; +extern unsigned int xprt_tcp_slot_table_entries; + +/* + * Parameters for choosing a free port + */ +extern unsigned int xprt_min_resvport; +extern unsigned int xprt_max_resvport; + +#define RPC_MIN_RESVPORT (1U) +#define RPC_MAX_RESVPORT (65535U) +#define RPC_DEF_MIN_RESVPORT (665U) +#define RPC_DEF_MAX_RESVPORT (1023U) + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4ef4d22..835cc85 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -61,8 +61,6 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ - - void *fs_private; /* For use by ->writepages() */ }; /* diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 04f3ffb..0ae703c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1525,6 +1525,7 @@ add_names: context->names[idx].ino = (unsigned long)-1; } } +EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 8ebfc4d..5c69a72 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 52429b1..6cdf53c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -127,7 +127,14 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s struct rpc_clnt *clnt = NULL; struct rpc_auth *auth; int err; - int len; + size_t len; + + /* sanity check the name before trying to print it */ + err = -EINVAL; + len = strlen(servname); + if (len > RPC_MAXNETNAMELEN) + goto out_no_rpciod; + len++; dprintk("RPC: creating %s client for %s (xprt %p)\n", program->name, servname, xprt); @@ -148,7 +155,6 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s clnt->cl_parent = clnt; clnt->cl_server = clnt->cl_inline_name; - len = strlen(servname) + 1; if (len > sizeof(clnt->cl_inline_name)) { char *buf = kmalloc(len, GFP_KERNEL); if (buf != 0) @@ -234,8 +240,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) { struct rpc_xprt *xprt; struct rpc_clnt *clnt; - struct rpc_xprtsock_create xprtargs = { - .proto = args->protocol, + struct xprt_create xprtargs = { + .ident = args->protocol, .srcaddr = args->saddress, .dstaddr = args->address, .addrlen = args->addrsize, @@ -269,9 +275,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; - dprintk("RPC: creating %s client for %s (xprt %p)\n", - args->program->name, args->servername, xprt); - clnt = rpc_new_client(xprt, args->servername, args->program, args->version, args->authflavor); if (IS_ERR(clnt)) @@ -439,7 +442,7 @@ rpc_release_client(struct rpc_clnt *clnt) */ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, struct rpc_program *program, - int vers) + u32 vers) { struct rpc_clnt *clnt; struct rpc_version *version; @@ -871,6 +874,7 @@ rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) buf->head[0].iov_len = len; buf->tail[0].iov_len = 0; buf->page_len = 0; + buf->flags = 0; buf->len = 0; buf->buflen = len; } @@ -937,7 +941,7 @@ call_bind(struct rpc_task *task) static void call_bind_status(struct rpc_task *task) { - int status = -EACCES; + int status = -EIO; if (task->tk_status >= 0) { dprint_status(task); @@ -947,9 +951,20 @@ call_bind_status(struct rpc_task *task) } switch (task->tk_status) { + case -EAGAIN: + dprintk("RPC: %5u rpcbind waiting for another request " + "to finish\n", task->tk_pid); + /* avoid busy-waiting here -- could be a network outage. */ + rpc_delay(task, 5*HZ); + goto retry_timeout; case -EACCES: dprintk("RPC: %5u remote rpcbind: RPC program/version " "unavailable\n", task->tk_pid); + /* fail immediately if this is an RPC ping */ + if (task->tk_msg.rpc_proc->p_proc == 0) { + status = -EOPNOTSUPP; + break; + } rpc_delay(task, 3*HZ); goto retry_timeout; case -ETIMEDOUT: @@ -957,6 +972,7 @@ call_bind_status(struct rpc_task *task) task->tk_pid); goto retry_timeout; case -EPFNOSUPPORT: + /* server doesn't support any rpcbind version we know of */ dprintk("RPC: %5u remote rpcbind service unavailable\n", task->tk_pid); break; @@ -969,7 +985,6 @@ call_bind_status(struct rpc_task *task) default: dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", task->tk_pid, -task->tk_status); - status = -EIO; } rpc_exit(task, status); @@ -1523,13 +1538,18 @@ void rpc_show_tasks(void) spin_lock(&clnt->cl_lock); list_for_each_entry(t, &clnt->cl_tasks, tk_task) { const char *rpc_waitq = "none"; + int proc; + + if (t->tk_msg.rpc_proc) + proc = t->tk_msg.rpc_proc->p_proc; + else + proc = -1; if (RPC_IS_QUEUED(t)) rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", - t->tk_pid, - (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), + t->tk_pid, proc, t->tk_flags, t->tk_status, t->tk_client, (t->tk_client ? t->tk_client->cl_prog : 0), diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 669e12a..ae83ac8 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -585,6 +585,7 @@ rpc_populate(struct dentry *parent, if (S_ISDIR(mode)) inc_nlink(dir); d_add(dentry, inode); + fsnotify_create(dir, dentry); } mutex_unlock(&dir->i_mutex); return 0; @@ -606,7 +607,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry) inode->i_ino = iunique(dir->i_sb, 100); d_instantiate(dentry, inode); inc_nlink(dir); - inode_dir_notify(dir, DN_CREATE); + fsnotify_mkdir(dir, dentry); return 0; out_err: printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", @@ -748,7 +749,7 @@ rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pi rpci->flags = flags; rpci->ops = ops; rpci->nkern_readwriters = 1; - inode_dir_notify(dir, DN_CREATE); + fsnotify_create(dir, dentry); dget(dentry); out: mutex_unlock(&dir->i_mutex); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index d1740db..8db3d70 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -16,11 +16,14 @@ #include #include +#include +#include #include #include #include #include +#include #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_BIND @@ -91,26 +94,6 @@ enum { #define RPCB_MAXADDRLEN (128u) /* - * r_netid - * - * Quoting RFC 3530, section 2.2: - * - * For TCP over IPv4 the value of r_netid is the string "tcp". For UDP - * over IPv4 the value of r_netid is the string "udp". - * - * ... - * - * For TCP over IPv6 the value of r_netid is the string "tcp6". For UDP - * over IPv6 the value of r_netid is the string "udp6". - */ -#define RPCB_NETID_UDP "\165\144\160" /* "udp" */ -#define RPCB_NETID_TCP "\164\143\160" /* "tcp" */ -#define RPCB_NETID_UDP6 "\165\144\160\066" /* "udp6" */ -#define RPCB_NETID_TCP6 "\164\143\160\066" /* "tcp6" */ - -#define RPCB_MAXNETIDLEN (4u) - -/* * r_owner * * The "owner" is allowed to unset a service in the rpcbind database. @@ -137,10 +120,13 @@ struct rpcbind_args { static struct rpc_procinfo rpcb_procedures2[]; static struct rpc_procinfo rpcb_procedures3[]; -static struct rpcb_info { +struct rpcb_info { int rpc_vers; struct rpc_procinfo * rpc_proc; -} rpcb_next_version[]; +}; + +static struct rpcb_info rpcb_next_version[]; +static struct rpcb_info rpcb_next_version6[]; static void rpcb_getport_prepare(struct rpc_task *task, void *calldata) { @@ -190,7 +176,17 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, RPC_CLNT_CREATE_INTR), }; - ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); + switch (srvaddr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); + break; + case AF_INET6: + ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); + break; + default: + return NULL; + } + if (!privileged) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; return rpc_create(&args); @@ -234,7 +230,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) prog, vers, prot, port); rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, - IPPROTO_UDP, 2, 1); + XPRT_TRANSPORT_UDP, 2, 1); if (IS_ERR(rpcb_clnt)) return PTR_ERR(rpcb_clnt); @@ -316,6 +312,7 @@ void rpcb_getport_async(struct rpc_task *task) struct rpc_task *child; struct sockaddr addr; int status; + struct rpcb_info *info; dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __FUNCTION__, @@ -325,7 +322,7 @@ void rpcb_getport_async(struct rpc_task *task) BUG_ON(clnt->cl_parent != clnt); if (xprt_test_and_set_binding(xprt)) { - status = -EACCES; /* tell caller to check again */ + status = -EAGAIN; /* tell caller to check again */ dprintk("RPC: %5u %s: waiting for another binder\n", task->tk_pid, __FUNCTION__); goto bailout_nowake; @@ -343,18 +340,43 @@ void rpcb_getport_async(struct rpc_task *task) goto bailout_nofree; } - if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) { + rpc_peeraddr(clnt, (void *)&addr, sizeof(addr)); + + /* Don't ever use rpcbind v2 for AF_INET6 requests */ + switch (addr.sa_family) { + case AF_INET: + info = rpcb_next_version; + break; + case AF_INET6: + info = rpcb_next_version6; + break; + default: + status = -EAFNOSUPPORT; + dprintk("RPC: %5u %s: bad address family\n", + task->tk_pid, __FUNCTION__); + goto bailout_nofree; + } + if (info[xprt->bind_index].rpc_proc == NULL) { xprt->bind_index = 0; - status = -EACCES; /* tell caller to try again later */ + status = -EPFNOSUPPORT; dprintk("RPC: %5u %s: no more getport versions available\n", task->tk_pid, __FUNCTION__); goto bailout_nofree; } - bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; + bind_version = info[xprt->bind_index].rpc_vers; dprintk("RPC: %5u %s: trying rpcbind version %u\n", task->tk_pid, __FUNCTION__, bind_version); + rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, + bind_version, 0); + if (IS_ERR(rpcb_clnt)) { + status = PTR_ERR(rpcb_clnt); + dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", + task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt)); + goto bailout_nofree; + } + map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); if (!map) { status = -ENOMEM; @@ -367,28 +389,19 @@ void rpcb_getport_async(struct rpc_task *task) map->r_prot = xprt->prot; map->r_port = 0; map->r_xprt = xprt_get(xprt); - map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP : - RPCB_NETID_UDP; - memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR), - sizeof(map->r_addr)); + map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); + memcpy(map->r_addr, + rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR), + sizeof(map->r_addr)); map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ - rpc_peeraddr(clnt, (void *)&addr, sizeof(addr)); - rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0); - if (IS_ERR(rpcb_clnt)) { - status = PTR_ERR(rpcb_clnt); - dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", - task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt)); - goto bailout; - } - child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map); rpc_release_client(rpcb_clnt); if (IS_ERR(child)) { status = -EIO; dprintk("RPC: %5u %s: rpc_run_task failed\n", task->tk_pid, __FUNCTION__); - goto bailout_nofree; + goto bailout; } rpc_put_task(child); @@ -403,6 +416,7 @@ bailout_nofree: bailout_nowake: task->tk_status = status; } +EXPORT_SYMBOL_GPL(rpcb_getport_async); /* * Rpcbind child task calls this callback via tk_exit. @@ -413,6 +427,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) struct rpc_xprt *xprt = map->r_xprt; int status = child->tk_status; + /* Garbage reply: retry with a lesser rpcbind version */ + if (status == -EIO) + status = -EPROTONOSUPPORT; + /* rpcbind server doesn't support this rpcbind protocol version */ if (status == -EPROTONOSUPPORT) xprt->bind_index++; @@ -490,16 +508,24 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, unsigned short *portp) { char *addr; - int addr_len, c, i, f, first, val; + u32 addr_len; + int c, i, f, first, val; *portp = 0; - addr_len = (unsigned int) ntohl(*p++); - if (addr_len > RPCB_MAXADDRLEN) /* sanity */ - return -EINVAL; - - dprintk("RPC: rpcb_decode_getaddr returned string: '%s'\n", - (char *) p); - + addr_len = ntohl(*p++); + + /* + * Simple sanity check. The smallest possible universal + * address is an IPv4 address string containing 11 bytes. + */ + if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN) + goto out_err; + + /* + * Start at the end and walk backwards until the first dot + * is encountered. When the second dot is found, we have + * both parts of the port number. + */ addr = (char *)p; val = 0; first = 1; @@ -521,8 +547,19 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, } } + /* + * Simple sanity check. If we never saw a dot in the reply, + * then this was probably just garbage. + */ + if (first) + goto out_err; + dprintk("RPC: rpcb_decode_getaddr port=%u\n", *portp); return 0; + +out_err: + dprintk("RPC: rpcbind server returned malformed reply\n"); + return -EIO; } #define RPCB_program_sz (1u) @@ -531,7 +568,7 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p, #define RPCB_port_sz (1u) #define RPCB_boolean_sz (1u) -#define RPCB_netid_sz (1+XDR_QUADLEN(RPCB_MAXNETIDLEN)) +#define RPCB_netid_sz (1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN)) #define RPCB_addr_sz (1+XDR_QUADLEN(RPCB_MAXADDRLEN)) #define RPCB_ownerstring_sz (1+XDR_QUADLEN(RPCB_MAXOWNERLEN)) @@ -593,6 +630,14 @@ static struct rpcb_info rpcb_next_version[] = { { 0, NULL }, }; +static struct rpcb_info rpcb_next_version6[] = { +#ifdef CONFIG_SUNRPC_BIND34 + { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] }, + { 3, &rpcb_procedures3[RPCBPROC_GETADDR] }, +#endif + { 0, NULL }, +}; + static struct rpc_version rpcb_version2 = { .number = 2, .nrprocs = RPCB_HIGHPROC_2, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 954d7ec..3c773c5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -777,6 +777,7 @@ void *rpc_malloc(struct rpc_task *task, size_t size) task->tk_pid, size, buf); return &buf->data; } +EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free buffer allocated via rpc_malloc @@ -802,6 +803,7 @@ void rpc_free(void *buffer) else kfree(buf); } +EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1d377d1..97ac45f 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -34,6 +34,7 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) desc->offset += len; return len; } +EXPORT_SYMBOL_GPL(xdr_skb_read_bits); /** * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer @@ -137,6 +138,7 @@ copy_tail: out: return copied; } +EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); /** * csum_partial_copy_to_xdr - checksum and copy data @@ -179,3 +181,4 @@ no_checksum: return -1; return 0; } +EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 384c4ad..33d89e8 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -20,7 +20,7 @@ #include #include #include - +#include /* RPC scheduler */ EXPORT_SYMBOL(rpc_execute); diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c index 8142fdb..31becbf 100644 --- a/net/sunrpc/timer.c +++ b/net/sunrpc/timer.c @@ -17,6 +17,7 @@ #include #include +#include #include @@ -40,6 +41,7 @@ rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo) rt->ntimeouts[i] = 0; } } +EXPORT_SYMBOL_GPL(rpc_init_rtt); /* * NB: When computing the smoothed RTT and standard deviation, @@ -75,6 +77,7 @@ rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m) if (*sdrtt < RPC_RTO_MIN) *sdrtt = RPC_RTO_MIN; } +EXPORT_SYMBOL_GPL(rpc_update_rtt); /* * Estimate rto for an nfs rpc sent via. an unreliable datagram. @@ -103,3 +106,4 @@ rpc_calc_rto(struct rpc_rtt *rt, unsigned timer) return res; } +EXPORT_SYMBOL_GPL(rpc_calc_rto); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index c8c2edc..282a9a2 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -62,6 +62,9 @@ static inline void do_xprt_reserve(struct rpc_task *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); +static spinlock_t xprt_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(xprt_list); + /* * The transport code maintains an estimate on the maximum number of out- * standing RPC requests, using a smoothed version of the congestion @@ -81,6 +84,78 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) /** + * xprt_register_transport - register a transport implementation + * @transport: transport to register + * + * If a transport implementation is loaded as a kernel module, it can + * call this interface to make itself known to the RPC client. + * + * Returns: + * 0: transport successfully registered + * -EEXIST: transport already registered + * -EINVAL: transport module being unloaded + */ +int xprt_register_transport(struct xprt_class *transport) +{ + struct xprt_class *t; + int result; + + result = -EEXIST; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + /* don't register the same transport class twice */ + if (t->ident == transport->ident) + goto out; + } + + result = -EINVAL; + if (try_module_get(THIS_MODULE)) { + list_add_tail(&transport->list, &xprt_list); + printk(KERN_INFO "RPC: Registered %s transport module.\n", + transport->name); + result = 0; + } + +out: + spin_unlock(&xprt_list_lock); + return result; +} +EXPORT_SYMBOL_GPL(xprt_register_transport); + +/** + * xprt_unregister_transport - unregister a transport implementation + * transport: transport to unregister + * + * Returns: + * 0: transport successfully unregistered + * -ENOENT: transport never registered + */ +int xprt_unregister_transport(struct xprt_class *transport) +{ + struct xprt_class *t; + int result; + + result = 0; + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (t == transport) { + printk(KERN_INFO + "RPC: Unregistered %s transport module.\n", + transport->name); + list_del_init(&transport->list); + module_put(THIS_MODULE); + goto out; + } + } + result = -ENOENT; + +out: + spin_unlock(&xprt_list_lock); + return result; +} +EXPORT_SYMBOL_GPL(xprt_unregister_transport); + +/** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport * @@ -118,6 +193,7 @@ out_sleep: rpc_sleep_on(&xprt->sending, task, NULL, NULL); return 0; } +EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static void xprt_clear_locked(struct rpc_xprt *xprt) { @@ -167,6 +243,7 @@ out_sleep: rpc_sleep_on(&xprt->sending, task, NULL, NULL); return 0; } +EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -246,6 +323,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) __xprt_lock_write_next(xprt); } } +EXPORT_SYMBOL_GPL(xprt_release_xprt); /** * xprt_release_xprt_cong - allow other requests to use a transport @@ -262,6 +340,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) __xprt_lock_write_next_cong(xprt); } } +EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -314,6 +393,7 @@ void xprt_release_rqst_cong(struct rpc_task *task) { __xprt_put_cong(task->tk_xprt, task->tk_rqstp); } +EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); /** * xprt_adjust_cwnd - adjust transport congestion window @@ -345,6 +425,7 @@ void xprt_adjust_cwnd(struct rpc_task *task, int result) xprt->cwnd = cwnd; __xprt_put_cong(xprt, req); } +EXPORT_SYMBOL_GPL(xprt_adjust_cwnd); /** * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue @@ -359,6 +440,7 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) else rpc_wake_up(&xprt->pending); } +EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear @@ -373,6 +455,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task) task->tk_timeout = req->rq_timeout; rpc_sleep_on(&xprt->pending, task, NULL, NULL); } +EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); /** * xprt_write_space - wake the task waiting for transport output buffer space @@ -393,6 +476,7 @@ void xprt_write_space(struct rpc_xprt *xprt) } spin_unlock_bh(&xprt->transport_lock); } +EXPORT_SYMBOL_GPL(xprt_write_space); /** * xprt_set_retrans_timeout_def - set a request's retransmit timeout @@ -406,6 +490,7 @@ void xprt_set_retrans_timeout_def(struct rpc_task *task) { task->tk_timeout = task->tk_rqstp->rq_timeout; } +EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); /* * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout @@ -425,6 +510,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task) if (task->tk_timeout > max_timeout || task->tk_timeout == 0) task->tk_timeout = max_timeout; } +EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt); static void xprt_reset_majortimeo(struct rpc_rqst *req) { @@ -500,6 +586,7 @@ void xprt_disconnect(struct rpc_xprt *xprt) xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } +EXPORT_SYMBOL_GPL(xprt_disconnect); static void xprt_init_autodisconnect(unsigned long data) @@ -610,6 +697,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) xprt->stat.bad_xids++; return NULL; } +EXPORT_SYMBOL_GPL(xprt_lookup_rqst); /** * xprt_update_rtt - update an RPC client's RTT state after receiving a reply @@ -629,6 +717,7 @@ void xprt_update_rtt(struct rpc_task *task) rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } +EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete @@ -653,6 +742,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) req->rq_received = req->rq_private_buf.len = copied; rpc_wake_up_task(task); } +EXPORT_SYMBOL_GPL(xprt_complete_rqst); static void xprt_timer(struct rpc_task *task) { @@ -889,23 +979,25 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i * @args: rpc transport creation arguments * */ -struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args) +struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; struct rpc_rqst *req; + struct xprt_class *t; - switch (args->proto) { - case IPPROTO_UDP: - xprt = xs_setup_udp(args); - break; - case IPPROTO_TCP: - xprt = xs_setup_tcp(args); - break; - default: - printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n", - args->proto); - return ERR_PTR(-EIO); + spin_lock(&xprt_list_lock); + list_for_each_entry(t, &xprt_list, list) { + if (t->ident == args->ident) { + spin_unlock(&xprt_list_lock); + goto found; + } } + spin_unlock(&xprt_list_lock); + printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); + +found: + xprt = t->setup(args); if (IS_ERR(xprt)) { dprintk("RPC: xprt_create_transport: failed, %ld\n", -PTR_ERR(xprt)); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile new file mode 100644 index 0000000..264f0fe --- /dev/null +++ b/net/sunrpc/xprtrdma/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o + +xprtrdma-y := transport.o rpc_rdma.o verbs.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c new file mode 100644 index 0000000..12db635 --- /dev/null +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -0,0 +1,868 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * rpc_rdma.c + * + * This file contains the guts of the RPC RDMA protocol, and + * does marshaling/unmarshaling, etc. It is also where interfacing + * to the Linux RPC framework lives. + */ + +#include "xprt_rdma.h" + +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + +#ifdef RPC_DEBUG +static const char transfertypes[][12] = { + "pure inline", /* no chunks */ + " read chunk", /* some argument via rdma read */ + "*read chunk", /* entire request via rdma read */ + "write chunk", /* some result via rdma write */ + "reply chunk" /* entire reply via rdma write */ +}; +#endif + +/* + * Chunk assembly from upper layer xdr_buf. + * + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk + * elements. Segments are then coalesced when registered, if possible + * within the selected memreg mode. + * + * Note, this routine is never called if the connection's memory + * registration strategy is 0 (bounce buffers). + */ + +static int +rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos, + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) +{ + int len, n = 0, p; + + if (pos == 0 && xdrbuf->head[0].iov_len) { + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->head[0].iov_base; + seg[n].mr_len = xdrbuf->head[0].iov_len; + pos += xdrbuf->head[0].iov_len; + ++n; + } + + if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) { + if (n == nsegs) + return 0; + seg[n].mr_page = xdrbuf->pages[0]; + seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base; + seg[n].mr_len = min_t(u32, + PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len); + len = xdrbuf->page_len - seg[n].mr_len; + pos += len; + ++n; + p = 1; + while (len > 0) { + if (n == nsegs) + return 0; + seg[n].mr_page = xdrbuf->pages[p]; + seg[n].mr_offset = NULL; + seg[n].mr_len = min_t(u32, PAGE_SIZE, len); + len -= seg[n].mr_len; + ++n; + ++p; + } + } + + if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) { + if (n == nsegs) + return 0; + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->tail[0].iov_base; + seg[n].mr_len = xdrbuf->tail[0].iov_len; + pos += xdrbuf->tail[0].iov_len; + ++n; + } + + if (pos < xdrbuf->len) + dprintk("RPC: %s: marshaled only %d of %d\n", + __func__, pos, xdrbuf->len); + + return n; +} + +/* + * Create read/write chunk lists, and reply chunks, for RDMA + * + * Assume check against THRESHOLD has been done, and chunks are required. + * Assume only encoding one list entry for read|write chunks. The NFSv3 + * protocol is simple enough to allow this as it only has a single "bulk + * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The + * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) + * + * When used for a single reply chunk (which is a special write + * chunk used for the entire reply, rather than just the data), it + * is used primarily for READDIR and READLINK which would otherwise + * be severely size-limited by a small rdma inline read max. The server + * response will come back as an RDMA Write, followed by a message + * of type RDMA_NOMSG carrying the xid and length. As a result, reply + * chunks do not provide data alignment, however they do not require + * "fixup" (moving the response to the upper layer buffer) either. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + */ + +static unsigned int +rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt); + int nsegs, nchunks = 0; + int pos; + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_read_chunk *cur_rchunk = NULL; + struct rpcrdma_write_array *warray = NULL; + struct rpcrdma_write_chunk *cur_wchunk = NULL; + u32 *iptr = headerp->rm_body.rm_chunks; + + if (type == rpcrdma_readch || type == rpcrdma_areadch) { + /* a read chunk - server will RDMA Read our memory */ + cur_rchunk = (struct rpcrdma_read_chunk *) iptr; + } else { + /* a write or reply chunk - server will RDMA Write our memory */ + *iptr++ = xdr_zero; /* encode a NULL read chunk list */ + if (type == rpcrdma_replych) + *iptr++ = xdr_zero; /* a NULL write chunk list */ + warray = (struct rpcrdma_write_array *) iptr; + cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); + } + + if (type == rpcrdma_replych || type == rpcrdma_areadch) + pos = 0; + else + pos = target->head[0].iov_len; + + nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); + if (nsegs == 0) + return 0; + + do { + /* bind/register the memory, then build chunk from result. */ + int n = rpcrdma_register_external(seg, nsegs, + cur_wchunk != NULL, r_xprt); + if (n <= 0) + goto out; + if (cur_rchunk) { /* read */ + cur_rchunk->rc_discrim = xdr_one; + /* all read chunks have the same "position" */ + cur_rchunk->rc_position = htonl(pos); + cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); + cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (u32 *)&cur_rchunk->rc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: read chunk " + "elem %d@0x%llx:0x%x pos %d (%s)\n", __func__, + seg->mr_len, seg->mr_base, seg->mr_rkey, pos, + n < nsegs ? "more" : "last"); + cur_rchunk++; + r_xprt->rx_stats.read_chunk_count++; + } else { /* write/reply */ + cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); + cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); + xdr_encode_hyper( + (u32 *)&cur_wchunk->wc_target.rs_offset, + seg->mr_base); + dprintk("RPC: %s: %s chunk " + "elem %d@0x%llx:0x%x (%s)\n", __func__, + (type == rpcrdma_replych) ? "reply" : "write", + seg->mr_len, seg->mr_base, seg->mr_rkey, + n < nsegs ? "more" : "last"); + cur_wchunk++; + if (type == rpcrdma_replych) + r_xprt->rx_stats.reply_chunk_count++; + else + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + } + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + + /* success. all failures return above */ + req->rl_nchunks = nchunks; + + BUG_ON(nchunks == 0); + + /* + * finish off header. If write, marshal discrim and nchunks. + */ + if (cur_rchunk) { + iptr = (u32 *) cur_rchunk; + *iptr++ = xdr_zero; /* finish the read chunk list */ + *iptr++ = xdr_zero; /* encode a NULL write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } else { + warray->wc_discrim = xdr_one; + warray->wc_nchunks = htonl(nchunks); + iptr = (u32 *) cur_wchunk; + if (type == rpcrdma_writech) { + *iptr++ = xdr_zero; /* finish the write chunk list */ + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ + } + } + + /* + * Return header size. + */ + return (unsigned char *)iptr - (unsigned char *)headerp; + +out: + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt, NULL); + return 0; +} + +/* + * Copy write data inline. + * This function is used for "small" requests. Data which is passed + * to RPC via iovecs (or page list) is copied directly into the + * pre-registered memory buffer for this request. For small amounts + * of data, this is efficient. The cutoff value is tunable. + */ +static int +rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +{ + int i, npages, curlen; + int copy_len; + unsigned char *srcp, *destp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + + destp = rqst->rq_svec[0].iov_base; + curlen = rqst->rq_svec[0].iov_len; + destp += curlen; + /* + * Do optional padding where it makes sense. Alignment of write + * payload can help the server, if our setting is accurate. + */ + pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); + if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) + pad = 0; /* don't pad this request */ + + dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", + __func__, pad, destp, rqst->rq_slen, curlen); + + copy_len = rqst->rq_snd_buf.page_len; + r_xprt->rx_stats.pullup_copy_count += copy_len; + npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; + for (i = 0; copy_len && i < npages; i++) { + if (i == 0) + curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base; + else + curlen = PAGE_SIZE; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", + __func__, i, destp, copy_len, curlen); + srcp = kmap_atomic(rqst->rq_snd_buf.pages[i], + KM_SKB_SUNRPC_DATA); + if (i == 0) + memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen); + else + memcpy(destp, srcp, curlen); + kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); + rqst->rq_svec[0].iov_len += curlen; + destp += curlen; + copy_len -= curlen; + } + if (rqst->rq_snd_buf.tail[0].iov_len) { + curlen = rqst->rq_snd_buf.tail[0].iov_len; + if (destp != rqst->rq_snd_buf.tail[0].iov_base) { + memcpy(destp, + rqst->rq_snd_buf.tail[0].iov_base, curlen); + r_xprt->rx_stats.pullup_copy_count += curlen; + } + dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", + __func__, destp, copy_len, curlen); + rqst->rq_svec[0].iov_len += curlen; + } + /* header now contains entire send message */ + return pad; +} + +/* + * Marshal a request: the primary job of this routine is to choose + * the transfer modes. See comments below. + * + * Uses multiple RDMA IOVs for a request: + * [0] -- RPC RDMA header, which uses memory from the *start* of the + * preregistered buffer that already holds the RPC data in + * its middle. + * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. + * [2] -- optional padding. + * [3] -- if padded, header only in [1] and data here. + */ + +int +rpcrdma_marshal_req(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + char *base; + size_t hdrlen, rpclen, padlen; + enum rpcrdma_chunktype rtype, wtype; + struct rpcrdma_msg *headerp; + + /* + * rpclen gets amount of data in first buffer, which is the + * pre-registered buffer. + */ + base = rqst->rq_svec[0].iov_base; + rpclen = rqst->rq_svec[0].iov_len; + + /* build RDMA header in private area at front */ + headerp = (struct rpcrdma_msg *) req->rl_base; + /* don't htonl XID, it's already done in request */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = xdr_one; + headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); + headerp->rm_type = __constant_htonl(RDMA_MSG); + + /* + * Chunks needed for results? + * + * o If the expected result is under the inline threshold, all ops + * return as inline (but see later). + * o Large non-read ops return as a single reply chunk. + * o Large read ops return data as write chunk(s), header as inline. + * + * Note: the NFS code sending down multiple result segments implies + * the op is one of read, readdir[plus], readlink or NFSv4 getacl. + */ + + /* + * This code can handle read chunks, write chunks OR reply + * chunks -- only one type. If the request is too big to fit + * inline, then we will choose read chunks. If the request is + * a READ, then use write chunks to separate the file data + * into pages; otherwise use reply chunks. + */ + if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) + wtype = rpcrdma_noch; + else if (rqst->rq_rcv_buf.page_len == 0) + wtype = rpcrdma_replych; + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; + else + wtype = rpcrdma_replych; + + /* + * Chunks needed for arguments? + * + * o If the total request is under the inline threshold, all ops + * are sent as inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). + * o Large write ops transmit data as read chunk(s), header as + * inline. + * + * Note: the NFS code sending down multiple argument segments + * implies the op is a write. + * TBD check NFSv4 setacl + */ + if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + rtype = rpcrdma_noch; + else if (rqst->rq_snd_buf.page_len == 0) + rtype = rpcrdma_areadch; + else + rtype = rpcrdma_readch; + + /* The following simplification is not true forever */ + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); + + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && + (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { + /* forced to "pure inline"? */ + dprintk("RPC: %s: too much data (%d/%d) for inline\n", + __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); + return -1; + } + + hdrlen = 28; /*sizeof *headerp;*/ + padlen = 0; + + /* + * Pull up any extra send data into the preregistered buffer. + * When padding is in use and applies to the transfer, insert + * it and change the message type. + */ + if (rtype == rpcrdma_noch) { + + padlen = rpcrdma_inline_pullup(rqst, + RPCRDMA_INLINE_PAD_VALUE(rqst)); + + if (padlen) { + headerp->rm_type = __constant_htonl(RDMA_MSGP); + headerp->rm_body.rm_padded.rm_align = + htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); + headerp->rm_body.rm_padded.rm_thresh = + __constant_htonl(RPCRDMA_INLINE_PAD_THRESH); + headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; + hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ + BUG_ON(wtype != rpcrdma_noch); + + } else { + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + /* + * Currently we try to not actually use read inline. + * Reply chunks have the desirable property that + * they land, packed, directly in the target buffers + * without headers, so they require no fixup. The + * additional RDMA Write op sends the same amount + * of data, streams on-the-wire and adds no overhead + * on receive. Therefore, we request a reply chunk + * for non-writes wherever feasible and efficient. + */ + if (wtype == rpcrdma_noch && + r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) + wtype = rpcrdma_replych; + } + } + + /* + * Marshal chunks. This routine will return the header length + * consumed by marshaling. + */ + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, + &rqst->rq_snd_buf, headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, + &rqst->rq_rcv_buf, headerp, wtype); + } + + if (hdrlen == 0) + return -1; + + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" + " headerp 0x%p base 0x%p lkey 0x%x\n", + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + headerp, base, req->rl_iov.lkey); + + /* + * initialize send_iov's - normally only two: rdma chunk header and + * single preregistered RPC header buffer, but if padding is present, + * then use a preregistered (and zeroed) pad buffer between the RPC + * header and any write data. In all non-rdma cases, any following + * data has been copied into the RPC header buffer. + */ + req->rl_send_iov[0].addr = req->rl_iov.addr; + req->rl_send_iov[0].length = hdrlen; + req->rl_send_iov[0].lkey = req->rl_iov.lkey; + + req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); + req->rl_send_iov[1].length = rpclen; + req->rl_send_iov[1].lkey = req->rl_iov.lkey; + + req->rl_niovs = 2; + + if (padlen) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + + req->rl_send_iov[2].addr = ep->rep_pad.addr; + req->rl_send_iov[2].length = padlen; + req->rl_send_iov[2].lkey = ep->rep_pad.lkey; + + req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; + req->rl_send_iov[3].length = rqst->rq_slen - rpclen; + req->rl_send_iov[3].lkey = req->rl_iov.lkey; + + req->rl_niovs = 4; + } + + return 0; +} + +/* + * Chase down a received write or reply chunklist to get length + * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) + */ +static int +rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp) +{ + unsigned int i, total_len; + struct rpcrdma_write_chunk *cur_wchunk; + + i = ntohl(**iptrp); /* get array count */ + if (i > max) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); + total_len = 0; + while (i--) { + struct rpcrdma_segment *seg = &cur_wchunk->wc_target; + ifdebug(FACILITY) { + u64 off; + xdr_decode_hyper((u32 *)&seg->rs_offset, &off); + dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", + __func__, + ntohl(seg->rs_length), + off, + ntohl(seg->rs_handle)); + } + total_len += ntohl(seg->rs_length); + ++cur_wchunk; + } + /* check and adjust for properly terminated write chunk */ + if (wrchunk) { + u32 *w = (u32 *) cur_wchunk; + if (*w++ != xdr_zero) + return -1; + cur_wchunk = (struct rpcrdma_write_chunk *) w; + } + if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) + return -1; + + *iptrp = (u32 *) cur_wchunk; + return total_len; +} + +/* + * Scatter inline received data back into provided iov's. + */ +static void +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) +{ + int i, npages, curlen, olen; + char *destp; + + curlen = rqst->rq_rcv_buf.head[0].iov_len; + if (curlen > copy_len) { /* write chunk header fixup */ + curlen = copy_len; + rqst->rq_rcv_buf.head[0].iov_len = curlen; + } + + dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", + __func__, srcp, copy_len, curlen); + + /* Shift pointer for first receive segment only */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + srcp += curlen; + copy_len -= curlen; + + olen = copy_len; + i = 0; + rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; + if (copy_len && rqst->rq_rcv_buf.page_len) { + npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base + + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; + for (; i < npages; i++) { + if (i == 0) + curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base; + else + curlen = PAGE_SIZE; + if (curlen > copy_len) + curlen = copy_len; + dprintk("RPC: %s: page %d" + " srcp 0x%p len %d curlen %d\n", + __func__, i, srcp, copy_len, curlen); + destp = kmap_atomic(rqst->rq_rcv_buf.pages[i], + KM_SKB_SUNRPC_DATA); + if (i == 0) + memcpy(destp + rqst->rq_rcv_buf.page_base, + srcp, curlen); + else + memcpy(destp, srcp, curlen); + flush_dcache_page(rqst->rq_rcv_buf.pages[i]); + kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); + srcp += curlen; + copy_len -= curlen; + if (copy_len == 0) + break; + } + rqst->rq_rcv_buf.page_len = olen - copy_len; + } else + rqst->rq_rcv_buf.page_len = 0; + + if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { + curlen = copy_len; + if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) + curlen = rqst->rq_rcv_buf.tail[0].iov_len; + if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) + memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); + dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", + __func__, srcp, copy_len, curlen); + rqst->rq_rcv_buf.tail[0].iov_len = curlen; + copy_len -= curlen; ++i; + } else + rqst->rq_rcv_buf.tail[0].iov_len = 0; + + if (copy_len) + dprintk("RPC: %s: %d bytes in" + " %d extra segments (%d lost)\n", + __func__, olen, i, copy_len); + + /* TBD avoid a warning from call_decode() */ + rqst->rq_private_buf = rqst->rq_rcv_buf; +} + +/* + * This function is called when an async event is posted to + * the connection which changes the connection state. All it + * does at this point is mark the connection up/down, the rpc + * timers do the rest. + */ +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + struct rpc_xprt *xprt = ep->rep_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) + xprt_wake_pending_tasks(xprt, 0); + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, ep->rep_connected); + } + spin_unlock_bh(&xprt->transport_lock); +} + +/* + * This function is called when memory window unbind which we are waiting + * for completes. Just use rr_func (zeroed by upcall) to signal completion. + */ +static void +rpcrdma_unbind_func(struct rpcrdma_rep *rep) +{ + wake_up(&rep->rr_unbind); +} + +/* + * Called as a tasklet to do req/reply match and complete a request + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. + */ +void +rpcrdma_reply_handler(struct rpcrdma_rep *rep) +{ + struct rpcrdma_msg *headerp; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct rpc_xprt *xprt = rep->rr_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + u32 *iptr; + int i, rdmalen, status; + + /* Check status. If bad, signal disconnect and return rep to pool */ + if (rep->rr_len == ~0U) { + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + } + if (rep->rr_len < 28) { + dprintk("RPC: %s: short/invalid reply\n", __func__); + goto repost; + } + headerp = (struct rpcrdma_msg *) rep->rr_base; + if (headerp->rm_vers != xdr_one) { + dprintk("RPC: %s: invalid version %d\n", + __func__, ntohl(headerp->rm_vers)); + goto repost; + } + + /* Get XID and try for a match. */ + spin_lock(&xprt->transport_lock); + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); + if (rqst == NULL) { + spin_unlock(&xprt->transport_lock); + dprintk("RPC: %s: reply 0x%p failed " + "to match any request xid 0x%08x len %d\n", + __func__, rep, headerp->rm_xid, rep->rr_len); +repost: + r_xprt->rx_stats.bad_reply_count++; + rep->rr_func = rpcrdma_reply_handler; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + rpcrdma_recv_buffer_put(rep); + + return; + } + + /* get request object */ + req = rpcr_to_rdmar(rqst); + + dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" + " RPC request 0x%p xid 0x%08x\n", + __func__, rep, req, rqst, headerp->rm_xid); + + BUG_ON(!req || req->rl_reply); + + /* from here on, the reply is no longer an orphan */ + req->rl_reply = rep; + + /* check for expected message types */ + /* The order of some of these tests is important. */ + switch (headerp->rm_type) { + case __constant_htonl(RDMA_MSG): + /* never expect read chunks */ + /* never expect reply chunks (two ways to check) */ + /* never expect write chunks without having offered RDMA */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + (headerp->rm_body.rm_chunks[1] == xdr_zero && + headerp->rm_body.rm_chunks[2] != xdr_zero) || + (headerp->rm_body.rm_chunks[1] != xdr_zero && + req->rl_nchunks == 0)) + goto badheader; + if (headerp->rm_body.rm_chunks[1] != xdr_zero) { + /* count any expected write chunks in read reply */ + /* start at write chunk array count */ + iptr = &headerp->rm_body.rm_chunks[2]; + rdmalen = rpcrdma_count_chunks(rep, + req->rl_nchunks, 1, &iptr); + /* check for validity, and no reply chunk after */ + if (rdmalen < 0 || *iptr++ != xdr_zero) + goto badheader; + rep->rr_len -= + ((unsigned char *)iptr - (unsigned char *)headerp); + status = rep->rr_len + rdmalen; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + } else { + /* else ordinary inline */ + iptr = (u32 *)((unsigned char *)headerp + 28); + rep->rr_len -= 28; /*sizeof *headerp;*/ + status = rep->rr_len; + } + /* Fix up the rpc results for upper layer */ + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); + break; + + case __constant_htonl(RDMA_NOMSG): + /* never expect read or write chunks, always reply chunks */ + if (headerp->rm_body.rm_chunks[0] != xdr_zero || + headerp->rm_body.rm_chunks[1] != xdr_zero || + headerp->rm_body.rm_chunks[2] != xdr_one || + req->rl_nchunks == 0) + goto badheader; + iptr = (u32 *)((unsigned char *)headerp + 28); + rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + if (rdmalen < 0) + goto badheader; + r_xprt->rx_stats.total_rdma_reply += rdmalen; + /* Reply chunk buffer already is the reply vector - no fixup. */ + status = rdmalen; + break; + +badheader: + default: + dprintk("%s: invalid rpcrdma reply header (type %d):" + " chunks[012] == %d %d %d" + " expected chunks <= %d\n", + __func__, ntohl(headerp->rm_type), + headerp->rm_body.rm_chunks[0], + headerp->rm_body.rm_chunks[1], + headerp->rm_body.rm_chunks[2], + req->rl_nchunks); + status = -EIO; + r_xprt->rx_stats.bad_reply_count++; + break; + } + + /* If using mw bind, start the deregister process now. */ + /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ + if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_MEMWINDOWS: + for (i = 0; req->rl_nchunks-- > 1;) + i += rpcrdma_deregister_external( + &req->rl_segments[i], r_xprt, NULL); + /* Optionally wait (not here) for unbinds to complete */ + rep->rr_func = rpcrdma_unbind_func; + (void) rpcrdma_deregister_external(&req->rl_segments[i], + r_xprt, rep); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + for (i = 0; req->rl_nchunks--;) + i += rpcrdma_deregister_external(&req->rl_segments[i], + r_xprt, NULL); + break; + default: + break; + } + + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", + __func__, xprt, rqst, status); + xprt_complete_rqst(rqst->rq_task, status); + spin_unlock(&xprt->transport_lock); +} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c new file mode 100644 index 0000000..dc55cc9 --- /dev/null +++ b/net/sunrpc/xprtrdma/transport.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * transport.c + * + * This file contains the top-level implementation of an RPC RDMA + * transport. + * + * Naming convention: functions beginning with xprt_ are part of the + * transport switch. All others are RPC RDMA internal. + */ + +#include +#include +#include + +#include "xprt_rdma.h" + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); +MODULE_AUTHOR("Network Appliance, Inc."); + +/* + * tunables + */ + +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_inline_write_padding; +#if !RPCRDMA_PERSISTENT_REGISTRATION +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? */ +#else +static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; +#endif + +#ifdef RPC_DEBUG + +static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int zero; +static unsigned int max_padding = PAGE_SIZE; +static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; +static unsigned int max_memreg = RPCRDMA_LAST - 1; + +static struct ctl_table_header *sunrpc_table_header; + +static ctl_table xr_tunables_table[] = { + { + .ctl_name = CTL_SLOTTABLE_RDMA, + .procname = "rdma_slot_table_entries", + .data = &xprt_rdma_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .ctl_name = CTL_RDMA_MAXINLINEREAD, + .procname = "rdma_max_inline_read", + .data = &xprt_rdma_max_inline_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = CTL_RDMA_MAXINLINEWRITE, + .procname = "rdma_max_inline_write", + .data = &xprt_rdma_max_inline_write, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = CTL_RDMA_WRITEPADDING, + .procname = "rdma_inline_write_padding", + .data = &xprt_rdma_inline_write_padding, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &max_padding, + }, + { + .ctl_name = CTL_RDMA_MEMREG, + .procname = "rdma_memreg_strategy", + .data = &xprt_rdma_memreg_strategy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_memreg, + .extra2 = &max_memreg, + }, + { + .ctl_name = 0, + }, +}; + +static ctl_table sunrpc_table[] = { + { + .ctl_name = CTL_SUNRPC, + .procname = "sunrpc", + .mode = 0555, + .child = xr_tunables_table + }, + { + .ctl_name = 0, + }, +}; + +#endif + +static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ + +static void +xprt_rdma_format_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr_in *addr = (struct sockaddr_in *) + &rpcx_to_rdmad(xprt).addr; + char *buf; + + buf = kzalloc(20, GFP_KERNEL); + if (buf) + snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) + snprintf(buf, 8, "%u", ntohs(addr->sin_port)); + xprt->address_strings[RPC_DISPLAY_PORT] = buf; + + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; + + buf = kzalloc(48, GFP_KERNEL); + if (buf) + snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), "rdma"); + xprt->address_strings[RPC_DISPLAY_ALL] = buf; + + buf = kzalloc(10, GFP_KERNEL); + if (buf) + snprintf(buf, 10, "%02x%02x%02x%02x", + NIPQUAD(addr->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) + snprintf(buf, 8, "%4hx", ntohs(addr->sin_port)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; + + buf = kzalloc(30, GFP_KERNEL); + if (buf) + snprintf(buf, 30, NIPQUAD_FMT".%u.%u", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port) >> 8, + ntohs(addr->sin_port) & 0xff); + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; + + /* netid */ + xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; +} + +static void +xprt_rdma_free_addresses(struct rpc_xprt *xprt) +{ + kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + kfree(xprt->address_strings[RPC_DISPLAY_ALL]); + kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]); + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]); +} + +static void +xprt_rdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_xprt *r_xprt = + container_of(work, struct rpcrdma_xprt, rdma_connect.work); + struct rpc_xprt *xprt = &r_xprt->xprt; + int rc = 0; + + if (!xprt->shutdown) { + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + goto out; + } + goto out_clear; + +out: + xprt_wake_pending_tasks(xprt, rc); + +out_clear: + dprintk("RPC: %s: exit\n", __func__); + xprt_clear_connecting(xprt); +} + +/* + * xprt_rdma_destroy + * + * Destroy the xprt. + * Free all memory associated with the object, including its own. + * NOTE: none of the *destroy methods free memory for their top-level + * objects, even though they may have allocated it (they do free + * private memory). It's up to the caller to handle it. In this + * case (RDMA transport), all structure memory is inlined with the + * struct rpcrdma_xprt. + */ +static void +xprt_rdma_destroy(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; + + dprintk("RPC: %s: called\n", __func__); + + cancel_delayed_work(&r_xprt->rdma_connect); + flush_scheduled_work(); + + xprt_clear_connected(xprt); + + rpcrdma_buffer_destroy(&r_xprt->rx_buf); + rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", + __func__, rc); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); + + kfree(xprt->slot); + xprt->slot = NULL; + kfree(xprt); + + dprintk("RPC: %s: returning\n", __func__); + + module_put(THIS_MODULE); +} + +/** + * xprt_setup_rdma - Set up transport to use RDMA + * + * @args: rpc transport arguments + */ +static struct rpc_xprt * +xprt_setup_rdma(struct xprt_create *args) +{ + struct rpcrdma_create_data_internal cdata; + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + struct rpcrdma_ep *new_ep; + struct sockaddr_in *sin; + int rc; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL); + if (xprt == NULL) { + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->max_reqs = xprt_rdma_slot_table_entries; + xprt->slot = kcalloc(xprt->max_reqs, + sizeof(struct rpc_rqst), GFP_KERNEL); + if (xprt->slot == NULL) { + kfree(xprt); + dprintk("RPC: %s: couldn't allocate %d slots\n", + __func__, xprt->max_reqs); + return ERR_PTR(-ENOMEM); + } + + /* 60 second timeout, no retries */ + xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ); + xprt->bind_timeout = (60U * HZ); + xprt->connect_timeout = (60U * HZ); + xprt->reestablish_timeout = (5U * HZ); + xprt->idle_timeout = (5U * 60 * HZ); + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; + xprt->ops = &xprt_rdma_procs; + + /* + * Set up RDMA-specific connect data. + */ + + /* Put server RDMA address in local cdata */ + memcpy(&cdata.addr, args->dstaddr, args->addrlen); + + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + + sin = (struct sockaddr_in *)&cdata.addr; + if (ntohs(sin->sin_port) != 0) + xprt_set_bound(xprt); + + dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__, + NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + + /* Set max requests */ + cdata.max_requests = xprt->max_reqs; + + /* Set some length limits */ + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ + + cdata.inline_wsize = xprt_rdma_max_inline_write; + if (cdata.inline_wsize > cdata.wsize) + cdata.inline_wsize = cdata.wsize; + + cdata.inline_rsize = xprt_rdma_max_inline_read; + if (cdata.inline_rsize > cdata.rsize) + cdata.inline_rsize = cdata.rsize; + + cdata.padding = xprt_rdma_inline_write_padding; + + /* + * Create new transport instance, which includes initialized + * o ia + * o endpoint + * o buffers + */ + + new_xprt = rpcx_to_rdmax(xprt); + + rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, + xprt_rdma_memreg_strategy); + if (rc) + goto out1; + + /* + * initialize and create ep + */ + new_xprt->rx_data = cdata; + new_ep = &new_xprt->rx_ep; + new_ep->rep_remote_addr = cdata.addr; + + rc = rpcrdma_ep_create(&new_xprt->rx_ep, + &new_xprt->rx_ia, &new_xprt->rx_data); + if (rc) + goto out2; + + /* + * Allocate pre-registered send and receive buffers for headers and + * any inline data. Also specify any padding which will be provided + * from a preregistered zero buffer. + */ + rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, + &new_xprt->rx_data); + if (rc) + goto out3; + + /* + * Register a callback for connection events. This is necessary because + * connection loss notification is async. We also catch connection loss + * when reaping receives. + */ + INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); + new_ep->rep_func = rpcrdma_conn_func; + new_ep->rep_xprt = xprt; + + xprt_rdma_format_addresses(xprt); + + if (!try_module_get(THIS_MODULE)) + goto out4; + + return xprt; + +out4: + xprt_rdma_free_addresses(xprt); + rc = -EINVAL; +out3: + (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); +out2: + rpcrdma_ia_close(&new_xprt->rx_ia); +out1: + kfree(xprt->slot); + kfree(xprt); + return ERR_PTR(rc); +} + +/* + * Close a connection, during shutdown or timeout/reconnect + */ +static void +xprt_rdma_close(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + dprintk("RPC: %s: closing\n", __func__); + xprt_disconnect(xprt); + (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); +} + +static void +xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) +{ + struct sockaddr_in *sap; + + sap = (struct sockaddr_in *)&xprt->addr; + sap->sin_port = htons(port); + sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; + sap->sin_port = htons(port); + dprintk("RPC: %s: %u\n", __func__, port); +} + +static void +xprt_rdma_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + if (!xprt_test_and_set_connecting(xprt)) { + if (r_xprt->rx_ep.rep_connected != 0) { + /* Reconnect */ + schedule_delayed_work(&r_xprt->rdma_connect, + xprt->reestablish_timeout); + } else { + schedule_delayed_work(&r_xprt->rdma_connect, 0); + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); + } + } +} + +static int +xprt_rdma_reserve_xprt(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int credits = atomic_read(&r_xprt->rx_buf.rb_credits); + + /* == RPC_CWNDSCALE @ init, but *after* setup */ + if (r_xprt->rx_buf.rb_cwndscale == 0UL) { + r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; + dprintk("RPC: %s: cwndscale %lu\n", __func__, + r_xprt->rx_buf.rb_cwndscale); + BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); + } + xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; + return xprt_reserve_xprt_cong(task); +} + +/* + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual send/recv + * sequence. For this reason, the recv buffers are attached to send + * buffers for portions of the RPC. Note that the RPC layer allocates + * both send and receive buffers in the same call. We may register + * the receive buffer portion when using reply chunks. + */ +static void * +xprt_rdma_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_req *req, *nreq; + + req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); + BUG_ON(NULL == req); + + if (size > req->rl_size) { + dprintk("RPC: %s: size %zd too large for buffer[%zd]: " + "prog %d vers %d proc %d\n", + __func__, size, req->rl_size, + task->tk_client->cl_prog, task->tk_client->cl_vers, + task->tk_msg.rpc_proc->p_proc); + /* + * Outgoing length shortage. Our inline write max must have + * been configured to perform direct i/o. + * + * This is therefore a large metadata operation, and the + * allocate call was made on the maximum possible message, + * e.g. containing long filename(s) or symlink data. In + * fact, while these metadata operations *might* carry + * large outgoing payloads, they rarely *do*. However, we + * have to commit to the request here, so reallocate and + * register it now. The data path will never require this + * reallocation. + * + * If the allocation or registration fails, the RPC framework + * will (doggedly) retry. + */ + if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == + RPCRDMA_BOUNCEBUFFERS) { + /* forced to "pure inline" */ + dprintk("RPC: %s: too much data (%zd) for inline " + "(r/w max %d/%d)\n", __func__, size, + rpcx_to_rdmad(xprt).inline_rsize, + rpcx_to_rdmad(xprt).inline_wsize); + size = req->rl_size; + rpc_exit(task, -EIO); /* fail the operation */ + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + goto out; + } + if (task->tk_flags & RPC_TASK_SWAPPER) + nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); + else + nreq = kmalloc(sizeof *req + size, GFP_NOFS); + if (nreq == NULL) + goto outfail; + + if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, + nreq->rl_base, size + sizeof(struct rpcrdma_req) + - offsetof(struct rpcrdma_req, rl_base), + &nreq->rl_handle, &nreq->rl_iov)) { + kfree(nreq); + goto outfail; + } + rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; + nreq->rl_size = size; + nreq->rl_niovs = 0; + nreq->rl_nchunks = 0; + nreq->rl_buffer = (struct rpcrdma_buffer *)req; + nreq->rl_reply = req->rl_reply; + memcpy(nreq->rl_segments, + req->rl_segments, sizeof nreq->rl_segments); + /* flag the swap with an unused field */ + nreq->rl_iov.length = 0; + req->rl_reply = NULL; + req = nreq; + } + dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); +out: + return req->rl_xdr_buf; + +outfail: + rpcrdma_buffer_put(req); + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + return NULL; +} + +/* + * This function returns all RDMA resources to the pool. + */ +static void +xprt_rdma_free(void *buffer) +{ + struct rpcrdma_req *req; + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_rep *rep; + int i; + + if (buffer == NULL) + return; + + req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); + rep = req->rl_reply; + + dprintk("RPC: %s: called on 0x%p%s\n", + __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); + + /* + * Finish the deregistration. When using mw bind, this was + * begun in rpcrdma_reply_handler(). In all other modes, we + * do it here, in thread context. The process is considered + * complete when the rr_func vector becomes NULL - this + * was put in place during rpcrdma_reply_handler() - the wait + * call below will not block if the dereg is "done". If + * interrupted, our framework will clean up. + */ + for (i = 0; req->rl_nchunks;) { + --req->rl_nchunks; + i += rpcrdma_deregister_external( + &req->rl_segments[i], r_xprt, NULL); + } + + if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { + rep->rr_func = NULL; /* abandon the callback */ + req->rl_reply = NULL; + } + + if (req->rl_iov.length == 0) { /* see allocate above */ + struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; + oreq->rl_reply = req->rl_reply; + (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, + req->rl_handle, + &req->rl_iov); + kfree(req); + req = oreq; + } + + /* Put back request+reply buffers */ + rpcrdma_buffer_put(req); +} + +/* + * send_request invokes the meat of RPC RDMA. It must do the following: + * 1. Marshal the RPC request into an RPC RDMA request, which means + * putting a header in front of data, and creating IOVs for RDMA + * from those in the request. + * 2. In marshaling, detect opportunities for RDMA, and use them. + * 3. Post a recv message to set up asynch completion, then send + * the request (rpcrdma_ep_post). + * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). + */ + +static int +xprt_rdma_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + /* marshal the send itself */ + if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { + r_xprt->rx_stats.failed_marshal_count++; + dprintk("RPC: %s: rpcrdma_marshal_req failed\n", + __func__); + return -EIO; + } + + if (req->rl_reply == NULL) /* e.g. reconnection */ + rpcrdma_recv_buffer_get(req); + + if (req->rl_reply) { + req->rl_reply->rr_func = rpcrdma_reply_handler; + /* this need only be done once, but... */ + req->rl_reply->rr_xprt = xprt; + } + + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { + xprt_disconnect(xprt); + return -ENOTCONN; /* implies disconnect */ + } + + rqst->rq_bytes_sent = 0; + return 0; +} + +static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, + "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " + "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", + + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u, + + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count); +} + +/* + * Plumbing for rpc transport switch and kernel module + */ + +static struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_rdma_reserve_xprt, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, + .connect = xprt_rdma_connect, + .buf_alloc = xprt_rdma_allocate, + .buf_free = xprt_rdma_free, + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, + .print_stats = xprt_rdma_print_stats +}; + +static struct xprt_class xprt_rdma = { + .list = LIST_HEAD_INIT(xprt_rdma.list), + .name = "rdma", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, +}; + +static void __exit xprt_rdma_cleanup(void) +{ + int rc; + + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); +#ifdef RPC_DEBUG + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + rc = xprt_unregister_transport(&xprt_rdma); + if (rc) + dprintk("RPC: %s: xprt_unregister returned %i\n", + __func__, rc); +} + +static int __init xprt_rdma_init(void) +{ + int rc; + + rc = xprt_register_transport(&xprt_rdma); + + if (rc) + return rc; + + dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); + + dprintk(KERN_INFO "Defaults:\n"); + dprintk(KERN_INFO "\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); + dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", + xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + +#ifdef RPC_DEBUG + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + return 0; +} + +module_init(xprt_rdma_init); +module_exit(xprt_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c new file mode 100644 index 0000000..9ec8ca4 --- /dev/null +++ b/net/sunrpc/xprtrdma/verbs.c @@ -0,0 +1,1626 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * verbs.c + * + * Encapsulates the major functions managing: + * o adapters + * o endpoints + * o connections + * o buffer memory + */ + +#include /* for Tavor hack below */ + +#include "xprt_rdma.h" + +/* + * Globals/Macros + */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * internal functions + */ + +/* + * handle replies in tasklet context, using a single, global list + * rdma tasklet function -- just turn around and call the func + * for all replies on the list + */ + +static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); +static LIST_HEAD(rpcrdma_tasklets_g); + +static void +rpcrdma_run_tasklet(unsigned long data) +{ + struct rpcrdma_rep *rep; + void (*func)(struct rpcrdma_rep *); + unsigned long flags; + + data = data; + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + while (!list_empty(&rpcrdma_tasklets_g)) { + rep = list_entry(rpcrdma_tasklets_g.next, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + func = rep->rr_func; + rep->rr_func = NULL; + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + + if (func) + func(rep); + else + rpcrdma_recv_buffer_put(rep); + + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + } + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); +} + +static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); + +static inline void +rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) +{ + unsigned long flags; + + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); +} + +static void +rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: QP error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +static void +rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + + dprintk("RPC: %s: CQ error %X on device %s ep %p\n", + __func__, event->event, event->device->name, context); + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +static inline +void rpcrdma_event_process(struct ib_wc *wc) +{ + struct rpcrdma_rep *rep = + (struct rpcrdma_rep *)(unsigned long) wc->wr_id; + + dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + + if (!rep) /* send or bind completion that we don't care about */ + return; + + if (IB_WC_SUCCESS != wc->status) { + dprintk("RPC: %s: %s WC status %X, connection lost\n", + __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send", + wc->status); + rep->rr_len = ~0U; + rpcrdma_schedule_tasklet(rep); + return; + } + + switch (wc->opcode) { + case IB_WC_RECV: + rep->rr_len = wc->byte_len; + ib_dma_sync_single_for_cpu( + rdmab_to_ia(rep->rr_buffer)->ri_id->device, + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); + /* Keep (only) the most recent credits, after check validity */ + if (rep->rr_len >= 16) { + struct rpcrdma_msg *p = + (struct rpcrdma_msg *) rep->rr_base; + unsigned int credits = ntohl(p->rm_credit); + if (credits == 0) { + dprintk("RPC: %s: server" + " dropped credits to 0!\n", __func__); + /* don't deadlock */ + credits = 1; + } else if (credits > rep->rr_buffer->rb_max_requests) { + dprintk("RPC: %s: server" + " over-crediting: %d (%d)\n", + __func__, credits, + rep->rr_buffer->rb_max_requests); + credits = rep->rr_buffer->rb_max_requests; + } + atomic_set(&rep->rr_buffer->rb_credits, credits); + } + /* fall through */ + case IB_WC_BIND_MW: + rpcrdma_schedule_tasklet(rep); + break; + default: + dprintk("RPC: %s: unexpected WC event %X\n", + __func__, wc->opcode); + break; + } +} + +static inline int +rpcrdma_cq_poll(struct ib_cq *cq) +{ + struct ib_wc wc; + int rc; + + for (;;) { + rc = ib_poll_cq(cq, 1, &wc); + if (rc < 0) { + dprintk("RPC: %s: ib_poll_cq failed %i\n", + __func__, rc); + return rc; + } + if (rc == 0) + break; + + rpcrdma_event_process(&wc); + } + + return 0; +} + +/* + * rpcrdma_cq_event_upcall + * + * This upcall handles recv, send, bind and unbind events. + * It is reentrant but processes single events in order to maintain + * ordering of receives to keep server credits. + * + * It is the responsibility of the scheduled tasklet to return + * recv buffers to the pool. NOTE: this affects synchronization of + * connection shutdown. That is, the structures required for + * the completion of the reply handler must remain intact until + * all memory has been reclaimed. + * + * Note that send events are suppressed and do not result in an upcall. + */ +static void +rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) +{ + int rc; + + rc = rpcrdma_cq_poll(cq); + if (rc) + return; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed %i\n", + __func__, rc); + return; + } + + rpcrdma_cq_poll(cq); +} + +#ifdef RPC_DEBUG +static const char * const conn[] = { + "address resolved", + "address error", + "route resolved", + "route error", + "connect request", + "connect response", + "connect error", + "unreachable", + "rejected", + "established", + "disconnected", + "device removal" +}; +#endif + +static int +rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rpcrdma_xprt *xprt = id->context; + struct rpcrdma_ia *ia = &xprt->rx_ia; + struct rpcrdma_ep *ep = &xprt->rx_ep; + struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct ib_qp_attr attr; + struct ib_qp_init_attr iattr; + int connstate = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + ia->ri_async_rc = -EHOSTUNREACH; + dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + ia->ri_async_rc = -ENETUNREACH; + dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", + __func__, ep); + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ESTABLISHED: + connstate = 1; + ib_query_qp(ia->ri_id->qp, &attr, + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, + &iattr); + dprintk("RPC: %s: %d responder resources" + " (%d initiator)\n", + __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); + goto connected; + case RDMA_CM_EVENT_CONNECT_ERROR: + connstate = -ENOTCONN; + goto connected; + case RDMA_CM_EVENT_UNREACHABLE: + connstate = -ENETDOWN; + goto connected; + case RDMA_CM_EVENT_REJECTED: + connstate = -ECONNREFUSED; + goto connected; + case RDMA_CM_EVENT_DISCONNECTED: + connstate = -ECONNABORTED; + goto connected; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + connstate = -ENODEV; +connected: + dprintk("RPC: %s: %s: %u.%u.%u.%u:%u" + " (ep 0x%p event 0x%x)\n", + __func__, + (event->event <= 11) ? conn[event->event] : + "unknown connection error", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + ep, event->event); + atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); + dprintk("RPC: %s: %sconnected\n", + __func__, connstate > 0 ? "" : "dis"); + ep->rep_connected = connstate; + ep->rep_func(ep); + wake_up_all(&ep->rep_connect_wait); + break; + default: + ia->ri_async_rc = -EINVAL; + dprintk("RPC: %s: unexpected CM event %X\n", + __func__, event->event); + complete(&ia->ri_done); + break; + } + + return 0; +} + +static struct rdma_cm_id * +rpcrdma_create_id(struct rpcrdma_xprt *xprt, + struct rpcrdma_ia *ia, struct sockaddr *addr) +{ + struct rdma_cm_id *id; + int rc; + + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + dprintk("RPC: %s: rdma_create_id() failed %i\n", + __func__, rc); + return id; + } + + ia->ri_async_rc = 0; + rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion(&ia->ri_done); + rc = ia->ri_async_rc; + if (rc) + goto out; + + ia->ri_async_rc = 0; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", + __func__, rc); + goto out; + } + wait_for_completion(&ia->ri_done); + rc = ia->ri_async_rc; + if (rc) + goto out; + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Drain any cq, prior to teardown. + */ +static void +rpcrdma_clean_cq(struct ib_cq *cq) +{ + struct ib_wc wc; + int count = 0; + + while (1 == ib_poll_cq(cq, 1, &wc)) + ++count; + + if (count) + dprintk("RPC: %s: flushed %d events (last 0x%x)\n", + __func__, count, wc.opcode); +} + +/* + * Exported functions. + */ + +/* + * Open and initialize an Interface Adapter. + * o initializes fields of struct rpcrdma_ia, including + * interface and provider attributes and protection zone. + */ +int +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +{ + int rc; + struct rpcrdma_ia *ia = &xprt->rx_ia; + + init_completion(&ia->ri_done); + + ia->ri_id = rpcrdma_create_id(xprt, ia, addr); + if (IS_ERR(ia->ri_id)) { + rc = PTR_ERR(ia->ri_id); + goto out1; + } + + ia->ri_pd = ib_alloc_pd(ia->ri_id->device); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + dprintk("RPC: %s: ib_alloc_pd() failed %i\n", + __func__, rc); + goto out2; + } + + /* + * Optionally obtain an underlying physical identity mapping in + * order to do a memory window-based bind. This base registration + * is protected from remote access - that is enabled only by binding + * for the specific bytes targeted during each RPC operation, and + * revoked after the corresponding completion similar to a storage + * adapter. + */ + if (memreg > RPCRDMA_REGISTER) { + int mem_priv = IB_ACCESS_LOCAL_WRITE; + switch (memreg) { +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + mem_priv |= IB_ACCESS_REMOTE_WRITE; + mem_priv |= IB_ACCESS_REMOTE_READ; + break; +#endif + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + mem_priv |= IB_ACCESS_MW_BIND; + break; + default: + break; + } + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); + if (IS_ERR(ia->ri_bind_mem)) { + printk(KERN_ALERT "%s: ib_get_dma_mr for " + "phys register failed with %lX\n\t" + "Will continue with degraded performance\n", + __func__, PTR_ERR(ia->ri_bind_mem)); + memreg = RPCRDMA_REGISTER; + ia->ri_bind_mem = NULL; + } + } + + /* Else will do memory reg/dereg for each chunk */ + ia->ri_memreg_strategy = memreg; + + return 0; +out2: + rdma_destroy_id(ia->ri_id); +out1: + return rc; +} + +/* + * Clean up/close an IA. + * o if event handles and PD have been initialized, free them. + * o close the IA + */ +void +rpcrdma_ia_close(struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering\n", __func__); + if (ia->ri_bind_mem != NULL) { + rc = ib_dereg_mr(ia->ri_bind_mem); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { + rc = ib_dealloc_pd(ia->ri_pd); + dprintk("RPC: %s: ib_dealloc_pd returned %i\n", + __func__, rc); + } + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) + rdma_destroy_id(ia->ri_id); +} + +/* + * Create unconnected endpoint. + */ +int +rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr devattr; + int rc; + + rc = ib_query_device(ia->ri_id->device, &devattr); + if (rc) { + dprintk("RPC: %s: ib_query_device failed %d\n", + __func__, rc); + return rc; + } + + /* check provider's send/recv wr limits */ + if (cdata->max_requests > devattr.max_qp_wr) + cdata->max_requests = devattr.max_qp_wr; + + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.qp_context = ep; + /* send_cq and recv_cq initialized below */ + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* Add room for mw_binds+unbinds - overkill! */ + ep->rep_attr.cap.max_send_wr++; + ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) + return -EINVAL; + break; + default: + break; + } + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_recv_sge = 1; + ep->rep_attr.cap.max_inline_data = 0; + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->rep_attr.qp_type = IB_QPT_RC; + ep->rep_attr.port_num = ~0; + + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " + "iovs: send %d recv %d\n", + __func__, + ep->rep_attr.cap.max_send_wr, + ep->rep_attr.cap.max_recv_wr, + ep->rep_attr.cap.max_send_sge, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + ep->rep_cqinit -= RPCRDMA_MAX_SEGS; + break; + default: + break; + } + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; + INIT_CQCOUNT(ep); + ep->rep_ia = ia; + init_waitqueue_head(&ep->rep_connect_wait); + + /* + * Create a single cq for receive dto and mw_bind (only ever + * care about unbind, really). Send completions are suppressed. + * Use single threaded tasklet upcalls to maintain ordering. + */ + ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, + rpcrdma_cq_async_error_upcall, NULL, + ep->rep_attr.cap.max_recv_wr + + ep->rep_attr.cap.max_send_wr + 1, 0); + if (IS_ERR(ep->rep_cq)) { + rc = PTR_ERR(ep->rep_cq); + dprintk("RPC: %s: ib_create_cq failed: %i\n", + __func__, rc); + goto out1; + } + + rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + + ep->rep_attr.send_cq = ep->rep_cq; + ep->rep_attr.recv_cq = ep->rep_cq; + + /* Initialize cma parameters */ + + /* RPC/RDMA does not use private data */ + ep->rep_remote_cma.private_data = NULL; + ep->rep_remote_cma.private_data_len = 0; + + /* Client offers RDMA Read but does not initiate */ + switch (ia->ri_memreg_strategy) { + case RPCRDMA_BOUNCEBUFFERS: + ep->rep_remote_cma.responder_resources = 0; + break; + case RPCRDMA_MTHCAFMR: + case RPCRDMA_REGISTER: + ep->rep_remote_cma.responder_resources = cdata->max_requests * + (RPCRDMA_MAX_DATA_SEGS / 8); + break; + case RPCRDMA_MEMWINDOWS: + case RPCRDMA_MEMWINDOWS_ASYNC: +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: +#endif + ep->rep_remote_cma.responder_resources = cdata->max_requests * + (RPCRDMA_MAX_DATA_SEGS / 2); + break; + default: + break; + } + if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) + ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; + ep->rep_remote_cma.initiator_depth = 0; + + ep->rep_remote_cma.retry_count = 7; + ep->rep_remote_cma.flow_control = 0; + ep->rep_remote_cma.rnr_retry_count = 0; + + return 0; + +out2: + if (ib_destroy_cq(ep->rep_cq)) + ; +out1: + return rc; +} + +/* + * rpcrdma_ep_destroy + * + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. + * + * The caller's error handling must be sure to not leak the endpoint + * if this function fails. + */ +int +rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + dprintk("RPC: %s: entering, connected is %d\n", + __func__, ep->rep_connected); + + if (ia->ri_id->qp) { + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc) + dprintk("RPC: %s: rpcrdma_ep_disconnect" + " returned %i\n", __func__, rc); + } + + ep->rep_func = NULL; + + /* padding - could be done in rpcrdma_buffer_destroy... */ + if (ep->rep_pad_mr) { + rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); + ep->rep_pad_mr = NULL; + } + + if (ia->ri_id->qp) { + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + + rpcrdma_clean_cq(ep->rep_cq); + rc = ib_destroy_cq(ep->rep_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); + + return rc; +} + +/* + * Connect unconnected endpoint. + */ +int +rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct rdma_cm_id *id; + int rc = 0; + int retry_count = 0; + int reconnect = (ep->rep_connected != 0); + + if (reconnect) { + struct rpcrdma_xprt *xprt; +retry: + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc && rc != -ENOTCONN) + dprintk("RPC: %s: rpcrdma_ep_disconnect" + " status %i\n", __func__, rc); + rpcrdma_clean_cq(ep->rep_cq); + + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + id = rpcrdma_create_id(xprt, ia, + (struct sockaddr *)&xprt->rx_data.addr); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + goto out; + } + /* TEMP TEMP TEMP - fail if new device: + * Deregister/remarshal *all* requests! + * Close and recreate adapter, pd, etc! + * Re-determine all attributes still sane! + * More stuff I haven't thought of! + * Rrrgh! + */ + if (ia->ri_id->device != id->device) { + printk("RPC: %s: can't reconnect on " + "different device!\n", __func__); + rdma_destroy_id(id); + rc = -ENETDOWN; + goto out; + } + /* END TEMP */ + rdma_destroy_id(ia->ri_id); + ia->ri_id = id; + } + + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + goto out; + } + +/* XXX Tavor device performs badly with 2K MTU! */ +if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { + struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); + if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && + (pcid->vendor == PCI_VENDOR_ID_MELLANOX || + pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { + struct ib_qp_attr attr = { + .path_mtu = IB_MTU_1024 + }; + rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); + } +} + + /* Theoretically a client initiator_depth > 0 is not needed, + * but many peers fail to complete the connection unless they + * == responder_resources! */ + if (ep->rep_remote_cma.initiator_depth != + ep->rep_remote_cma.responder_resources) + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; + + ep->rep_connected = 0; + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + if (rc) { + dprintk("RPC: %s: rdma_connect() failed with %i\n", + __func__, rc); + goto out; + } + + if (reconnect) + return 0; + + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); + + /* + * Check state. A non-peer reject indicates no listener + * (ECONNREFUSED), which may be a transient state. All + * others indicate a transport condition which has already + * undergone a best-effort. + */ + if (ep->rep_connected == -ECONNREFUSED + && ++retry_count <= RDMA_CONNECT_RETRY_MAX) { + dprintk("RPC: %s: non-peer_reject, retry\n", __func__); + goto retry; + } + if (ep->rep_connected <= 0) { + /* Sometimes, the only way to reliably connect to remote + * CMs is to use same nonzero values for ORD and IRD. */ + ep->rep_remote_cma.initiator_depth = + ep->rep_remote_cma.responder_resources; + if (ep->rep_remote_cma.initiator_depth == 0) + ++ep->rep_remote_cma.initiator_depth; + if (ep->rep_remote_cma.responder_resources == 0) + ++ep->rep_remote_cma.responder_resources; + if (retry_count++ == 0) + goto retry; + rc = ep->rep_connected; + } else { + dprintk("RPC: %s: connected\n", __func__); + } + +out: + if (rc) + ep->rep_connected = rc; + return rc; +} + +/* + * rpcrdma_ep_disconnect + * + * This is separate from destroy to facilitate the ability + * to reconnect without recreating the endpoint. + * + * This call is not reentrant, and must not be made in parallel + * on the same endpoint. + */ +int +rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + rpcrdma_clean_cq(ep->rep_cq); + rc = rdma_disconnect(ia->ri_id); + if (!rc) { + /* returns without wait if not connected */ + wait_event_interruptible(ep->rep_connect_wait, + ep->rep_connected != 1); + dprintk("RPC: %s: after wait, %sconnected\n", __func__, + (ep->rep_connected == 1) ? "still " : "dis"); + } else { + dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); + ep->rep_connected = rc; + } + return rc; +} + +/* + * Initialize buffer memory + */ +int +rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) +{ + char *p; + size_t len; + int i, rc; + + buf->rb_max_requests = cdata->max_requests; + spin_lock_init(&buf->rb_lock); + atomic_set(&buf->rb_credits, 1); + + /* Need to allocate: + * 1. arrays for send and recv pointers + * 2. arrays of struct rpcrdma_req to fill in pointers + * 3. array of struct rpcrdma_rep for replies + * 4. padding, if any + * 5. mw's, if any + * Send/recv buffers in req/rep need to be registered + */ + + len = buf->rb_max_requests * + (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); + len += cdata->padding; + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MTHCAFMR: + /* TBD we are perhaps overallocating here */ + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; + default: + break; + } + + /* allocate 1, 4 and 5 in one shot */ + p = kzalloc(len, GFP_KERNEL); + if (p == NULL) { + dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", + __func__, len); + rc = -ENOMEM; + goto out; + } + buf->rb_pool = p; /* for freeing it later */ + + buf->rb_send_bufs = (struct rpcrdma_req **) p; + p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; + buf->rb_recv_bufs = (struct rpcrdma_rep **) p; + p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; + + /* + * Register the zeroed pad buffer, if any. + */ + if (cdata->padding) { + rc = rpcrdma_register_internal(ia, p, cdata->padding, + &ep->rep_pad_mr, &ep->rep_pad); + if (rc) + goto out; + } + p += cdata->padding; + + /* + * Allocate the fmr's, or mw's for mw_bind chunk registration. + * We "cycle" the mw's in order to minimize rkey reuse, + * and also reduce unbind-to-bind collision. + */ + INIT_LIST_HEAD(&buf->rb_mws); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MTHCAFMR: + { + struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; + struct ib_fmr_attr fa = { + RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT + }; + /* TBD we are perhaps overallocating here */ + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + r->r.fmr = ib_alloc_fmr(ia->ri_pd, + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, + &fa); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr" + " failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + } + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + { + struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; + /* Allocate one extra request's worth, for full cycling */ + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { + r->r.mw = ib_alloc_mw(ia->ri_pd); + if (IS_ERR(r->r.mw)) { + rc = PTR_ERR(r->r.mw); + dprintk("RPC: %s: ib_alloc_mw" + " failed %i\n", __func__, rc); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); + ++r; + } + } + break; + default: + break; + } + + /* + * Allocate/init the request/reply buffers. Doing this + * using kmalloc for now -- one for each buf. + */ + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + len = cdata->inline_wsize + sizeof(struct rpcrdma_req); + /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */ + /* Typical ~2400b, so rounding up saves work later */ + if (len < 4096) + len = 4096; + req = kmalloc(len, GFP_KERNEL); + if (req == NULL) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); + rc = -ENOMEM; + goto out; + } + memset(req, 0, sizeof(struct rpcrdma_req)); + buf->rb_send_bufs[i] = req; + buf->rb_send_bufs[i]->rl_buffer = buf; + + rc = rpcrdma_register_internal(ia, req->rl_base, + len - offsetof(struct rpcrdma_req, rl_base), + &buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + if (rc) + goto out; + + buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); + + len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); + rep = kmalloc(len, GFP_KERNEL); + if (rep == NULL) { + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, i); + rc = -ENOMEM; + goto out; + } + memset(rep, 0, sizeof(struct rpcrdma_rep)); + buf->rb_recv_bufs[i] = rep; + buf->rb_recv_bufs[i]->rr_buffer = buf; + init_waitqueue_head(&rep->rr_unbind); + + rc = rpcrdma_register_internal(ia, rep->rr_base, + len - offsetof(struct rpcrdma_rep, rr_base), + &buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + if (rc) + goto out; + + } + dprintk("RPC: %s: max_requests %d\n", + __func__, buf->rb_max_requests); + /* done */ + return 0; +out: + rpcrdma_buffer_destroy(buf); + return rc; +} + +/* + * Unregister and destroy buffer memory. Need to deal with + * partial initialization, so it's callable from failed create. + * Must be called before destroying endpoint, as registrations + * reference it. + */ +void +rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) +{ + int rc, i; + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + + /* clean up in reverse order from create + * 1. recv mr memory (mr free, then kfree) + * 1a. bind mw memory + * 2. send mr memory (mr free, then kfree) + * 3. padding (if any) [moved to rpcrdma_ep_destroy] + * 4. arrays + */ + dprintk("RPC: %s: entering\n", __func__); + + for (i = 0; i < buf->rb_max_requests; i++) { + if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { + rpcrdma_deregister_internal(ia, + buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + kfree(buf->rb_recv_bufs[i]); + } + if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { + while (!list_empty(&buf->rb_mws)) { + struct rpcrdma_mw *r; + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MTHCAFMR: + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_fmr" + " failed %i\n", + __func__, rc); + break; + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + rc = ib_dealloc_mw(r->r.mw); + if (rc) + dprintk("RPC: %s:" + " ib_dealloc_mw" + " failed %i\n", + __func__, rc); + break; + default: + break; + } + } + rpcrdma_deregister_internal(ia, + buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + kfree(buf->rb_send_bufs[i]); + } + } + + kfree(buf->rb_pool); +} + +/* + * Get a set of request/reply buffers. + * + * Reply buffer (if needed) is attached to send buffer upon return. + * Rule: + * rb_send_index and rb_recv_index MUST always be pointing to the + * *next* available buffer (non-NULL). They are incremented after + * removing buffers, and decremented *before* returning them. + */ +struct rpcrdma_req * +rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) +{ + struct rpcrdma_req *req; + unsigned long flags; + + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_send_index == buffers->rb_max_requests) { + spin_unlock_irqrestore(&buffers->rb_lock, flags); + dprintk("RPC: %s: out of request buffers\n", __func__); + return ((struct rpcrdma_req *)NULL); + } + + req = buffers->rb_send_bufs[buffers->rb_send_index]; + if (buffers->rb_send_index < buffers->rb_recv_index) { + dprintk("RPC: %s: %d extra receives outstanding (ok)\n", + __func__, + buffers->rb_recv_index - buffers->rb_send_index); + req->rl_reply = NULL; + } else { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + if (!list_empty(&buffers->rb_mws)) { + int i = RPCRDMA_MAX_SEGS - 1; + do { + struct rpcrdma_mw *r; + r = list_entry(buffers->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + } while (--i >= 0); + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); + return req; +} + +/* + * Put request/reply buffers back into pool. + * Pre-decrement counter/array index. + */ +void +rpcrdma_buffer_put(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + int i; + unsigned long flags; + + BUG_ON(req->rl_nchunks != 0); + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_send_bufs[--buffers->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; + init_waitqueue_head(&req->rl_reply->rr_unbind); + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_MTHCAFMR: + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* + * Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ + i = 1; + do { + struct rpcrdma_mw **mw; + mw = &req->rl_segments[i].mr_chunk.rl_mw; + list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); + *mw = NULL; + } while (++i < RPCRDMA_MAX_SEGS); + list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, + &buffers->rb_mws); + req->rl_segments[0].mr_chunk.rl_mw = NULL; + break; + default: + break; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Recover reply buffers from pool. + * This happens when recovering from error conditions. + * Post-increment counter/array index. + */ +void +rpcrdma_recv_buffer_get(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + unsigned long flags; + + if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ + buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; + spin_lock_irqsave(&buffers->rb_lock, flags); + if (buffers->rb_recv_index < buffers->rb_max_requests) { + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; + } + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Put reply buffers back into pool when not attached to + * request. This happens in error conditions, and when + * aborting unbinds. Pre-decrement counter/array index. + */ +void +rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buffers = rep->rr_buffer; + unsigned long flags; + + rep->rr_func = NULL; + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; + spin_unlock_irqrestore(&buffers->rb_lock, flags); +} + +/* + * Wrappers for internal-use kmalloc memory registration, used by buffer code. + */ + +int +rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, + struct ib_mr **mrp, struct ib_sge *iov) +{ + struct ib_phys_buf ipb; + struct ib_mr *mr; + int rc; + + /* + * All memory passed here was kmalloc'ed, therefore phys-contiguous. + */ + iov->addr = ib_dma_map_single(ia->ri_id->device, + va, len, DMA_BIDIRECTIONAL); + iov->length = len; + + if (ia->ri_bind_mem != NULL) { + *mrp = NULL; + iov->lkey = ia->ri_bind_mem->lkey; + return 0; + } + + ipb.addr = iov->addr; + ipb.size = iov->length; + mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, + IB_ACCESS_LOCAL_WRITE, &iov->addr); + + dprintk("RPC: %s: phys convert: 0x%llx " + "registered 0x%llx length %d\n", + __func__, ipb.addr, iov->addr, len); + + if (IS_ERR(mr)) { + *mrp = NULL; + rc = PTR_ERR(mr); + dprintk("RPC: %s: failed with %i\n", __func__, rc); + } else { + *mrp = mr; + iov->lkey = mr->lkey; + rc = 0; + } + + return rc; +} + +int +rpcrdma_deregister_internal(struct rpcrdma_ia *ia, + struct ib_mr *mr, struct ib_sge *iov) +{ + int rc; + + ib_dma_unmap_single(ia->ri_id->device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + + if (NULL == mr) + return 0; + + rc = ib_dereg_mr(mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); + return rc; +} + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +static void +rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) +{ + seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + seg->mr_dmalen = seg->mr_len; + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(ia->ri_id->device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(ia->ri_id->device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); +} + +static void +rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(ia->ri_id->device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + +int +rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + int nsegs, int writing, struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : + IB_ACCESS_REMOTE_READ); + struct rpcrdma_mr_seg *seg1 = seg; + int i; + int rc = 0; + + switch (ia->ri_memreg_strategy) { + +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + rpcrdma_map_one(ia, seg, writing); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + nsegs = 1; + break; +#endif + + /* Registration using fast memory registration */ + case RPCRDMA_MTHCAFMR: + { + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff = offset_in_page(seg->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_DATA_SEGS) + nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + nsegs = i; + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, + physaddrs, nsegs, seg1->mr_dma); + if (rc) { + dprintk("RPC: %s: failed ib_map_phys_fmr " + "%u@0x%llx+%i (%d)... status %i\n", __func__, + len, (unsigned long long)seg1->mr_dma, + pageoff, nsegs, rc); + while (nsegs--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = nsegs; + seg1->mr_len = len; + } + } + break; + + /* Registration using memory windows */ + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + { + struct ib_mw_bind param; + rpcrdma_map_one(ia, seg, writing); + param.mr = ia->ri_bind_mem; + param.wr_id = 0ULL; /* no send cookie */ + param.addr = seg->mr_dma; + param.length = seg->mr_len; + param.send_flags = 0; + param.mw_access_flags = mem_priv; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_bind_mw(ia->ri_id->qp, + seg->mr_chunk.rl_mw->r.mw, ¶m); + if (rc) { + dprintk("RPC: %s: failed ib_bind_mw " + "%u@0x%llx status %i\n", + __func__, seg->mr_len, + (unsigned long long)seg->mr_dma, rc); + rpcrdma_unmap_one(ia, seg); + } else { + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; + seg->mr_base = param.addr; + seg->mr_nsegs = 1; + nsegs = 1; + } + } + break; + + /* Default registration each time */ + default: + { + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; + int len = 0; + if (nsegs > RPCRDMA_MAX_DATA_SEGS) + nsegs = RPCRDMA_MAX_DATA_SEGS; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(ia, seg, writing); + ipb[i].addr = seg->mr_dma; + ipb[i].size = seg->mr_len; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) + break; + } + nsegs = i; + seg1->mr_base = seg1->mr_dma; + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, + ipb, nsegs, mem_priv, &seg1->mr_base); + if (IS_ERR(seg1->mr_chunk.rl_mr)) { + rc = PTR_ERR(seg1->mr_chunk.rl_mr); + dprintk("RPC: %s: failed ib_reg_phys_mr " + "%u@0x%llx (%d)... status %i\n", + __func__, len, + (unsigned long long)seg1->mr_dma, nsegs, rc); + while (nsegs--) + rpcrdma_unmap_one(ia, --seg); + } else { + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; + seg1->mr_nsegs = nsegs; + seg1->mr_len = len; + } + } + break; + } + if (rc) + return -1; + + return nsegs; +} + +int +rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + struct rpcrdma_xprt *r_xprt, void *r) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + int nsegs = seg->mr_nsegs, rc; + + switch (ia->ri_memreg_strategy) { + +#if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: + BUG_ON(nsegs != 1); + rpcrdma_unmap_one(ia, seg); + rc = 0; + break; +#endif + + case RPCRDMA_MTHCAFMR: + { + LIST_HEAD(l); + list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + } + if (rc) + dprintk("RPC: %s: failed ib_unmap_fmr," + " status %i\n", __func__, rc); + break; + + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + { + struct ib_mw_bind param; + BUG_ON(nsegs != 1); + param.mr = ia->ri_bind_mem; + param.addr = 0ULL; /* unbind */ + param.length = 0; + param.mw_access_flags = 0; + if (r) { + param.wr_id = (u64) (unsigned long) r; + param.send_flags = IB_SEND_SIGNALED; + INIT_CQCOUNT(&r_xprt->rx_ep); + } else { + param.wr_id = 0ULL; + param.send_flags = 0; + DECR_CQCOUNT(&r_xprt->rx_ep); + } + rc = ib_bind_mw(ia->ri_id->qp, + seg->mr_chunk.rl_mw->r.mw, ¶m); + rpcrdma_unmap_one(ia, seg); + } + if (rc) + dprintk("RPC: %s: failed ib_(un)bind_mw," + " status %i\n", __func__, rc); + else + r = NULL; /* will upcall on completion */ + break; + + default: + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); + seg1->mr_chunk.rl_mr = NULL; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + if (rc) + dprintk("RPC: %s: failed ib_dereg_mr," + " status %i\n", __func__, rc); + break; + } + if (r) { + struct rpcrdma_rep *rep = r; + void (*func)(struct rpcrdma_rep *) = rep->rr_func; + rep->rr_func = NULL; + func(rep); /* dereg done, callback now */ + } + return nsegs; +} + +/* + * Prepost any receive buffer, then post send. + * + * Receive buffer is donated to hardware, reclaimed upon recv completion. + */ +int +rpcrdma_ep_post(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_req *req) +{ + struct ib_send_wr send_wr, *send_wr_fail; + struct rpcrdma_rep *rep = req->rl_reply; + int rc; + + if (rep) { + rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (rc) + goto out; + req->rl_reply = NULL; + } + + send_wr.next = NULL; + send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.sg_list = req->rl_send_iov; + send_wr.num_sge = req->rl_niovs; + send_wr.opcode = IB_WR_SEND; + send_wr.imm_data = 0; + if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[3].addr, req->rl_send_iov[3].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[1].addr, req->rl_send_iov[1].length, + DMA_TO_DEVICE); + ib_dma_sync_single_for_device(ia->ri_id->device, + req->rl_send_iov[0].addr, req->rl_send_iov[0].length, + DMA_TO_DEVICE); + + if (DECR_CQCOUNT(ep) > 0) + send_wr.send_flags = 0; + else { /* Provider must take a send completion every now and then */ + INIT_CQCOUNT(ep); + send_wr.send_flags = IB_SEND_SIGNALED; + } + + rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); + if (rc) + dprintk("RPC: %s: ib_post_send returned %i\n", __func__, + rc); +out: + return rc; +} + +/* + * (Re)post a receive buffer. + */ +int +rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_rep *rep) +{ + struct ib_recv_wr recv_wr, *recv_wr_fail; + int rc; + + recv_wr.next = NULL; + recv_wr.wr_id = (u64) (unsigned long) rep; + recv_wr.sg_list = &rep->rr_iov; + recv_wr.num_sge = 1; + + ib_dma_sync_single_for_cpu(ia->ri_id->device, + rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); + + DECR_CQCOUNT(ep); + rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + + if (rc) + dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, + rc); + return rc; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h new file mode 100644 index 0000000..2427822 --- /dev/null +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRT_RDMA_H +#define _LINUX_SUNRPC_XPRT_RDMA_H + +#include /* wait_queue_head_t, etc */ +#include /* spinlock_t, etc */ +#include /* atomic_t, etc */ + +#include /* RDMA connection api */ +#include /* RDMA verbs api */ + +#include /* rpc_xprt */ +#include /* RPC/RDMA protocol */ +#include /* xprt parameters */ + +/* + * Interface Adapter -- one per transport instance + */ +struct rpcrdma_ia { + struct rdma_cm_id *ri_id; + struct ib_pd *ri_pd; + struct ib_mr *ri_bind_mem; + struct completion ri_done; + int ri_async_rc; + enum rpcrdma_memreg ri_memreg_strategy; +}; + +/* + * RDMA Endpoint -- one per transport instance + */ + +struct rpcrdma_ep { + atomic_t rep_cqcount; + int rep_cqinit; + int rep_connected; + struct rpcrdma_ia *rep_ia; + struct ib_cq *rep_cq; + struct ib_qp_init_attr rep_attr; + wait_queue_head_t rep_connect_wait; + struct ib_sge rep_pad; /* holds zeroed pad */ + struct ib_mr *rep_pad_mr; /* holds zeroed pad */ + void (*rep_func)(struct rpcrdma_ep *); + struct rpc_xprt *rep_xprt; /* for rep_func */ + struct rdma_conn_param rep_remote_cma; + struct sockaddr_storage rep_remote_addr; +}; + +#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) +#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) + +/* + * struct rpcrdma_rep -- this structure encapsulates state required to recv + * and complete a reply, asychronously. It needs several pieces of + * state: + * o recv buffer (posted to provider) + * o ib_sge (also donated to provider) + * o status of reply (length, success or not) + * o bookkeeping state to get run by tasklet (list, etc) + * + * These are allocated during initialization, per-transport instance; + * however, the tasklet execution list itself is global, as it should + * always be pretty short. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + */ + +/* temporary static scatter/gather max */ +#define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather */ +#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ +#define MAX_RPCRDMAHDR (\ + /* max supported RPC/RDMA header */ \ + sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ + (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) + +struct rpcrdma_buffer; + +struct rpcrdma_rep { + unsigned int rr_len; /* actual received reply length */ + struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ + struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ + void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ + struct list_head rr_list; /* tasklet list */ + wait_queue_head_t rr_unbind; /* optional unbind wait */ + struct ib_sge rr_iov; /* for posting */ + struct ib_mr *rr_handle; /* handle for mem in rr_iov */ + char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ +}; + +/* + * struct rpcrdma_req -- structure central to the request/reply sequence. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + * + * It includes pre-registered buffer memory for send AND recv. + * The recv buffer, however, is not owned by this structure, and + * is "donated" to the hardware when a recv is posted. When a + * reply is handled, the recv buffer used is given back to the + * struct rpcrdma_req associated with the request. + * + * In addition to the basic memory, this structure includes an array + * of iovs for send operations. The reason is that the iovs passed to + * ib_post_{send,recv} must not be modified until the work request + * completes. + * + * NOTES: + * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we + * marshal. The number needed varies depending on the iov lists that + * are passed to us, the memory registration mode we are in, and if + * physical addressing is used, the layout. + */ + +struct rpcrdma_mr_seg { /* chunk descriptors */ + union { /* chunk memory handles */ + struct ib_mr *rl_mr; /* if registered directly */ + struct rpcrdma_mw { /* if registered from region */ + union { + struct ib_mw *mw; + struct ib_fmr *fmr; + } r; + struct list_head mw_list; + } *rl_mw; + } mr_chunk; + u64 mr_base; /* registration result */ + u32 mr_rkey; /* registration result */ + u32 mr_len; /* length of chunk or segment */ + int mr_nsegs; /* number of segments in chunk or 0 */ + enum dma_data_direction mr_dir; /* segment mapping direction */ + dma_addr_t mr_dma; /* segment mapping address */ + size_t mr_dmalen; /* segment mapping length */ + struct page *mr_page; /* owning page, if any */ + char *mr_offset; /* kva if no page, else offset */ +}; + +struct rpcrdma_req { + size_t rl_size; /* actual length of buffer */ + unsigned int rl_niovs; /* 0, 2 or 4 */ + unsigned int rl_nchunks; /* non-zero if chunks */ + struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ + struct ib_sge rl_send_iov[4]; /* for active requests */ + struct ib_sge rl_iov; /* for posting */ + struct ib_mr *rl_handle; /* handle for mem in rl_iov */ + char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ + __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ +}; +#define rpcr_to_rdmar(r) \ + container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) + +/* + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for + * inline requests/replies, and client/server credits. + * + * One of these is associated with a transport instance + */ +struct rpcrdma_buffer { + spinlock_t rb_lock; /* protects indexes */ + atomic_t rb_credits; /* most recent server credits */ + unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ + int rb_max_requests;/* client max requests */ + struct list_head rb_mws; /* optional memory windows/fmrs */ + int rb_send_index; + struct rpcrdma_req **rb_send_bufs; + int rb_recv_index; + struct rpcrdma_rep **rb_recv_bufs; + char *rb_pool; +}; +#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) + +/* + * Internal structure for transport instance creation. This + * exists primarily for modularity. + * + * This data should be set with mount options + */ +struct rpcrdma_create_data_internal { + struct sockaddr_storage addr; /* RDMA server address */ + unsigned int max_requests; /* max requests (slots) in flight */ + unsigned int rsize; /* mount rsize - max read hdr+data */ + unsigned int wsize; /* mount wsize - max write hdr+data */ + unsigned int inline_rsize; /* max non-rdma read data payload */ + unsigned int inline_wsize; /* max non-rdma write data payload */ + unsigned int padding; /* non-rdma write header padding */ +}; + +#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize) + +#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize) + +#define RPCRDMA_INLINE_PAD_VALUE(rq)\ + rpcx_to_rdmad(rq->rq_task->tk_xprt).padding + +/* + * Statistics for RPCRDMA + */ +struct rpcrdma_stats { + unsigned long read_chunk_count; + unsigned long write_chunk_count; + unsigned long reply_chunk_count; + + unsigned long long total_rdma_request; + unsigned long long total_rdma_reply; + + unsigned long long pullup_copy_count; + unsigned long long fixup_copy_count; + unsigned long hardway_register_count; + unsigned long failed_marshal_count; + unsigned long bad_reply_count; +}; + +/* + * RPCRDMA transport -- encapsulates the structures above for + * integration with RPC. + * + * The contained structures are embedded, not pointers, + * for convenience. This structure need not be visible externally. + * + * It is allocated and initialized during mount, and released + * during unmount. + */ +struct rpcrdma_xprt { + struct rpc_xprt xprt; + struct rpcrdma_ia rx_ia; + struct rpcrdma_ep rx_ep; + struct rpcrdma_buffer rx_buf; + struct rpcrdma_create_data_internal rx_data; + struct delayed_work rdma_connect; + struct rpcrdma_stats rx_stats; +}; + +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) +#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) + +/* + * Interface Adapter calls - xprtrdma/verbs.c + */ +int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +void rpcrdma_ia_close(struct rpcrdma_ia *); + +/* + * Endpoint calls - xprtrdma/verbs.c + */ +int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_req *); +int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_rep *); + +/* + * Buffer calls - xprtrdma/verbs.c + */ +int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, + struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); + +struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); +void rpcrdma_buffer_put(struct rpcrdma_req *); +void rpcrdma_recv_buffer_get(struct rpcrdma_req *); +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); + +int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int, + struct ib_mr **, struct ib_sge *); +int rpcrdma_deregister_internal(struct rpcrdma_ia *, + struct ib_mr *, struct ib_sge *); + +int rpcrdma_register_external(struct rpcrdma_mr_seg *, + int, int, struct rpcrdma_xprt *); +int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, + struct rpcrdma_xprt *, void *); + +/* + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c + */ +void rpcrdma_conn_func(struct rpcrdma_ep *); +void rpcrdma_reply_handler(struct rpcrdma_rep *); + +/* + * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c + */ +int rpcrdma_marshal_req(struct rpc_rqst *); + +#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4ae7eed..e97e4ca 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -13,10 +13,14 @@ * (C) 1999 Trond Myklebust * * IP socket transport implementation, (C) 2005 Chuck Lever + * + * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005. + * */ #include #include +#include #include #include #include @@ -28,6 +32,7 @@ #include #include #include +#include #include #include @@ -260,14 +265,29 @@ struct sock_xprt { #define TCP_RCV_COPY_XID (1UL << 2) #define TCP_RCV_COPY_DATA (1UL << 3) -static void xs_format_peer_addresses(struct rpc_xprt *xprt) +static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) +{ + return (struct sockaddr *) &xprt->addr; +} + +static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) { - struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr; + return (struct sockaddr_in *) &xprt->addr; +} + +static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt) +{ + return (struct sockaddr_in6 *) &xprt->addr; +} + +static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr_in *addr = xs_addr_in(xprt); char *buf; buf = kzalloc(20, GFP_KERNEL); if (buf) { - snprintf(buf, 20, "%u.%u.%u.%u", + snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr)); } xprt->address_strings[RPC_DISPLAY_ADDR] = buf; @@ -279,26 +299,123 @@ static void xs_format_peer_addresses(struct rpc_xprt *xprt) } xprt->address_strings[RPC_DISPLAY_PORT] = buf; - if (xprt->prot == IPPROTO_UDP) - xprt->address_strings[RPC_DISPLAY_PROTO] = "udp"; - else - xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp"; + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + if (xprt->prot == IPPROTO_UDP) + snprintf(buf, 8, "udp"); + else + snprintf(buf, 8, "tcp"); + } + xprt->address_strings[RPC_DISPLAY_PROTO] = buf; buf = kzalloc(48, GFP_KERNEL); if (buf) { - snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s", + snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", NIPQUAD(addr->sin_addr.s_addr), ntohs(addr->sin_port), xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); } xprt->address_strings[RPC_DISPLAY_ALL] = buf; + + buf = kzalloc(10, GFP_KERNEL); + if (buf) { + snprintf(buf, 10, "%02x%02x%02x%02x", + NIPQUAD(addr->sin_addr.s_addr)); + } + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + snprintf(buf, 8, "%4hx", + ntohs(addr->sin_port)); + } + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; + + buf = kzalloc(30, GFP_KERNEL); + if (buf) { + snprintf(buf, 30, NIPQUAD_FMT".%u.%u", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port) >> 8, + ntohs(addr->sin_port) & 0xff); + } + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; + + xprt->address_strings[RPC_DISPLAY_NETID] = + kstrdup(xprt->prot == IPPROTO_UDP ? + RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL); +} + +static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr_in6 *addr = xs_addr_in6(xprt); + char *buf; + + buf = kzalloc(40, GFP_KERNEL); + if (buf) { + snprintf(buf, 40, NIP6_FMT, + NIP6(addr->sin6_addr)); + } + xprt->address_strings[RPC_DISPLAY_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + snprintf(buf, 8, "%u", + ntohs(addr->sin6_port)); + } + xprt->address_strings[RPC_DISPLAY_PORT] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + if (xprt->prot == IPPROTO_UDP) + snprintf(buf, 8, "udp"); + else + snprintf(buf, 8, "tcp"); + } + xprt->address_strings[RPC_DISPLAY_PROTO] = buf; + + buf = kzalloc(64, GFP_KERNEL); + if (buf) { + snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s", + NIP6(addr->sin6_addr), + ntohs(addr->sin6_port), + xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); + } + xprt->address_strings[RPC_DISPLAY_ALL] = buf; + + buf = kzalloc(36, GFP_KERNEL); + if (buf) { + snprintf(buf, 36, NIP6_SEQFMT, + NIP6(addr->sin6_addr)); + } + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + snprintf(buf, 8, "%4hx", + ntohs(addr->sin6_port)); + } + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; + + buf = kzalloc(50, GFP_KERNEL); + if (buf) { + snprintf(buf, 50, NIP6_FMT".%u.%u", + NIP6(addr->sin6_addr), + ntohs(addr->sin6_port) >> 8, + ntohs(addr->sin6_port) & 0xff); + } + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; + + xprt->address_strings[RPC_DISPLAY_NETID] = + kstrdup(xprt->prot == IPPROTO_UDP ? + RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL); } static void xs_free_peer_addresses(struct rpc_xprt *xprt) { - kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); - kfree(xprt->address_strings[RPC_DISPLAY_PORT]); - kfree(xprt->address_strings[RPC_DISPLAY_ALL]); + int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + kfree(xprt->address_strings[i]); } #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) @@ -463,13 +580,14 @@ static int xs_udp_send_request(struct rpc_task *task) req->rq_xtime = jiffies; status = xs_sendpages(transport->sock, - (struct sockaddr *) &xprt->addr, + xs_addr(xprt), xprt->addrlen, xdr, req->rq_bytes_sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); + task->tk_bytes_sent += status; if (likely(status >= (int) req->rq_slen)) return 0; @@ -523,7 +641,8 @@ static int xs_tcp_send_request(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; - int status, retry = 0; + int status; + unsigned int retry = 0; xs_encode_tcp_record_marker(&req->rq_snd_buf); @@ -661,6 +780,7 @@ static void xs_destroy(struct rpc_xprt *xprt) xs_free_peer_addresses(xprt); kfree(xprt->slot); kfree(xprt); + module_put(THIS_MODULE); } static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) @@ -1139,14 +1259,23 @@ static unsigned short xs_get_random_port(void) */ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) { - struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr; + struct sockaddr *addr = xs_addr(xprt); dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); - sap->sin_port = htons(port); + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *)addr)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + break; + default: + BUG(); + } } -static int xs_bind(struct sock_xprt *transport, struct socket *sock) +static int xs_bind4(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_in myaddr = { .sin_family = AF_INET, @@ -1174,8 +1303,42 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) else port--; } while (err == -EADDRINUSE && port != transport->port); - dprintk("RPC: xs_bind "NIPQUAD_FMT":%u: %s (%d)\n", - NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err); + dprintk("RPC: %s "NIPQUAD_FMT":%u: %s (%d)\n", + __FUNCTION__, NIPQUAD(myaddr.sin_addr), + port, err ? "failed" : "ok", err); + return err; +} + +static int xs_bind6(struct sock_xprt *transport, struct socket *sock) +{ + struct sockaddr_in6 myaddr = { + .sin6_family = AF_INET6, + }; + struct sockaddr_in6 *sa; + int err; + unsigned short port = transport->port; + + if (!transport->xprt.resvport) + port = 0; + sa = (struct sockaddr_in6 *)&transport->addr; + myaddr.sin6_addr = sa->sin6_addr; + do { + myaddr.sin6_port = htons(port); + err = kernel_bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (!transport->xprt.resvport) + break; + if (err == 0) { + transport->port = port; + break; + } + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; + } while (err == -EADDRINUSE && port != transport->port); + dprintk("RPC: xs_bind6 "NIP6_FMT":%u: %s (%d)\n", + NIP6(myaddr.sin6_addr), port, err ? "failed" : "ok", err); return err; } @@ -1183,38 +1346,69 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) static struct lock_class_key xs_key[2]; static struct lock_class_key xs_slock_key[2]; -static inline void xs_reclassify_socket(struct socket *sock) +static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; + BUG_ON(sk->sk_lock.owner != NULL); - switch (sk->sk_family) { - case AF_INET: - sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS", - &xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]); - break; + sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", + &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); +} - case AF_INET6: - sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS", - &xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]); - break; +static inline void xs_reclassify_socket6(struct socket *sock) +{ + struct sock *sk = sock->sk; - default: - BUG(); - } + BUG_ON(sk->sk_lock.owner != NULL); + sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", + &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); } #else -static inline void xs_reclassify_socket(struct socket *sock) +static inline void xs_reclassify_socket4(struct socket *sock) +{ +} + +static inline void xs_reclassify_socket6(struct socket *sock) { } #endif +static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + transport->old_data_ready = sk->sk_data_ready; + transport->old_state_change = sk->sk_state_change; + transport->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; + + xprt_set_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + xs_udp_do_set_buffer_size(xprt); +} + /** - * xs_udp_connect_worker - set up a UDP socket + * xs_udp_connect_worker4 - set up a UDP socket * @work: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_udp_connect_worker(struct work_struct *work) +static void xs_udp_connect_worker4(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); @@ -1232,9 +1426,9 @@ static void xs_udp_connect_worker(struct work_struct *work) dprintk("RPC: can't create UDP transport socket (%d).\n", -err); goto out; } - xs_reclassify_socket(sock); + xs_reclassify_socket4(sock); - if (xs_bind(transport, sock)) { + if (xs_bind4(transport, sock)) { sock_release(sock); goto out; } @@ -1242,29 +1436,48 @@ static void xs_udp_connect_worker(struct work_struct *work) dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - if (!transport->inet) { - struct sock *sk = sock->sk; + xs_udp_finish_connecting(xprt, sock); + status = 0; +out: + xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); +} - write_lock_bh(&sk->sk_callback_lock); +/** + * xs_udp_connect_worker6 - set up a UDP socket + * @work: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_udp_connect_worker6(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock = transport->sock; + int err, status = -EIO; - sk->sk_user_data = xprt; - transport->old_data_ready = sk->sk_data_ready; - transport->old_state_change = sk->sk_state_change; - transport->old_write_space = sk->sk_write_space; - sk->sk_data_ready = xs_udp_data_ready; - sk->sk_write_space = xs_udp_write_space; - sk->sk_no_check = UDP_CSUM_NORCV; - sk->sk_allocation = GFP_ATOMIC; + if (xprt->shutdown || !xprt_bound(xprt)) + goto out; - xprt_set_connected(xprt); + /* Start by resetting any existing state */ + xs_close(xprt); - /* Reset to new socket */ - transport->sock = sock; - transport->inet = sk; + if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); + goto out; + } + xs_reclassify_socket6(sock); - write_unlock_bh(&sk->sk_callback_lock); + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out; } - xs_udp_do_set_buffer_size(xprt); + + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); + + xs_udp_finish_connecting(xprt, sock); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1295,13 +1508,52 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) result); } +static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + transport->old_data_ready = sk->sk_data_ready; + transport->old_state_change = sk->sk_state_change; + transport->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + sk->sk_allocation = GFP_ATOMIC; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); +} + /** - * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint * @work: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_tcp_connect_worker(struct work_struct *work) +static void xs_tcp_connect_worker4(struct work_struct *work) { struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); @@ -1315,13 +1567,12 @@ static void xs_tcp_connect_worker(struct work_struct *work) if (!sock) { /* start from scratch */ if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport " - "socket (%d).\n", -err); + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } - xs_reclassify_socket(sock); + xs_reclassify_socket4(sock); - if (xs_bind(transport, sock)) { + if (xs_bind4(transport, sock) < 0) { sock_release(sock); goto out; } @@ -1332,43 +1583,70 @@ static void xs_tcp_connect_worker(struct work_struct *work) dprintk("RPC: worker connecting xprt %p to address: %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - if (!transport->inet) { - struct sock *sk = sock->sk; - - write_lock_bh(&sk->sk_callback_lock); + status = xs_tcp_finish_connecting(xprt, sock); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), + sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_close(xprt); + break; + } + } +out: + xprt_wake_pending_tasks(xprt, status); +out_clear: + xprt_clear_connecting(xprt); +} - sk->sk_user_data = xprt; - transport->old_data_ready = sk->sk_data_ready; - transport->old_state_change = sk->sk_state_change; - transport->old_write_space = sk->sk_write_space; - sk->sk_data_ready = xs_tcp_data_ready; - sk->sk_state_change = xs_tcp_state_change; - sk->sk_write_space = xs_tcp_write_space; - sk->sk_allocation = GFP_ATOMIC; +/** + * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint + * @work: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_tcp_connect_worker6(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock = transport->sock; + int err, status = -EIO; - /* socket options */ - sk->sk_userlocks |= SOCK_BINDPORT_LOCK; - sock_reset_flag(sk, SOCK_LINGER); - tcp_sk(sk)->linger2 = 0; - tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + if (xprt->shutdown || !xprt_bound(xprt)) + goto out; - xprt_clear_connected(xprt); + if (!sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } + xs_reclassify_socket6(sock); - /* Reset to new socket */ - transport->sock = sock; - transport->inet = sk; + if (xs_bind6(transport, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); - write_unlock_bh(&sk->sk_callback_lock); - } + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); - /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; - status = kernel_connect(sock, (struct sockaddr *) &xprt->addr, - xprt->addrlen, O_NONBLOCK); + status = xs_tcp_finish_connecting(xprt, sock); dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), - sock->sk->sk_state); + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); if (status < 0) { switch (status) { case -EINPROGRESS: @@ -1508,7 +1786,8 @@ static struct rpc_xprt_ops xs_tcp_ops = { .print_stats = xs_tcp_print_stats, }; -static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size) +static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, + unsigned int slot_table_size) { struct rpc_xprt *xprt; struct sock_xprt *new; @@ -1549,8 +1828,9 @@ static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned * @args: rpc transport creation arguments * */ -struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) +struct rpc_xprt *xs_setup_udp(struct xprt_create *args) { + struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; @@ -1559,15 +1839,11 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); - if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0) - xprt_set_bound(xprt); - xprt->prot = IPPROTO_UDP; xprt->tsh_size = 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker); xprt->bind_timeout = XS_BIND_TO; xprt->connect_timeout = XS_UDP_CONN_TO; xprt->reestablish_timeout = XS_UDP_REEST_TO; @@ -1580,11 +1856,37 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) else xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); - xs_format_peer_addresses(xprt); + switch (addr->sa_family) { + case AF_INET: + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_udp_connect_worker4); + xs_format_ipv4_peer_addresses(xprt); + break; + case AF_INET6: + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, + xs_udp_connect_worker6); + xs_format_ipv6_peer_addresses(xprt); + break; + default: + kfree(xprt); + return ERR_PTR(-EAFNOSUPPORT); + } + dprintk("RPC: set up transport to address %s\n", xprt->address_strings[RPC_DISPLAY_ALL]); - return xprt; + if (try_module_get(THIS_MODULE)) + return xprt; + + kfree(xprt->slot); + kfree(xprt); + return ERR_PTR(-EINVAL); } /** @@ -1592,8 +1894,9 @@ struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) * @args: rpc transport creation arguments * */ -struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args) +struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) { + struct sockaddr *addr = args->dstaddr; struct rpc_xprt *xprt; struct sock_xprt *transport; @@ -1602,14 +1905,10 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args) return xprt; transport = container_of(xprt, struct sock_xprt, xprt); - if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0) - xprt_set_bound(xprt); - xprt->prot = IPPROTO_TCP; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; - INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker); xprt->bind_timeout = XS_BIND_TO; xprt->connect_timeout = XS_TCP_CONN_TO; xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; @@ -1622,15 +1921,55 @@ struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args) else xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); - xs_format_peer_addresses(xprt); + switch (addr->sa_family) { + case AF_INET: + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4); + xs_format_ipv4_peer_addresses(xprt); + break; + case AF_INET6: + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + xprt_set_bound(xprt); + + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6); + xs_format_ipv6_peer_addresses(xprt); + break; + default: + kfree(xprt); + return ERR_PTR(-EAFNOSUPPORT); + } + dprintk("RPC: set up transport to address %s\n", xprt->address_strings[RPC_DISPLAY_ALL]); - return xprt; + if (try_module_get(THIS_MODULE)) + return xprt; + + kfree(xprt->slot); + kfree(xprt); + return ERR_PTR(-EINVAL); } +static struct xprt_class xs_udp_transport = { + .list = LIST_HEAD_INIT(xs_udp_transport.list), + .name = "udp", + .owner = THIS_MODULE, + .ident = IPPROTO_UDP, + .setup = xs_setup_udp, +}; + +static struct xprt_class xs_tcp_transport = { + .list = LIST_HEAD_INIT(xs_tcp_transport.list), + .name = "tcp", + .owner = THIS_MODULE, + .ident = IPPROTO_TCP, + .setup = xs_setup_tcp, +}; + /** - * init_socket_xprt - set up xprtsock's sysctls + * init_socket_xprt - set up xprtsock's sysctls, register with RPC client * */ int init_socket_xprt(void) @@ -1640,11 +1979,14 @@ int init_socket_xprt(void) sunrpc_table_header = register_sysctl_table(sunrpc_table); #endif + xprt_register_transport(&xs_udp_transport); + xprt_register_transport(&xs_tcp_transport); + return 0; } /** - * cleanup_socket_xprt - remove xprtsock's sysctls + * cleanup_socket_xprt - remove xprtsock's sysctls, unregister * */ void cleanup_socket_xprt(void) @@ -1655,4 +1997,7 @@ void cleanup_socket_xprt(void) sunrpc_table_header = NULL; } #endif + + xprt_unregister_transport(&xs_udp_transport); + xprt_unregister_transport(&xs_tcp_transport); }