[openib-general] Re: [PATCH] [CMA] RDMA CM abstraction module
Michael S. Tsirkin
mst at mellanox.co.il
Mon Oct 10 06:57:24 PDT 2005
Quoting Sean Hefty <sean.hefty at intel.com>:
> Subject: [PATCH] [CMA] RDMA CM abstraction module
>
> The following patch adds in a basic RDMA connection management abstraction.
> It is functional, but needs additional work for handling device removal,
> plus several missing features.
>
> I'd like to merge this back into the trunk, and continue working on it
> from there.
>
> This depends on the ib_addr module.
>
> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>
>
>
> Index: include/rdma/rdma_cm.h
> ===================================================================
> --- include/rdma/rdma_cm.h (revision 0)
> +++ include/rdma/rdma_cm.h (revision 0)
> @@ -0,0 +1,201 @@
>
> [........... snip ...............]
>
> +
> +#if !defined(RDMA_CM_H)
> +#define RDMA_CM_H
> +
> +#include <linux/socket.h>
> +#include <rdma/ib_addr.h>
> +#include <rdma/ib_sa.h>
> +
> +/*
> + * Upon receiving a device removal event, users must destroy the associated
> + * RDMA identifier and release all resources allocated with the device.
> + */
> +enum rdma_event_type {
> + RDMA_EVENT_ADDR_RESOLVED,
> + RDMA_EVENT_ADDR_ERROR,
> + RDMA_EVENT_ROUTE_RESOLVED,
> + RDMA_EVENT_ROUTE_ERROR,
> + RDMA_EVENT_CONNECT_REQUEST,
> + RDMA_EVENT_CONNECT_ERROR,
> + RDMA_EVENT_UNREACHABLE,
> + RDMA_EVENT_REJECTED,
> + RDMA_EVENT_ESTABLISHED,
> + RDMA_EVENT_DISCONNECTED,
> + RDMA_EVENT_DEVICE_REMOVAL,
> +};
> +
> +struct rdma_addr {
> + struct sockaddr src_addr;
> + struct sockaddr dst_addr;
> + union {
> + struct ib_addr ibaddr;
> + } addr;
> +};
> +
> +struct rdma_route {
> + struct rdma_addr addr;
> + struct ib_sa_path_rec *path_rec;
> + int num_paths;
> +};
> +
> +struct rdma_event {
> + enum rdma_event_type event;
> + int status;
> + void *private_data;
> + u8 private_data_len;
> +};
Wouldn't it be a good idea to start names with rdma_cm
or rdma_cma, or something like that?
For example, rdma_event_type is a bit confusing since it actually only
includes CM events. Similar comments apply to other names.
> +struct rdma_id;
I propose renaming this to rdma_connection, or something
else more specific than just "id". Does that make sense?
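Just to illustrate what I have in mind (the names below are only a suggestion,
nothing more):

enum rdma_cm_event_type {
	RDMA_CM_EVENT_ADDR_RESOLVED,
	RDMA_CM_EVENT_ADDR_ERROR,
	RDMA_CM_EVENT_ROUTE_RESOLVED,
	RDMA_CM_EVENT_ROUTE_ERROR,
	RDMA_CM_EVENT_CONNECT_REQUEST,
	RDMA_CM_EVENT_CONNECT_ERROR,
	RDMA_CM_EVENT_UNREACHABLE,
	RDMA_CM_EVENT_REJECTED,
	RDMA_CM_EVENT_ESTABLISHED,
	RDMA_CM_EVENT_DISCONNECTED,
	RDMA_CM_EVENT_DEVICE_REMOVAL,
};

struct rdma_cm_id;	/* or rdma_connection, anything more specific than "id" */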
> +/**
> + * rdma_event_handler - Callback used to report user events.
> + *
> + * Notes: Users may not call rdma_destroy_id from this callback to destroy
> + * the passed in id, or a corresponding listen id. Returning a
> + * non-zero value from the callback will destroy the corresponding id.
> + */
> +typedef int (*rdma_event_handler)(struct rdma_id *id, struct rdma_event *event);
> +
> +struct rdma_id {
> + struct ib_device *device;
> + void *context;
> + struct ib_qp *qp;
> + rdma_event_handler event_handler;
> + struct rdma_route route;
> +};
> +
> +struct rdma_id* rdma_create_id(rdma_event_handler event_handler, void *context);
> +
> +void rdma_destroy_id(struct rdma_id *id);
> +
> +/**
> + * rdma_bind_addr - Bind an RDMA identifier to a source address and
> + * associated RDMA device, if needed.
> + *
> + * @id: RDMA identifier.
> + * @addr: Local address information. Wildcard values are permitted.
> + *
> + * This associates a source address with the RDMA identifier before calling
> + * rdma_listen. If a specific local address is given, the RDMA identifier will
> + * be bound to a local RDMA device.
> + */
> +int rdma_bind_addr(struct rdma_id *id, struct sockaddr *addr);
> +
> +/**
> + * rdma_resolve_addr - Resolve destination and optional source addresses
> + * from IP addresses to an RDMA address. If successful, the specified
> + * rdma_id will be bound to a local device.
> + *
> + * @id: RDMA identifier.
> + * @src_addr: Source address information. This parameter may be NULL.
> + * @dst_addr: Destination address information.
> + * @timeout_ms: Time to wait for resolution to complete.
> + */
> +int rdma_resolve_addr(struct rdma_id *id, struct sockaddr *src_addr,
> + struct sockaddr *dst_addr, int timeout_ms);
> +
> +/**
> + * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
> + * into route information needed to establish a connection.
> + *
> + * This is called on the client side of a connection, but its use is optional.
> + * Users must have first called rdma_bind_addr to resolve a dst_addr
> + * into an RDMA address before calling this routine.
> + */
> +int rdma_resolve_route(struct rdma_id *id, int timeout_ms);
I'm not sure I understand what this adds on top of rdma_resolve_addr,
since the only extra parameter here is timeout_ms.
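If I read cma_resolve_ib_route below correctly, this just starts an SA path
record query using the GIDs that rdma_resolve_addr already filled in, so the
client-side sequence would presumably be something like this (the 2000 ms
timeouts are made up, and errors and the event handler are ignored here):

	rdma_resolve_addr(id, NULL, dst_addr, 2000);
	/* wait for RDMA_EVENT_ADDR_RESOLVED */
	rdma_resolve_route(id, 2000);
	/* wait for RDMA_EVENT_ROUTE_RESOLVED */

Is that the idea? If so, maybe the comment could spell this out.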
> +/**
> + * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
> + * identifier.
> + */
> +int rdma_create_qp(struct rdma_id *id, struct ib_pd *pd,
> + struct ib_qp_init_attr *qp_init_attr);
> +
> +/**
> + * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
> + * identifier.
> + *
> + * Users must destroy any QP associated with an RDMA identifier before
> + * destroying the RDMA ID.
> + */
> +void rdma_destroy_qp(struct rdma_id *id);
Not sure what the intended usage is.
When does the user need to call this?
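For what it's worth, my reading of the comments is that the QP lifetime is
nested inside the id lifetime, i.e. something like the following (event
handling elided; pd, init_attr and conn_param are just placeholders):

	rdma_create_qp(id, pd, &init_attr);
	rdma_connect(id, &conn_param);
	/* ... RDMA_EVENT_ESTABLISHED, do work, disconnect ... */
	rdma_destroy_qp(id);	/* per the comment, before destroying the id */
	rdma_destroy_id(id);

If that's the intent, maybe spell it out in the rdma_create_qp comment too.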
> +struct rdma_conn_param {
> + const void *private_data;
> + u8 private_data_len;
> + u8 responder_resources;
> + u8 initiator_depth;
> + u8 flow_control;
> + u8 retry_count; /* ignored when accepting */
> + u8 rnr_retry_count;
> +};
> +
> +/**
> + * rdma_connect - Initiate an active connection request.
> + *
> + * Users must have bound the rdma_id to a local device by having called
> + * rdma_resolve_addr before calling this routine. Users may also resolve the
> + * RDMA address to a route with rdma_resolve_route, but if a route has not
> + * been resolved, a default route will be selected.
> + *
> + * Note that the QP must be in the INIT state.
> + */
> +int rdma_connect(struct rdma_id *id, struct rdma_conn_param *conn_param);
> +
> +/**
> + * rdma_listen - This function is called by the passive side to
> + * listen for incoming connection requests.
> + *
> + * Users must have bound the rdma_id to a local address by calling
> + * rdma_bind_addr before calling this routine.
> + */
> +int rdma_listen(struct rdma_id *id);
> +
> +/**
> + * rdma_accept - Called on the passive side to accept a connection request
> + *
> + * Note that the QP must be in the INIT state.
> + */
> +int rdma_accept(struct rdma_id *id, struct rdma_conn_param *conn_param);
> +
> +/**
> + * rdma_reject - Called on the passive side to reject a connection request.
> + */
> +int rdma_reject(struct rdma_id *id, const void *private_data,
> + u8 private_data_len);
> +
> +/**
> + * rdma_disconnect - This function disconnects the associated QP.
> + */
> +int rdma_disconnect(struct rdma_id *id);
> +
> +#endif /* RDMA_CM_H */
> +
> Index: core/cma.c
> ===================================================================
> --- core/cma.c (revision 0)
> +++ core/cma.c (revision 0)
> @@ -0,0 +1,1207 @@
> +
> [ ......... snip .............. ]
>
> +#include <linux/in.h>
> +#include <linux/in6.h>
> +#include <linux/inetdevice.h>
> +#include <net/arp.h>
> +#include <net/neighbour.h>
> +#include <net/route.h>
> +#include <rdma/rdma_cm.h>
> +#include <rdma/ib_cache.h>
> +#include <rdma/ib_cm.h>
> +#include <rdma/ib_sa.h>
Are all of these headers really needed?
For example, I don't see arp.h used anywhere.
Am I missing something?
> +MODULE_AUTHOR("Guy German");
> +MODULE_DESCRIPTION("Generic RDMA CM Agent");
> +MODULE_LICENSE("Dual BSD/GPL");
> +
> +#define CMA_CM_RESPONSE_TIMEOUT 20
> +#define CMA_MAX_CM_RETRIES 3
> +
> +static void cma_add_one(struct ib_device *device);
> +static void cma_remove_one(struct ib_device *device);
> +
> +static struct ib_client cma_client = {
> + .name = "cma",
> + .add = cma_add_one,
> + .remove = cma_remove_one
> +};
> +
> +static DEFINE_SPINLOCK(lock);
> +static LIST_HEAD(dev_list);
> +
> +struct cma_device {
> + struct list_head list;
> + struct ib_device *device;
> + __be64 node_guid;
> + wait_queue_head_t wait;
> + atomic_t refcount;
> + struct list_head id_list;
> +};
> +
> +enum cma_state {
> + CMA_IDLE,
> + CMA_ADDR_QUERY,
> + CMA_ADDR_RESOLVED,
> + CMA_ROUTE_QUERY,
> + CMA_ROUTE_RESOLVED,
> + CMA_CONNECT,
> + CMA_ADDR_BOUND,
> + CMA_LISTEN,
> + CMA_DEVICE_REMOVAL,
> + CMA_DESTROYING
> +};
> +
> +/*
> + * Device removal can occur at anytime, so we need extra handling to
> + * serialize notifying the user of device removal with other callbacks.
> + * We do this by disabling removal notification while a callback is in process,
> + * and reporting it after the callback completes.
> + */
> +struct rdma_id_private {
> + struct rdma_id id;
> +
> + struct list_head list;
> + struct cma_device *cma_dev;
> +
> + enum cma_state state;
> + spinlock_t lock;
> + wait_queue_head_t wait;
> + atomic_t refcount;
> + atomic_t dev_remove;
> +
> + int timeout_ms;
> + struct ib_sa_query *query;
> + int query_id;
> + struct ib_cm_id *cm_id;
> +};
> +
> +struct cma_addr {
> + u8 version; /* CMA version: 7:4, IP version: 3:0 */
> + u8 reserved;
> + __be16 port;
> + struct {
> + union {
> + struct in6_addr ip6;
> + struct {
> + __be32 pad[3];
> + __be32 addr;
> + } ip4;
> + } ver;
> + } src_addr, dst_addr;
> +};
> +
> +static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
> +{
> + unsigned long flags;
> + int ret;
> +
> + spin_lock_irqsave(&id_priv->lock, flags);
> + ret = (id_priv->state == comp);
> + spin_unlock_irqrestore(&id_priv->lock, flags);
> + return ret;
> +}
> +
> +static int cma_comp_exch(struct rdma_id_private *id_priv,
> + enum cma_state comp, enum cma_state exch)
> +{
> + unsigned long flags;
> + int ret;
> +
> + spin_lock_irqsave(&id_priv->lock, flags);
> + if ((ret = (id_priv->state == comp)))
> + id_priv->state = exch;
> + spin_unlock_irqrestore(&id_priv->lock, flags);
> + return ret;
> +}
> +
> +static enum cma_state cma_exch(struct rdma_id_private *id_priv,
> + enum cma_state exch)
> +{
> + unsigned long flags;
> + enum cma_state old;
> +
> + spin_lock_irqsave(&id_priv->lock, flags);
> + old = id_priv->state;
> + id_priv->state = exch;
> + spin_unlock_irqrestore(&id_priv->lock, flags);
> + return old;
> +}
> +
> +static inline u8 cma_get_ip_ver(struct cma_addr *addr)
> +{
> + return addr->version & 0xF;
> +}
> +
> +static inline u8 cma_get_cma_ver(struct cma_addr *addr)
> +{
> + return addr->version >> 4;
> +}
> +
> +static inline void cma_set_vers(struct cma_addr *addr, u8 cma_ver, u8 ip_ver)
> +{
> + addr->version = (cma_ver << 4) + (ip_ver & 0xF);
> +}
> +
> +static int cma_acquire_ib_dev(struct rdma_id_private *id_priv,
> + union ib_gid *gid)
> +{
> + struct cma_device *cma_dev;
> + unsigned long flags;
> + int ret = -ENODEV;
> + u8 port;
> +
> + spin_lock_irqsave(&lock, flags);
> + list_for_each_entry(cma_dev, &dev_list, list) {
> + ret = ib_find_cached_gid(cma_dev->device, gid, &port, NULL);
> + if (!ret) {
> + atomic_inc(&cma_dev->refcount);
> + id_priv->cma_dev = cma_dev;
> + id_priv->id.device = cma_dev->device;
> + list_add_tail(&id_priv->list, &cma_dev->id_list);
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&lock, flags);
> + return ret;
> +}
> +
> +static void cma_release_dev(struct rdma_id_private *id_priv)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&lock, flags);
> + list_del(&id_priv->list);
> + spin_unlock_irqrestore(&lock, flags);
> +
> + if (atomic_dec_and_test(&id_priv->cma_dev->refcount))
> + wake_up(&id_priv->cma_dev->wait);
> +}
> +
> +static void cma_deref_id(struct rdma_id_private *id_priv)
> +{
> + if (atomic_dec_and_test(&id_priv->refcount))
> + wake_up(&id_priv->wait);
> +}
> +
> +struct rdma_id* rdma_create_id(rdma_event_handler event_handler, void *context)
> +{
> + struct rdma_id_private *id_priv;
> +
> + id_priv = kmalloc(sizeof *id_priv, GFP_KERNEL);
> + if (!id_priv)
> + return NULL;
> + memset(id_priv, 0, sizeof *id_priv);
> +
> + id_priv->state = CMA_IDLE;
> + id_priv->id.context = context;
> + id_priv->id.event_handler = event_handler;
> + spin_lock_init(&id_priv->lock);
> + init_waitqueue_head(&id_priv->wait);
> + atomic_set(&id_priv->refcount, 1);
> + atomic_set(&id_priv->dev_remove, 1);
> +
> + return &id_priv->id;
> +}
> +EXPORT_SYMBOL(rdma_create_id);
> +
> +static int cma_init_ib_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
> +{
> + struct ib_qp_attr qp_attr;
> + struct ib_sa_path_rec *path_rec;
> + int ret;
> +
> + qp_attr.qp_state = IB_QPS_INIT;
> + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
> +
> + path_rec = id_priv->id.route.path_rec;
> + ret = ib_find_cached_gid(id_priv->id.device, &path_rec->sgid,
> + &qp_attr.port_num, NULL);
> + if (ret)
> + return ret;
> +
> + ret = ib_find_cached_pkey(id_priv->id.device, qp_attr.port_num,
> + id_priv->id.route.addr.addr.ibaddr.pkey,
> + &qp_attr.pkey_index);
> + if (ret)
> + return ret;
> +
> + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX | IB_QP_PORT);
> +}
> +
> +int rdma_create_qp(struct rdma_id *id, struct ib_pd *pd,
> + struct ib_qp_init_attr *qp_init_attr)
> +{
> + struct rdma_id_private *id_priv;
> + struct ib_qp *qp;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (id->device != pd->device)
> + return -EINVAL;
> +
> + qp = ib_create_qp(pd, qp_init_attr);
> + if (IS_ERR(qp))
> + return PTR_ERR(qp);
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = cma_init_ib_qp(id_priv, qp);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> +
> + if (ret)
> + goto err;
> +
> + id->qp = qp;
> + return 0;
> +err:
> + ib_destroy_qp(qp);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_create_qp);
What about replacing single-case switch statements with plain if statements?
Like this:

	if (id->device->node_type == IB_NODE_CA)
		ret = cma_init_ib_qp(id_priv, qp);
	else
		ret = -ENOSYS;

Or even:

	ret = id->device->node_type == IB_NODE_CA ?
		cma_init_ib_qp(id_priv, qp) : -ENOSYS;

I also wonder why we really need all these node_type checks.
The code above seems to imply that rdma_create_qp will fail
on non-CA nodes. Why is that?
> +void rdma_destroy_qp(struct rdma_id *id)
> +{
> + ib_destroy_qp(id->qp);
> +}
> +EXPORT_SYMBOL(rdma_destroy_qp);
> +
> +static int cma_modify_ib_qp_rtr(struct rdma_id_private *id_priv)
> +{
> + struct ib_qp_attr qp_attr;
> + int qp_attr_mask, ret;
> +
> + /* Need to update QP attributes from default values. */
> + qp_attr.qp_state = IB_QPS_INIT;
> + ret = ib_cm_init_qp_attr(id_priv->cm_id, &qp_attr, &qp_attr_mask);
> + if (ret)
> + return ret;
> +
> + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
> + if (ret)
> + return ret;
> +
> + qp_attr.qp_state = IB_QPS_RTR;
> + ret = ib_cm_init_qp_attr(id_priv->cm_id, &qp_attr, &qp_attr_mask);
> + if (ret)
> + return ret;
> +
> + qp_attr.rq_psn = id_priv->id.qp->qp_num;
> + return ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
> +}
> +
> +static int cma_modify_ib_qp_rts(struct rdma_id_private *id_priv)
> +{
> + struct ib_qp_attr qp_attr;
> + int qp_attr_mask, ret;
> +
> + qp_attr.qp_state = IB_QPS_RTS;
> + ret = ib_cm_init_qp_attr(id_priv->cm_id, &qp_attr, &qp_attr_mask);
> + if (ret)
> + return ret;
> +
> + return ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
> +}
> +
> +static int cma_modify_qp_err(struct rdma_id *id)
> +{
> + struct ib_qp_attr qp_attr;
> +
> + qp_attr.qp_state = IB_QPS_ERR;
> + return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
> +}
> +
> +static int cma_verify_addr(struct cma_addr *addr,
> + struct sockaddr_in *ip_addr)
> +{
> + if (cma_get_cma_ver(addr) != 1 || cma_get_ip_ver(addr) != 4)
> + return -EINVAL;
> +
> + if (ip_addr->sin_port != be16_to_cpu(addr->port))
> + return -EINVAL;
> +
> + if (ip_addr->sin_addr.s_addr &&
> + (ip_addr->sin_addr.s_addr != be32_to_cpu(addr->dst_addr.
> + ver.ip4.addr)))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static int cma_notify_user(struct rdma_id_private *id_priv,
> + enum rdma_event_type type, int status,
> + void *data, u8 data_len)
> +{
> + struct rdma_event event;
> +
> + event.event = type;
> + event.status = status;
> + event.private_data = data;
> + event.private_data_len = data_len;
> +
> + return id_priv->id.event_handler(&id_priv->id, &event);
> +}
> +
> +static inline void cma_disable_dev_remove(struct rdma_id_private *id_priv)
> +{
> + atomic_inc(&id_priv->dev_remove);
> +}
> +
> +static inline void cma_deref_dev(struct rdma_id_private *id_priv)
> +{
> +// if (atomic_dec_and_test(&id_priv->dev_remove))
> +// wake_up(&id_priv->wait);
> +// return atomic_dec_and_test(&id_priv->dev_remove) ?
> +// cma_notify_user(id_priv, RDMA_EVENT_DEVICE_REMOVAL, -ENODEV,
> +// NULL, 0) : 0;
> +}
The above seems to need some cleanup.
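My guess, judging by cma_disable_dev_remove and the commented-out lines, is
that the intent was something like this (just a sketch):

static inline void cma_deref_dev(struct rdma_id_private *id_priv)
{
	if (atomic_dec_and_test(&id_priv->dev_remove))
		wake_up(&id_priv->wait);
}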
Some of the comments above apply to the patch as a whole, so
I'm preserving the rest of the patch here for reference.
There are no more comments from me below.
Thanks,
MST
----------------------------------------------
> +static void cma_cancel_addr(struct rdma_id_private *id_priv)
> +{
> + switch (id_priv->id.device->node_type) {
> + case IB_NODE_CA:
> + ib_addr_cancel(&id_priv->id.route.addr.addr.ibaddr);
> + break;
> + default:
> + break;
> + }
> +}
> +
> +static void cma_cancel_route(struct rdma_id_private *id_priv)
> +{
> + switch (id_priv->id.device->node_type) {
> + case IB_NODE_CA:
> + ib_sa_cancel_query(id_priv->query_id, id_priv->query);
> + break;
> + default:
> + break;
> + }
> +}
> +
> +static void cma_cancel_operation(struct rdma_id_private *id_priv,
> + enum cma_state state)
> +{
> + switch (state) {
> + case CMA_ADDR_QUERY:
> + cma_cancel_addr(id_priv);
> + break;
> + case CMA_ROUTE_QUERY:
> + cma_cancel_route(id_priv);
> + break;
> + default:
> + break;
> + }
> +}
> +
> +static void cma_free_id(struct rdma_id_private *id_priv)
> +{
> + if (id_priv->cma_dev) {
> + switch (id_priv->id.device->node_type) {
> + case IB_NODE_CA:
> + if (id_priv->cm_id && !IS_ERR(id_priv->cm_id))
> + ib_destroy_cm_id(id_priv->cm_id);
> + break;
> + default:
> + break;
> + }
> + cma_release_dev(id_priv);
> + }
> +
> + atomic_dec(&id_priv->refcount);
> + wait_event(id_priv->wait, !atomic_read(&id_priv->refcount));
> +
> + kfree(id_priv->id.route.path_rec);
> + kfree(id_priv);
> +}
> +
> +void rdma_destroy_id(struct rdma_id *id)
> +{
> + struct rdma_id_private *id_priv;
> + enum cma_state state;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> +
> + state = cma_exch(id_priv, CMA_DESTROYING);
> + cma_cancel_operation(id_priv, state);
> + cma_free_id(id_priv);
> +}
> +EXPORT_SYMBOL(rdma_destroy_id);
> +
> +static int cma_rep_recv(struct rdma_id_private *id_priv)
> +{
> + int ret;
> +
> + ret = cma_modify_ib_qp_rtr(id_priv);
> + if (ret)
> + goto reject;
> +
> + ret = cma_modify_ib_qp_rts(id_priv);
> + if (ret)
> + goto reject;
> +
> + ret = ib_send_cm_rtu(id_priv->cm_id, NULL, 0);
> + if (ret)
> + goto reject;
> +
> + return 0;
> +reject:
> + cma_modify_qp_err(&id_priv->id);
> + ib_send_cm_rej(id_priv->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
> + NULL, 0, NULL, 0);
> + return ret;
> +}
> +
> +static int cma_rtu_recv(struct rdma_id_private *id_priv)
> +{
> + int ret;
> +
> + ret = cma_modify_ib_qp_rts(id_priv);
> + if (ret)
> + goto reject;
> +
> + return 0;
> +reject:
> + cma_modify_qp_err(&id_priv->id);
> + ib_send_cm_rej(id_priv->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
> + NULL, 0, NULL, 0);
> + return ret;
> +}
> +
> +static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
> +{
> + struct rdma_id_private *id_priv = cm_id->context;
> + enum rdma_event_type event;
> + u8 private_data_len = 0;
> + int ret = 0, status = 0;
> +
> + if (!cma_comp(id_priv, CMA_CONNECT))
> + return 0;
> +
> + switch (ib_event->event) {
> + case IB_CM_REQ_ERROR:
> + case IB_CM_REP_ERROR:
> + event = RDMA_EVENT_UNREACHABLE;
> + status = -ETIMEDOUT;
> + break;
> + case IB_CM_REP_RECEIVED:
> + status = cma_rep_recv(id_priv);
> + event = status ? RDMA_EVENT_CONNECT_ERROR :
> + RDMA_EVENT_ESTABLISHED;
> + private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
> + break;
> + case IB_CM_RTU_RECEIVED:
> + status = cma_rtu_recv(id_priv);
> + event = status ? RDMA_EVENT_CONNECT_ERROR :
> + RDMA_EVENT_ESTABLISHED;
> + break;
> + case IB_CM_DREQ_ERROR:
> + status = -ETIMEDOUT; /* fall through */
> + case IB_CM_DREQ_RECEIVED:
> + case IB_CM_DREP_RECEIVED:
> + event = RDMA_EVENT_DISCONNECTED;
> + break;
> + case IB_CM_TIMEWAIT_EXIT:
> + case IB_CM_MRA_RECEIVED:
> + /* ignore event */
> + goto out;
> + case IB_CM_REJ_RECEIVED:
> + cma_modify_qp_err(&id_priv->id);
> + status = ib_event->param.rej_rcvd.reason;
> + event = RDMA_EVENT_REJECTED;
> + break;
> + default:
> + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
> + ib_event->event);
> + goto out;
> + }
> +
> + ret = cma_notify_user(id_priv, event, status, ib_event->private_data,
> + private_data_len);
> + if (ret) {
> + /* Destroy the CM ID by returning a non-zero value. */
> + id_priv->cm_id = NULL;
> + rdma_destroy_id(&id_priv->id);
> + }
> +out:
> + return ret;
> +}
> +
> +static struct rdma_id_private* cma_new_id(struct rdma_id *listen_id,
> + struct ib_cm_event *ib_event)
> +{
> + struct rdma_id_private *id_priv;
> + struct rdma_id *id;
> + struct rdma_route *route;
> + struct sockaddr_in *ip_addr;
> + struct ib_sa_path_rec *path_rec;
> + struct cma_addr *addr;
> + int num_paths;
> +
> + ip_addr = (struct sockaddr_in *) &listen_id->route.addr.src_addr;
> + if (cma_verify_addr(ib_event->private_data, ip_addr))
> + return NULL;
> +
> + num_paths = 1 + (ib_event->param.req_rcvd.alternate_path != NULL);
> + path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
> + if (!path_rec)
> + return NULL;
> +
> + id = rdma_create_id(listen_id->event_handler, listen_id->context);
> + if (!id)
> + goto err;
> +
> + route = &id->route;
> + route->addr.src_addr = listen_id->route.addr.src_addr;
> + route->addr.dst_addr.sa_family = ip_addr->sin_family;
> +
> + ip_addr = (struct sockaddr_in *) &route->addr.dst_addr;
> + addr = ib_event->private_data;
> + ip_addr->sin_addr.s_addr = be32_to_cpu(addr->src_addr.ver.ip4.addr);
> +
> + route->num_paths = num_paths;
> + route->path_rec = path_rec;
> + path_rec[0] = *ib_event->param.req_rcvd.primary_path;
> + if (num_paths == 2)
> + path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
> +
> + route->addr.addr.ibaddr.sgid = path_rec->dgid;
> + route->addr.addr.ibaddr.dgid = path_rec->sgid;
> + route->addr.addr.ibaddr.pkey = be16_to_cpu(path_rec->pkey);
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + id_priv->state = CMA_CONNECT;
> + return id_priv;
> +err:
> + kfree(path_rec);
> + return NULL;
> +}
> +
> +static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
> +{
> + struct rdma_id_private *listen_id, *conn_id;
> + int offset, ret;
> +
> + listen_id = cm_id->context;
> + conn_id = cma_new_id(&listen_id->id, ib_event);
> + if (!conn_id)
> + return -ENOMEM;
> +
> + ret = cma_acquire_ib_dev(conn_id, &conn_id->id.route.path_rec[0].sgid);
> + if (ret) {
> + ret = -ENODEV;
> + goto err;
> + }
> +
> + conn_id->cm_id = cm_id;
> + cm_id->context = conn_id;
> + cm_id->cm_handler = cma_ib_handler;
> + conn_id->state = CMA_CONNECT;
> +
> + offset = sizeof(struct cma_addr);
> + ret = cma_notify_user(conn_id, RDMA_EVENT_CONNECT_REQUEST, 0,
> + ib_event->private_data + offset,
> + IB_CM_REQ_PRIVATE_DATA_SIZE - offset);
> + if (ret) {
> + /* Destroy the CM ID by returning a non-zero value. */
> + conn_id->cm_id = NULL;
> + rdma_destroy_id(&conn_id->id);
> + }
> + return ret;
> +err:
> + rdma_destroy_id(&conn_id->id);
> + return ret;
> +}
> +
> +static __be64 cma_get_service_id(struct sockaddr *addr)
> +{
> + return cpu_to_be64(((u64)IB_OPENIB_OUI << 48) +
> + ((struct sockaddr_in *) addr)->sin_port);
> +}
> +
> +static int cma_ib_listen(struct rdma_id_private *id_priv)
> +{
> + __be64 svc_id;
> + int ret;
> +
> + id_priv->cm_id = ib_create_cm_id(id_priv->id.device, cma_req_handler,
> + id_priv);
> + if (IS_ERR(id_priv->cm_id))
> + return PTR_ERR(id_priv->cm_id);
> +
> + svc_id = cma_get_service_id(&id_priv->id.route.addr.src_addr);
> + ret = ib_cm_listen(id_priv->cm_id, svc_id, 0);
> + if (ret)
> + ib_destroy_cm_id(id_priv->cm_id);
> +
> + return ret;
> +}
> +
> +int rdma_listen(struct rdma_id *id)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
> + return -EINVAL;
> +
> + /* TODO: handle listen across multiple devices */
> + if (!id->device) {
> + ret = -ENOSYS;
> + goto err;
> + }
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = cma_ib_listen(id_priv);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> + if (ret)
> + goto err;
> +
> + return 0;
> +err:
> + cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
> + return ret;
> +};
> +EXPORT_SYMBOL(rdma_listen);
> +
> +static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
> + void *context)
> +{
> + struct rdma_id_private *id_priv = context;
> + struct rdma_route *route = &id_priv->id.route;
> + enum rdma_event_type event = RDMA_EVENT_ROUTE_RESOLVED;
> +
> + if (!status) {
> + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
> + if (route->path_rec) {
> + route->num_paths = 1;
> + *route->path_rec = *path_rec;
> + if (!cma_comp_exch(id_priv, CMA_ROUTE_QUERY,
> + CMA_ROUTE_RESOLVED)) {
> + kfree(route->path_rec);
> + goto out;
> + }
> + } else
> + status = -ENOMEM;
> + }
> +
> + if (status) {
> + if (!cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED))
> + goto out;
> + event = RDMA_EVENT_ROUTE_ERROR;
> + }
> +
> + if (cma_notify_user(id_priv, event, status, NULL, 0)) {
> + cma_deref_id(id_priv);
> + rdma_destroy_id(&id_priv->id);
> + return;
> + }
> +out:
> + cma_deref_id(id_priv);
> +}
> +
> +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
> +{
> + struct ib_addr *addr = &id_priv->id.route.addr.addr.ibaddr;
> + struct ib_sa_path_rec path_rec;
> + int ret;
> + u8 port;
> +
> + ret = ib_find_cached_gid(id_priv->id.device, &addr->sgid, &port, NULL);
> + if (ret)
> + return -ENODEV;
> +
> + memset(&path_rec, 0, sizeof path_rec);
> + path_rec.sgid = addr->sgid;
> + path_rec.dgid = addr->dgid;
> + path_rec.pkey = addr->pkey;
> + path_rec.numb_path = 1;
> +
> + id_priv->query_id = ib_sa_path_rec_get(id_priv->id.device,
> + port, &path_rec,
> + IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
> + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH,
> + timeout_ms, GFP_KERNEL,
> + cma_query_handler, id_priv, &id_priv->query);
> +
> + return (id_priv->query_id < 0) ? id_priv->query_id : 0;
> +}
> +
> +int rdma_resolve_route(struct rdma_id *id, int timeout_ms)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
> + return -EINVAL;
> +
> + atomic_inc(&id_priv->refcount);
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = cma_resolve_ib_route(id_priv, timeout_ms);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> + if (ret)
> + goto err;
> +
> + return 0;
> +err:
> + cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
> + cma_deref_id(id_priv);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_resolve_route);
> +
> +static void addr_handler(int status, struct sockaddr *src_addr,
> + struct ib_addr *ibaddr, void *context)
> +{
> + struct rdma_id_private *id_priv = context;
> + enum rdma_event_type event;
> +
> + if (!status)
> + status = cma_acquire_ib_dev(id_priv, &ibaddr->sgid);
> +
> + if (status) {
> + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_IDLE))
> + goto out;
> + event = RDMA_EVENT_ADDR_ERROR;
> + } else {
> + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED))
> + goto out;
> + id_priv->id.route.addr.src_addr = *src_addr;
> + event = RDMA_EVENT_ADDR_RESOLVED;
> + }
> +
> + if (cma_notify_user(id_priv, event, status, NULL, 0)) {
> + cma_deref_id(id_priv);
> + rdma_destroy_id(&id_priv->id);
> + return;
> + }
> +out:
> + cma_deref_id(id_priv);
> +}
> +
> +int rdma_resolve_addr(struct rdma_id *id, struct sockaddr *src_addr,
> + struct sockaddr *dst_addr, int timeout_ms)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_QUERY))
> + return -EINVAL;
> +
> + atomic_inc(&id_priv->refcount);
> + id->route.addr.dst_addr = *dst_addr;
> + ret = ib_resolve_addr(src_addr, dst_addr, &id->route.addr.addr.ibaddr,
> + timeout_ms, addr_handler, id_priv);
> + if (ret)
> + goto err;
> +
> + return 0;
> +err:
> + cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_IDLE);
> + cma_deref_id(id_priv);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_resolve_addr);
> +
> +int rdma_bind_addr(struct rdma_id *id, struct sockaddr *addr)
> +{
> + struct rdma_id_private *id_priv;
> + struct sockaddr_in *ip_addr = (struct sockaddr_in *) addr;
> + struct ib_addr *ibaddr = &id->route.addr.addr.ibaddr;
> + int ret;
> +
> + if (addr->sa_family != AF_INET)
> + return -EINVAL;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
> + return -EINVAL;
> +
> + if (ip_addr->sin_addr.s_addr) {
> + ret = ib_translate_addr(addr, &ibaddr->sgid, &ibaddr->pkey);
> + if (!ret)
> + ret = cma_acquire_ib_dev(id_priv, &ibaddr->sgid);
> + } else
> + ret = -ENOSYS; /* TODO: support wild card addresses */
> +
> + if (ret)
> + goto err;
> +
> + id->route.addr.src_addr = *addr;
> + return 0;
> +err:
> + cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_bind_addr);
> +
> +static void cma_format_addr(struct cma_addr *addr, struct rdma_route *route)
> +{
> + struct sockaddr_in *ip_addr;
> +
> + memset(addr, 0, sizeof *addr);
> + cma_set_vers(addr, 1, 4);
> +
> + ip_addr = (struct sockaddr_in *) &route->addr.src_addr;
> + addr->src_addr.ver.ip4.addr = cpu_to_be32(ip_addr->sin_addr.s_addr);
> +
> + ip_addr = (struct sockaddr_in *) &route->addr.dst_addr;
> + addr->dst_addr.ver.ip4.addr = cpu_to_be32(ip_addr->sin_addr.s_addr);
> + addr->port = cpu_to_be16(ip_addr->sin_port);
> +}
> +
> +static int cma_connect_ib(struct rdma_id_private *id_priv,
> + struct rdma_conn_param *conn_param)
> +{
> + struct ib_cm_req_param req;
> + struct rdma_route *route;
> + struct cma_addr *addr;
> + void *private_data;
> + int ret;
> +
> + memset(&req, 0, sizeof req);
> + req.private_data_len = sizeof *addr + conn_param->private_data_len;
> +
> + private_data = kmalloc(req.private_data_len, GFP_ATOMIC);
> + if (!private_data)
> + return -ENOMEM;
> +
> + id_priv->cm_id = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
> + id_priv);
> + if (IS_ERR(id_priv->cm_id)) {
> + ret = PTR_ERR(id_priv->cm_id);
> + goto out;
> + }
> +
> + addr = private_data;
> + route = &id_priv->id.route;
> + cma_format_addr(addr, route);
> +
> + if (conn_param->private_data && conn_param->private_data_len)
> + memcpy(addr + 1, conn_param->private_data,
> + conn_param->private_data_len);
> + req.private_data = private_data;
> +
> + req.primary_path = &route->path_rec[0];
> + if (route->num_paths == 2)
> + req.alternate_path = &route->path_rec[1];
> +
> + req.service_id = cma_get_service_id(&route->addr.dst_addr);
> + req.qp_num = id_priv->id.qp->qp_num;
> + req.qp_type = IB_QPT_RC;
> + req.starting_psn = req.qp_num;
> + req.responder_resources = conn_param->responder_resources;
> + req.initiator_depth = conn_param->initiator_depth;
> + req.flow_control = conn_param->flow_control;
> + req.retry_count = conn_param->retry_count;
> + req.rnr_retry_count = conn_param->rnr_retry_count;
> + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
> + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
> + req.max_cm_retries = CMA_MAX_CM_RETRIES;
> + req.srq = id_priv->id.qp->srq ? 1 : 0;
> +
> + ret = ib_send_cm_req(id_priv->cm_id, &req);
> +out:
> + kfree(private_data);
> + return ret;
> +}
> +
> +int rdma_connect(struct rdma_id *id, struct rdma_conn_param *conn_param)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
> + return -EINVAL;
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = cma_connect_ib(id_priv, conn_param);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> + if (ret)
> + goto err;
> +
> + return 0;
> +err:
> + cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_connect);
> +
> +static int cma_accept_ib(struct rdma_id_private *id_priv,
> + struct rdma_conn_param *conn_param)
> +{
> + struct ib_cm_rep_param rep;
> + int ret;
> +
> + ret = cma_modify_ib_qp_rtr(id_priv);
> + if (ret)
> + return ret;
> +
> + memset(&rep, 0, sizeof rep);
> + rep.qp_num = id_priv->id.qp->qp_num;
> + rep.starting_psn = rep.qp_num;
> + rep.private_data = conn_param->private_data;
> + rep.private_data_len = conn_param->private_data_len;
> + rep.responder_resources = conn_param->responder_resources;
> + rep.initiator_depth = conn_param->initiator_depth;
> + rep.target_ack_delay = CMA_CM_RESPONSE_TIMEOUT;
> + rep.failover_accepted = 0;
> + rep.flow_control = conn_param->flow_control;
> + rep.rnr_retry_count = conn_param->rnr_retry_count;
> + rep.srq = id_priv->id.qp->srq ? 1 : 0;
> +
> + return ib_send_cm_rep(id_priv->cm_id, &rep);
> +}
> +
> +int rdma_accept(struct rdma_id *id, struct rdma_conn_param *conn_param)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp(id_priv, CMA_CONNECT))
> + return -EINVAL;
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = cma_accept_ib(id_priv, conn_param);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> +
> + if (ret)
> + goto reject;
> +
> + return 0;
> +reject:
> + cma_modify_qp_err(id);
> + rdma_reject(id, NULL, 0);
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_accept);
> +
> +int rdma_reject(struct rdma_id *id, const void *private_data,
> + u8 private_data_len)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp(id_priv, CMA_CONNECT))
> + return -EINVAL;
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + ret = ib_send_cm_rej(id_priv->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
> + NULL, 0, private_data, private_data_len);
> + break;
> + default:
> + ret = -ENOSYS;
> + break;
> + }
> + return ret;
> +};
> +EXPORT_SYMBOL(rdma_reject);
> +
> +int rdma_disconnect(struct rdma_id *id)
> +{
> + struct rdma_id_private *id_priv;
> + int ret;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> + if (!cma_comp(id_priv, CMA_CONNECT))
> + return -EINVAL;
> +
> + ret = cma_modify_qp_err(id);
> + if (ret)
> + goto out;
> +
> + switch (id->device->node_type) {
> + case IB_NODE_CA:
> + /* Initiate or respond to a disconnect. */
> + if (ib_send_cm_dreq(id_priv->cm_id, NULL, 0))
> + ib_send_cm_drep(id_priv->cm_id, NULL, 0);
> + break;
> + default:
> + break;
> + }
> +out:
> + return ret;
> +}
> +EXPORT_SYMBOL(rdma_disconnect);
> +
> +/* TODO: add this to the device structure - see Roland's patch */
> +static __be64 get_ca_guid(struct ib_device *device)
> +{
> + struct ib_device_attr *device_attr;
> + __be64 guid;
> + int ret;
> +
> + device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
> + if (!device_attr)
> + return 0;
> +
> + ret = ib_query_device(device, device_attr);
> + guid = ret ? 0 : device_attr->node_guid;
> + kfree(device_attr);
> + return guid;
> +}
> +
> +static void cma_add_one(struct ib_device *device)
> +{
> + struct cma_device *cma_dev;
> + unsigned long flags;
> +
> + cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
> + if (!cma_dev)
> + return;
> +
> + cma_dev->device = device;
> + cma_dev->node_guid = get_ca_guid(device);
> + if (!cma_dev->node_guid)
> + goto err;
> +
> + init_waitqueue_head(&cma_dev->wait);
> + atomic_set(&cma_dev->refcount, 1);
> + INIT_LIST_HEAD(&cma_dev->id_list);
> + ib_set_client_data(device, &cma_client, cma_dev);
> +
> + spin_lock_irqsave(&lock, flags);
> + list_add_tail(&cma_dev->list, &dev_list);
> + spin_unlock_irqrestore(&lock, flags);
> + return;
> +err:
> + kfree(cma_dev);
> +}
> +
> +static int cma_remove_id_dev(struct rdma_id_private *id_priv)
> +{
> + enum cma_state state;
> +
> + /* Record that we want to remove the device */
> + state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
> + if (state == CMA_DESTROYING)
> + return 0;
> +
> + /* TODO: wait until safe to process removal. */
> +
> + /* Check for destruction from another callback. */
> + if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
> + return 0;
> +
> + return cma_notify_user(id_priv, RDMA_EVENT_DEVICE_REMOVAL, 0, NULL, 0);
> +}
> +
> +static void cma_process_remove(struct cma_device *cma_dev)
> +{
> + struct list_head remove_list;
> + struct rdma_id_private *id_priv;
> + unsigned long flags;
> + int ret;
> +
> + INIT_LIST_HEAD(&remove_list);
> +
> + spin_lock_irqsave(&lock, flags);
> + while (!list_empty(&cma_dev->id_list)) {
> + id_priv = list_entry(cma_dev->id_list.next,
> + struct rdma_id_private, list);
> + list_del(&id_priv->list);
> + list_add_tail(&id_priv->list, &remove_list);
> + atomic_inc(&id_priv->refcount);
> + spin_unlock_irqrestore(&lock, flags);
> +
> + ret = cma_remove_id_dev(id_priv);
> + cma_deref_id(id_priv);
> + if (ret)
> + rdma_destroy_id(&id_priv->id);
> +
> + spin_lock_irqsave(&lock, flags);
> + }
> + spin_unlock_irqrestore(&lock, flags);
> +
> + atomic_dec(&cma_dev->refcount);
> + wait_event(cma_dev->wait, !atomic_read(&cma_dev->refcount));
> +}
> +
> +static void cma_remove_one(struct ib_device *device)
> +{
> + struct cma_device *cma_dev;
> + unsigned long flags;
> +
> + cma_dev = ib_get_client_data(device, &cma_client);
> + if (!cma_dev)
> + return;
> +
> + spin_lock_irqsave(&lock, flags);
> + list_del(&cma_dev->list);
> + spin_unlock_irqrestore(&lock, flags);
> +
> + cma_process_remove(cma_dev);
> + kfree(cma_dev);
> +}
> +
> +static int cma_init(void)
> +{
> + return ib_register_client(&cma_client);
> +}
> +
> +static void cma_cleanup(void)
> +{
> + ib_unregister_client(&cma_client);
> +}
> +
> +module_init(cma_init);
> +module_exit(cma_cleanup);
--
MST