[openib-general] [PATCH] iWARP Support added to the CMA
Sean Hefty
mshefty at ichips.intel.com
Thu Dec 29 16:53:09 PST 2005
Tom Tucker wrote:
I'm a lot slow to review this, but comments below. I'll start to address some
of them that affect the generic code next week, in particular changes to ib_addr.
> Index: core/cm.c
> ===================================================================
> --- core/cm.c (revision 4186)
> +++ core/cm.c (working copy)
> @@ -3227,6 +3227,10 @@
> int ret;
> u8 i;
>
> + /* Ignore RNIC devices */
> + if (device->node_type == IB_NODE_RNIC)
> + return;
> +
> cm_dev = kmalloc(sizeof(*cm_dev) + sizeof(*port) *
> device->phys_port_cnt, GFP_KERNEL);
> if (!cm_dev)
> @@ -3291,6 +3295,10 @@
> if (!cm_dev)
> return;
>
> + /* Ignore RNIC devices */
> + if (device->node_type == IB_NODE_RNIC)
> + return;
> +
> write_lock_irqsave(&cm.device_lock, flags);
> list_del(&cm_dev->list);
> write_unlock_irqrestore(&cm.device_lock, flags);
The changes to cm_remove_one() are not needed. ib_get_client_data() should
return NULL because IB_NODE_RNIC is skipped in cm_add_one().
> Index: core/addr.c
> ===================================================================
> --- core/addr.c (revision 4186)
> +++ core/addr.c (working copy)
> @@ -73,8 +73,13 @@
> if (!dev)
> return -EADDRNOTAVAIL;
>
> - *gid = *(union ib_gid *) (dev->dev_addr + 4);
> - *pkey = addr_get_pkey(dev);
> + if (dev->type == ARPHRD_INFINIBAND) {
> + *gid = *(union ib_gid *) (dev->dev_addr + 4);
> + *pkey = addr_get_pkey(dev);
> + } else {
> + *gid = *(union ib_gid *) (dev->dev_addr);
> + *pkey = 0;
> + }
> dev_put(dev);
> return 0;
> }
If this call is being used, we should consider changing it to something more
generic, rather than returning a "gid" as the hardware address for a non-IB
device. One possibility is to make the API return the hardware address of a
given IP address. The CMA can then break that address into a GID/pkey pair if
needed.
> @@ -476,8 +498,22 @@
> state = cma_exch(id_priv, CMA_DESTROYING);
> cma_cancel_operation(id_priv, state);
>
> - if (id_priv->cm_id && !IS_ERR(id_priv->cm_id))
> - ib_destroy_cm_id(id_priv->cm_id);
> + if (id->device) {
> + switch (id->device->node_type) {
> + case IB_NODE_RNIC:
> + if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) {
> + iw_destroy_cm_id(id_priv->cm_id.iw);
> + id_priv->cm_id.iw = 0;
> + }
> + break;
> + default:
> + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) {
> + ib_destroy_cm_id(id_priv->cm_id.ib);
> + id_priv->cm_id.ib = 0;
The iw/ib devices should be set to NULL instead of assigned 0.
> + ret = cma_notify_user(id_priv,
> + event_type,
> + event->status,
> + event->private_data,
> + event->private_data_len);
> + if (ret) {
> + /* Destroy the CM ID by returning a non-zero value. */
> + id_priv->cm_id.iw = NULL;
> + cma_exch(id_priv, CMA_DESTROYING);
> + cma_release_remove(id_priv);
> + rdma_destroy_id(&id_priv->id);
> + return ret;
> + }
> +
> + cma_release_remove(id_priv);
> + return ret;
> +}
This looks different than the cma_ib_handler, and it makes me think that the
cma_ib_handler has a bug where it doesn't decrement dev_remove.
> +static int iw_conn_req_handler(struct iw_cm_id *cm_id,
> + struct iw_cm_event *iw_event)
> +{
> + struct rdma_cm_id* new_cm_id;
> + struct rdma_id_private *listen_id, *conn_id;
> + struct sockaddr_in* sin;
> + int ret;
> +
> + listen_id = cm_id->context;
> + atomic_inc(&listen_id->dev_remove);
> + if (!cma_comp(listen_id, CMA_LISTEN)) {
> + ret = -ECONNABORTED;
> + goto out;
> + }
> +
> + /* Create a new RDMA id the new IW CM ID */
> + new_cm_id = rdma_create_id(listen_id->id.event_handler,
> + listen_id->id.context);
> + if (!new_cm_id) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + conn_id = container_of(new_cm_id, struct rdma_id_private, id);
> + atomic_inc(&conn_id->dev_remove);
> + conn_id->state = CMA_CONNECT;
> +
> + /* New connection inherits device from parent */
> + cma_attach_to_dev(conn_id, listen_id->cma_dev);
cma_attach_to_dev doesn't provide synchronization around cma_dev->id_list.
Access to that list needs to be protected with 'mutex'. Other than that, I
think that this works fine.
> @@ -785,8 +950,9 @@
> goto out;
>
> list_add_tail(&id_priv->list, &listen_any_list);
> - list_for_each_entry(cma_dev, &dev_list, list)
> + list_for_each_entry(cma_dev, &dev_list, list) {
> cma_listen_on_dev(id_priv, cma_dev);
> + }
Please drop the extra braces.
> @@ -796,7 +962,6 @@
> {
> struct rdma_id_private *id_priv;
> int ret;
> -
> id_priv = container_of(id, struct rdma_id_private, id);
> if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
> return -EINVAL;
Please keep the blank line.
> @@ -890,6 +1058,30 @@
> +static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
> +{
> + enum rdma_cm_event_type event = RDMA_CM_EVENT_ROUTE_RESOLVED;
> + int rc;
Please use 'ret' instead of 'rc' to match the rest of the code.
> +
> + atomic_inc(&id_priv->dev_remove);
> +
> + if (!cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ROUTE_RESOLVED))
> + BUG_ON(1);
The device associated with the id could have been removed while the user was in
the process of making this call. We should simply fail the call here.
> +
> + rc = cma_notify_user(id_priv, event, 0, NULL, 0);
> + if (rc) {
> + cma_exch(id_priv, CMA_DESTROYING);
> + cma_release_remove(id_priv);
> + cma_deref_id(id_priv);
> + rdma_destroy_id(&id_priv->id);
> + return rc;
> + }
The callback needs to come from another thread other than the one that the user
called down with. Calling the user back in their own thread can make it
difficult for them to provide synchronization. You can use the rdma_wq that's
been exposed in ib_addr. Also, the user is likely to call this routine from
within a CMA callback (such as after ib_resolve_addr), so deadlock will occur if
you try to destroy the id.
> +
> + cma_release_remove(id_priv);
> + cma_deref_id(id_priv);
> + return rc;
> +}
> +
> int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
> {
> struct rdma_id_private *id_priv;
> @@ -952,20 +1147,133 @@
> +
> +/* Find the local interface with a route to the specified address and
> + * bind the CM ID to this interface's CMA device
> + */
> +static int cma_acquire_iw_dev(struct rdma_cm_id* id, struct sockaddr* addr)
> +{
> + int ret = -ENOENT;
> + struct cma_device* cma_dev;
> + struct rdma_id_private *id_priv;
> + struct sockaddr_in* sin;
> + struct rtable *rt = 0;
> + struct flowi fl;
> + struct net_device* netdev;
> + struct in_addr src_ip;
> + unsigned char* dev_addr;
> +
> + sin = (struct sockaddr_in*)addr;
> + if (sin->sin_family != AF_INET)
> + return -EINVAL;
> +
> + id_priv = container_of(id, struct rdma_id_private, id);
> +
> + /* If the address is local, use the device. If it is remote,
> + * look up a route to get the local address
> + */
> + netdev = ip_dev_find(sin->sin_addr.s_addr);
> + if (netdev) {
> + src_ip = sin->sin_addr;
> + dev_addr = netdev->dev_addr;
> + dev_put(netdev);
> + } else {
> + memset(&fl, 0, sizeof(fl));
> + fl.nl_u.ip4_u.daddr = sin->sin_addr.s_addr;
> + if (ip_route_output_key(&rt, &fl)) {
> + return -ENETUNREACH;
> + }
> + dev_addr = rt->idev->dev->dev_addr;
> + src_ip.s_addr = rt->rt_src;
> +
> + ip_rt_put(rt);
> + }
Can we push the above code into ib_addr?
> + down(&mutex);
> +
> + list_for_each_entry(cma_dev, &dev_list, list) {
> + if (memcmp(dev_addr,
> + &cma_dev->node_guid,
> + sizeof(cma_dev->node_guid)) == 0) {
> + /* If we find the device, then check if this
> + * is an iWARP device. If it is, then call the
> + * callback handler immediately because we
> + * already have the native address
> + */
I'm not following this comment. What callback is being invoked?
> + if (cma_dev->device->node_type == IB_NODE_RNIC) {
> + struct sockaddr_in* cm_sin;
> + /* Set our source address */
> + cm_sin = (struct sockaddr_in*)
> + &id_priv->id.route.addr.src_addr;
> + cm_sin->sin_family = AF_INET;
> + cm_sin->sin_addr.s_addr = src_ip.s_addr;
> +
> + /* Claim the device in the mutex */
> + cma_attach_to_dev(id_priv, cma_dev);
> + ret = 0;
> + break;
> + }
> + }
> + }
> + up(&mutex);
> +
> + return ret;
> +}
I'd like to see if it's possible to merge this call with cma_acquire_ib_dev()
and create a new routine, cma_acquire_dev() that can walk the device list and
check for both. Maybe if ib_addr returned the full hardware address, along with
a device type it might be possible. (Although the likelihood of a hardware
address collision seems near impossible.)
> int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
> struct sockaddr *dst_addr, int timeout_ms)
> {
> struct rdma_id_private *id_priv;
> - int ret;
> + int ret = 0;
>
> id_priv = container_of(id, struct rdma_id_private, id);
> if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_QUERY))
> return -EINVAL;
>
> atomic_inc(&id_priv->refcount);
> +
> id->route.addr.dst_addr = *dst_addr;
> - ret = ib_resolve_addr(src_addr, dst_addr, &id->route.addr.addr.ibaddr,
> - timeout_ms, addr_handler, id_priv);
> +
> + if (cma_acquire_iw_dev(id, dst_addr)==0) {
> +
> + enum rdma_cm_event_type event;
> +
> + cma_exch(id_priv, CMA_ADDR_RESOLVED);
> +
> + atomic_inc(&id_priv->dev_remove);
> +
> + event = RDMA_CM_EVENT_ADDR_RESOLVED;
> + if (cma_notify_user(id_priv, event, 0, NULL, 0)) {
> + cma_exch(id_priv, CMA_DESTROYING);
> + cma_deref_id(id_priv);
> + cma_release_remove(id_priv);
> + rdma_destroy_id(&id_priv->id);
> + return -EINVAL;
> + }
Similar to other comments. Callbacks should be scheduled to a separate thread.
The behavior is also slightly different. The IB code will return a source IP
address that may be used to connect to the destination address if one is not
given. This is needed in order to perform the reverse resolution on the remote
side.
> + cma_release_remove(id_priv);
> + cma_deref_id(id_priv);
> +
> + } else {
> +
> + ret = ib_resolve_addr(src_addr,
> + dst_addr, &id->route.addr.addr.ibaddr,
> + timeout_ms, addr_handler, id_priv);
We might be able to make this call generic by replacing ibaddr with source and
destination hardware addresses. Users that really want to know what 'gid'
they're on could be given functions that extract the gid and pkey from the
hardware addresses. I.e. struct ib_addr could be renamed to struct rdma_addr
and contain source and destination hardware addresses.
> @@ -980,10 +1288,13 @@
> int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
> {
> struct rdma_id_private *id_priv;
> + struct sockaddr_in* sin;
> struct ib_addr *ibaddr = &id->route.addr.addr.ibaddr;
> int ret;
>
> - if (addr->sa_family != AF_INET)
> + sin = (struct sockaddr_in*)addr;
> +
> + if (sin->sin_family != AF_INET)
> return -EINVAL;
Please remove this change. Right now, the check's only there because the code
does not fully support IPv6 addressing.
> int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
> {
> struct rdma_id_private *id_priv;
> int ret;
>
> id_priv = container_of(id, struct rdma_id_private, id);
> - if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
> + if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
Please undo extra white space at the end of the line.
> @@ -1190,7 +1551,6 @@
> {
> struct rdma_id_private *id_priv;
> int ret;
> -
Please add blank line back in.
> id_priv = container_of(id, struct rdma_id_private, id);
> if (!cma_comp(id_priv, CMA_CONNECT))
> return -EINVAL;
- Sean
More information about the general
mailing list