[ofa-general] [PATCHv4 06/10] ib_core: CMA device binding

Hal Rosenstock hal.rosenstock at gmail.com
Mon Aug 10 06:56:43 PDT 2009


On Wed, Aug 5, 2009 at 4:29 AM, Eli Cohen <eli at mellanox.co.il> wrote:

> Add support for RDMAoE device binding and IP --> GID resolution. Path
> resolving
> and multicast joining are implemented within cma.c by filling the responses
> and
> pushing the callbacks to the cma work queue. IP->GID resolution always
> yields
> IPv6 link-local addresses - remote GIDs are derived from the destination
> MAC
> address of the remote port. Multicast GIDs are always mapped to broadcast
> MAC
> (all FFs). Some helper functions are added to ib_addr.h.
>
> Signed-off-by: Eli Cohen <eli at mellanox.co.il>
> ---
>  drivers/infiniband/core/cma.c  |  150
> ++++++++++++++++++++++++++++++++++++++-
>  drivers/infiniband/core/ucma.c |   25 +++++--
>  include/rdma/ib_addr.h         |   87 +++++++++++++++++++++++
>  3 files changed, 251 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 866ff7f..8f5675b 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -58,6 +58,7 @@ MODULE_LICENSE("Dual BSD/GPL");
>  #define CMA_CM_RESPONSE_TIMEOUT 20
>  #define CMA_MAX_CM_RETRIES 15
>  #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
> +#define RDMAOE_PACKET_LIFETIME 18
>
>  static void cma_add_one(struct ib_device *device);
>  static void cma_remove_one(struct ib_device *device);
> @@ -174,6 +175,12 @@ struct cma_ndev_work {
>        struct rdma_cm_event    event;
>  };
>
> +struct rdmaoe_mcast_work {
> +       struct work_struct       work;
> +       struct rdma_id_private  *id;
> +       struct cma_multicast    *mc;
> +};
> +
>  union cma_ip_addr {
>        struct in6_addr ip6;
>        struct {
> @@ -348,6 +355,9 @@ static int cma_acquire_dev(struct rdma_id_private
> *id_priv)
>                        case RDMA_TRANSPORT_IWARP:
>                                iw_addr_get_sgid(dev_addr, &gid);
>                                break;
> +                       case RDMA_TRANSPORT_RDMAOE:
> +                               rdmaoe_addr_get_sgid(dev_addr, &gid);
> +                               break;
>                        default:
>                                return -ENODEV;
>                        }
> @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private
> *id_priv,
>  {
>        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
>        int ret;
> +       u16 pkey;
> +
> +        if (rdma_port_get_transport(id_priv->id.device,
> id_priv->id.port_num) ==
> +           RDMA_TRANSPORT_IB)
> +               pkey = ib_addr_get_pkey(dev_addr);
> +       else
> +               pkey = 0xffff;
>
>        ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
> -                                 ib_addr_get_pkey(dev_addr),
> -                                 &qp_attr->pkey_index);
> +                                 pkey, &qp_attr->pkey_index);
>        if (ret)
>                return ret;
>
> @@ -609,6 +625,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct
> ib_qp_attr *qp_attr,
>        id_priv = container_of(id, struct rdma_id_private, id);
>        switch (rdma_port_get_transport(id_priv->id.device,
> id_priv->id.port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
>                        ret = cma_ib_init_qp_attr(id_priv, qp_attr,
> qp_attr_mask);
>                else
> @@ -836,7 +853,9 @@ static void cma_leave_mc_groups(struct rdma_id_private
> *id_priv)
>                mc = container_of(id_priv->mc_list.next,
>                                  struct cma_multicast, list);
>                list_del(&mc->list);
> -               ib_sa_free_multicast(mc->multicast.ib);
> +               if (rdma_port_get_transport(id_priv->cma_dev->device,
> id_priv->id.port_num) ==
> +                   RDMA_TRANSPORT_IB)
> +                       ib_sa_free_multicast(mc->multicast.ib);
>                kref_put(&mc->mcref, release_mc);
>        }
>  }
> @@ -855,6 +874,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
>                mutex_unlock(&lock);
>                switch (rdma_port_get_transport(id_priv->id.device,
> id_priv->id.port_num)) {
>                case RDMA_TRANSPORT_IB:
> +               case RDMA_TRANSPORT_RDMAOE:
>                        if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
>                                ib_destroy_cm_id(id_priv->cm_id.ib);
>                        break;
> @@ -1512,6 +1532,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
>        if (id->device) {
>                switch (rdma_port_get_transport(id->device, id->port_num)) {
>                case RDMA_TRANSPORT_IB:
> +               case RDMA_TRANSPORT_RDMAOE:
>                        ret = cma_ib_listen(id_priv);
>                        if (ret)
>                                goto err;
> @@ -1727,6 +1748,65 @@ static int cma_resolve_iw_route(struct
> rdma_id_private *id_priv, int timeout_ms)
>        return 0;
>  }
>
> +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv)
> +{
> +       struct rdma_route *route = &id_priv->id.route;
> +       struct rdma_addr *addr = &route->addr;
> +       struct cma_work *work;
> +       int ret;
> +       struct sockaddr_in *src_addr = (struct sockaddr_in
> *)&route->addr.src_addr;
> +       struct sockaddr_in *dst_addr = (struct sockaddr_in
> *)&route->addr.dst_addr;
> +
> +       if (src_addr->sin_family != dst_addr->sin_family)
> +               return -EINVAL;
> +
> +       work = kzalloc(sizeof *work, GFP_KERNEL);
> +       if (!work)
> +               return -ENOMEM;
> +
> +       work->id = id_priv;
> +       INIT_WORK(&work->work, cma_work_handler);
> +
> +       route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
> +       if (!route->path_rec) {
> +               ret = -ENOMEM;
> +               goto err;
> +       }
> +
> +       route->num_paths = 1;
> +
> +       rdmaoe_mac_to_ll(&route->path_rec->sgid,
> addr->dev_addr.src_dev_addr);
> +       rdmaoe_mac_to_ll(&route->path_rec->dgid,
> addr->dev_addr.dst_dev_addr);
> +
> +       route->path_rec->hop_limit = 2;


Does HopLimit need to be 2? Isn't this all subnet local?


>
> +       route->path_rec->reversible = 1;
> +       route->path_rec->pkey = cpu_to_be16(0xffff);
> +       route->path_rec->mtu_selector = 2;
> +       route->path_rec->mtu = rdmaoe_get_mtu(addr->dev_addr.src_dev->mtu);
> +       route->path_rec->rate_selector = 2;
> +       route->path_rec->rate = rdmaoe_get_rate(addr->dev_addr.src_dev);
> +       route->path_rec->packet_life_time_selector = 2;
> +       route->path_rec->packet_life_time = RDMAOE_PACKET_LIFETIME;
> +
> +       work->old_state = CMA_ROUTE_QUERY;
> +       work->new_state = CMA_ROUTE_RESOLVED;
> +       if (!route->path_rec->mtu || !route->path_rec->rate) {
> +               work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
> +               work->event.status = -1;
> +       } else {
> +               work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
> +               work->event.status = 0;
> +       }
> +
> +       queue_work(cma_wq, &work->work);
> +
> +       return 0;
> +
> +err:
> +       kfree(work);
> +       return ret;
> +}
> +
>  int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
>  {
>        struct rdma_id_private *id_priv;
> @@ -1744,6 +1824,9 @@ int rdma_resolve_route(struct rdma_cm_id *id, int
> timeout_ms)
>        case RDMA_TRANSPORT_IWARP:
>                ret = cma_resolve_iw_route(id_priv, timeout_ms);
>                break;
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ret = cma_resolve_rdmaoe_route(id_priv);
> +               break;
>        default:
>                ret = -ENOSYS;
>                break;
> @@ -2419,6 +2502,7 @@ int rdma_connect(struct rdma_cm_id *id, struct
> rdma_conn_param *conn_param)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_resolve_ib_udp(id_priv, conn_param);
>                else
> @@ -2532,6 +2616,7 @@ int rdma_accept(struct rdma_cm_id *id, struct
> rdma_conn_param *conn_param)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
>                                                conn_param->private_data,
> @@ -2593,6 +2678,7 @@ int rdma_reject(struct rdma_cm_id *id, const void
> *private_data,
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
>                                                private_data,
> private_data_len);
> @@ -2624,6 +2710,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                ret = cma_modify_qp_err(id_priv);
>                if (ret)
>                        goto out;
> @@ -2752,6 +2839,55 @@ static int cma_join_ib_multicast(struct
> rdma_id_private *id_priv,
>        return 0;
>  }
>
> +
> +static void rdmaoe_mcast_work_handler(struct work_struct *work)
> +{
> +       struct rdmaoe_mcast_work *mw = container_of(work, struct
> rdmaoe_mcast_work, work);
> +       struct cma_multicast *mc = mw->mc;
> +       struct ib_sa_multicast *m = mc->multicast.ib;
> +
> +       mc->multicast.ib->context = mc;
> +       cma_ib_mc_handler(0, m);
> +       kfree(m);
> +       kfree(mw);
> +}
> +
> +static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv,
> +                                    struct cma_multicast *mc)
> +{
> +       struct rdmaoe_mcast_work *work;
> +       struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
> +
> +       if (cma_zero_addr((struct sockaddr *)&mc->addr))
> +               return -EINVAL;
> +
> +       work = kzalloc(sizeof *work, GFP_KERNEL);
> +       if (!work)
> +               return -ENOMEM;
> +
> +       mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast),
> GFP_KERNEL);
> +       if (!mc->multicast.ib) {
> +               kfree(work);
> +               return -ENOMEM;
> +       }
> +
> +       cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr,
> &mc->multicast.ib->rec.mgid);
> +       mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
> +       if (id_priv->id.ps == RDMA_PS_UDP)
> +               mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
> +       mc->multicast.ib->rec.rate = rdmaoe_get_rate(dev_addr->src_dev);
> +       mc->multicast.ib->rec.hop_limit = 1;


Similar to the unicast comment above, is HopLimit 1 needed for multicast?

-- Hal


>
> +       mc->multicast.ib->rec.mtu = rdmaoe_get_mtu(dev_addr->src_dev->mtu);
> +       rdmaoe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
> +       work->id = id_priv;
> +       work->mc = mc;
> +       INIT_WORK(&work->work, rdmaoe_mcast_work_handler);
> +
> +       queue_work(cma_wq, &work->work);
> +
> +       return 0;
> +}
> +
>  int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
>                        void *context)
>  {
> @@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct
> sockaddr *addr,
>        case RDMA_TRANSPORT_IB:
>                ret = cma_join_ib_multicast(id_priv, mc);
>                break;
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ret = cma_rdmaoe_join_multicast(id_priv, mc);
> +               break;
>        default:
>                ret = -ENOSYS;
>                break;
> @@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct
> sockaddr *addr,
>                spin_unlock_irq(&id_priv->lock);
>                kfree(mc);
>        }
> +
>        return ret;
>  }
>  EXPORT_SYMBOL(rdma_join_multicast);
> @@ -2813,7 +2953,9 @@ void rdma_leave_multicast(struct rdma_cm_id *id,
> struct sockaddr *addr)
>                                ib_detach_mcast(id->qp,
>                                                &mc->multicast.ib->rec.mgid,
>                                                mc->multicast.ib->rec.mlid);
> -                       ib_sa_free_multicast(mc->multicast.ib);
> +                       if
> (rdma_port_get_transport(id_priv->cma_dev->device, id_priv->id.port_num) ==
> +                           RDMA_TRANSPORT_IB)
> +                               ib_sa_free_multicast(mc->multicast.ib);
>                        kref_put(&mc->mcref, release_mc);
>                        return;
>                }
> diff --git a/drivers/infiniband/core/ucma.c
> b/drivers/infiniband/core/ucma.c
> index 24d9510..c7c9e92 100644
> --- a/drivers/infiniband/core/ucma.c
> +++ b/drivers/infiniband/core/ucma.c
> @@ -553,7 +553,8 @@ static ssize_t ucma_resolve_route(struct ucma_file
> *file,
>  }
>
>  static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
> -                              struct rdma_route *route)
> +                              struct rdma_route *route,
> +                              enum rdma_transport_type tt)
>  {
>        struct rdma_dev_addr *dev_addr;
>
> @@ -561,10 +562,17 @@ static void ucma_copy_ib_route(struct
> rdma_ucm_query_route_resp *resp,
>        switch (route->num_paths) {
>        case 0:
>                dev_addr = &route->addr.dev_addr;
> -               ib_addr_get_dgid(dev_addr,
> -                                (union ib_gid *) &resp->ib_route[0].dgid);
> -               ib_addr_get_sgid(dev_addr,
> -                                (union ib_gid *) &resp->ib_route[0].sgid);
> +               if (tt == RDMA_TRANSPORT_IB) {
> +                       ib_addr_get_dgid(dev_addr,
> +                                        (union ib_gid *)
> &resp->ib_route[0].dgid);
> +                       ib_addr_get_sgid(dev_addr,
> +                                        (union ib_gid *)
> &resp->ib_route[0].sgid);
> +               } else {
> +                       rdmaoe_mac_to_ll((union ib_gid *)
> &resp->ib_route[0].dgid,
> +                                        dev_addr->dst_dev_addr);
> +                       rdmaoe_addr_get_sgid(dev_addr,
> +                                        (union ib_gid *)
> &resp->ib_route[0].sgid);
> +               }
>                resp->ib_route[0].pkey =
> cpu_to_be16(ib_addr_get_pkey(dev_addr));
>                break;
>        case 2:
> @@ -589,6 +597,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
>        struct ucma_context *ctx;
>        struct sockaddr *addr;
>        int ret = 0;
> +       enum rdma_transport_type tt;
>
>        if (out_len < sizeof(resp))
>                return -ENOSPC;
> @@ -614,9 +623,11 @@ static ssize_t ucma_query_route(struct ucma_file
> *file,
>
>        resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
>        resp.port_num = ctx->cm_id->port_num;
> -       switch (rdma_port_get_transport(ctx->cm_id->device,
> ctx->cm_id->port_num)) {
> +       tt = rdma_port_get_transport(ctx->cm_id->device,
> ctx->cm_id->port_num);
> +       switch (tt) {
>        case RDMA_TRANSPORT_IB:
> -               ucma_copy_ib_route(&resp, &ctx->cm_id->route);
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ucma_copy_ib_route(&resp, &ctx->cm_id->route, tt);
>                break;
>        default:
>                break;
> diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
> index 483057b..66a848e 100644
> --- a/include/rdma/ib_addr.h
> +++ b/include/rdma/ib_addr.h
> @@ -39,6 +39,8 @@
>  #include <linux/netdevice.h>
>  #include <linux/socket.h>
>  #include <rdma/ib_verbs.h>
> +#include <linux/ethtool.h>
> +#include <rdma/ib_pack.h>
>
>  struct rdma_addr_client {
>        atomic_t refcount;
> @@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct
> rdma_dev_addr *dev_addr,
>        memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid);
>  }
>
> +static inline void rdmaoe_mac_to_ll(union ib_gid *gid, u8 *mac)
> +{
> +       memset(gid->raw, 0, 16);
> +       *((u32 *)gid->raw) = cpu_to_be32(0xfe800000);
> +       gid->raw[12] = 0xfe;
> +       gid->raw[11] = 0xff;
> +       memcpy(gid->raw + 13, mac + 3, 3);
> +       memcpy(gid->raw + 8, mac, 3);
> +       gid->raw[8] ^= 2;
> +}
> +
> +static inline void rdmaoe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
> +                                       union ib_gid *gid)
> +{
> +       rdmaoe_mac_to_ll(gid, dev_addr->src_dev_addr);
> +}
> +
> +static inline enum ib_mtu rdmaoe_get_mtu(int mtu)
> +{
> +       /*
> +        * reduce IB headers from effective RDMAoE MTU. 28 stands for
> +        * atomic header which is the biggest possible header after BTH
> +        */
> +       mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
> +
> +       if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
> +               return IB_MTU_4096;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
> +               return IB_MTU_2048;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
> +               return IB_MTU_1024;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
> +               return IB_MTU_512;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
> +               return IB_MTU_256;
> +       else
> +               return 0;
> +}
> +
> +static inline int rdmaoe_get_rate(struct net_device *dev)
> +{
> +       struct ethtool_cmd cmd;
> +
> +       if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
> +           dev->ethtool_ops->get_settings(dev, &cmd))
> +               return IB_RATE_PORT_CURRENT;
> +
> +       if (cmd.speed >= 40000)
> +               return IB_RATE_40_GBPS;
> +       else if (cmd.speed >= 30000)
> +               return IB_RATE_30_GBPS;
> +       else if (cmd.speed >= 20000)
> +               return IB_RATE_20_GBPS;
> +       else if (cmd.speed >= 10000)
> +               return IB_RATE_10_GBPS;
> +       else
> +               return IB_RATE_PORT_CURRENT;
> +}
> +
> +static inline int rdma_link_local_addr(struct in6_addr *addr)
> +{
> +       if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
> +           addr->s6_addr32[1] == 0)
> +               return 1;
> +       else
> +               return 0;
> +}
> +
> +static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
> +{
> +       memcpy(mac, &addr->s6_addr[8], 3);
> +       memcpy(mac + 3, &addr->s6_addr[13], 3);
> +       mac[0] ^= 2;
> +}
> +
> +static inline int rdma_is_multicast_addr(struct in6_addr *addr)
> +{
> +       return addr->s6_addr[0] == 0xff ? 1 : 0;
> +}
> +
> +static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
> +{
> +       memset(mac, 0xff, 6);
> +}
> +
>  #endif /* IB_ADDR_H */
> --
> 1.6.3.3
>
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit
> http://openib.org/mailman/listinfo/openib-general
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20090810/ca28b47a/attachment.html>


More information about the general mailing list