On Wed, Aug 5, 2009 at 4:29 AM, Eli Cohen <eli@mellanox.co.il> wrote:
> Add support for RDMAoE device binding and IP --> GID resolution. Path resolving
> and multicast joining are implemented within cma.c by filling the responses and
> pushing the callbacks to the cma work queue. IP->GID resolution always yield
> IPv6 link local addresses - remote GIDs are derived from the destination MAC
> address of the remote port. Multicast GIDs are always mapped to broadcast MAC
> (all FFs). Some helper functions are added to ib_addr.h.
>
> Signed-off-by: Eli Cohen <eli@mellanox.co.il>
> ---
>  drivers/infiniband/core/cma.c  |  150 ++++++++++++++++++++++++++++++++++++++-
>  drivers/infiniband/core/ucma.c |   25 +++++--
>  include/rdma/ib_addr.h         |   87 +++++++++++++++++++++++
>  3 files changed, 251 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index 866ff7f..8f5675b 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -58,6 +58,7 @@ MODULE_LICENSE("Dual BSD/GPL");
>  #define CMA_CM_RESPONSE_TIMEOUT 20
>  #define CMA_MAX_CM_RETRIES 15
>  #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
> +#define RDMAOE_PACKET_LIFETIME 18
>
>  static void cma_add_one(struct ib_device *device);
>  static void cma_remove_one(struct ib_device *device);
> @@ -174,6 +175,12 @@ struct cma_ndev_work {
>        struct rdma_cm_event    event;
>  };
>
> +struct rdmaoe_mcast_work {
> +       struct work_struct       work;
> +       struct rdma_id_private  *id;
> +       struct cma_multicast    *mc;
> +};
> +
>  union cma_ip_addr {
>        struct in6_addr ip6;
>        struct {
> @@ -348,6 +355,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv)
>                        case RDMA_TRANSPORT_IWARP:
>                                iw_addr_get_sgid(dev_addr, &gid);
>                                break;
> +                       case RDMA_TRANSPORT_RDMAOE:
> +                               rdmaoe_addr_get_sgid(dev_addr, &gid);
> +                               break;
>                        default:
>                                return -ENODEV;
>                        }
> @@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
>  {
>        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
>        int ret;
> +       u16 pkey;
> +
> +        if (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num) ==
> +           RDMA_TRANSPORT_IB)
> +               pkey = ib_addr_get_pkey(dev_addr);
> +       else
> +               pkey = 0xffff;
>
>        ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
> -                                 ib_addr_get_pkey(dev_addr),
> -                                 &qp_attr->pkey_index);
> +                                 pkey, &qp_attr->pkey_index);
>        if (ret)
>                return ret;
>
> @@ -609,6 +625,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
>        id_priv = container_of(id, struct rdma_id_private, id);
>        switch (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
>                        ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
>                else
> @@ -836,7 +853,9 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
>                mc = container_of(id_priv->mc_list.next,
>                                  struct cma_multicast, list);
>                list_del(&mc->list);
> -               ib_sa_free_multicast(mc->multicast.ib);
> +               if (rdma_port_get_transport(id_priv->cma_dev->device, id_priv->id.port_num) ==
> +                   RDMA_TRANSPORT_IB)
> +                       ib_sa_free_multicast(mc->multicast.ib);
>                kref_put(&mc->mcref, release_mc);
>        }
>  }
> @@ -855,6 +874,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
>                mutex_unlock(&lock);
>                switch (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num)) {
>                case RDMA_TRANSPORT_IB:
> +               case RDMA_TRANSPORT_RDMAOE:
>                        if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
>                                ib_destroy_cm_id(id_priv->cm_id.ib);
>                        break;
> @@ -1512,6 +1532,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
>        if (id->device) {
>                switch (rdma_port_get_transport(id->device, id->port_num)) {
>                case RDMA_TRANSPORT_IB:
> +               case RDMA_TRANSPORT_RDMAOE:
>                        ret = cma_ib_listen(id_priv);
>                        if (ret)
>                                goto err;
> @@ -1727,6 +1748,65 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
>        return 0;
>  }
>
> +static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv)
> +{
> +       struct rdma_route *route = &id_priv->id.route;
> +       struct rdma_addr *addr = &route->addr;
> +       struct cma_work *work;
> +       int ret;
> +       struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
> +       struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
> +
> +       if (src_addr->sin_family != dst_addr->sin_family)
> +               return -EINVAL;
> +
> +       work = kzalloc(sizeof *work, GFP_KERNEL);
> +       if (!work)
> +               return -ENOMEM;
> +
> +       work->id = id_priv;
> +       INIT_WORK(&work->work, cma_work_handler);
> +
> +       route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
> +       if (!route->path_rec) {
> +               ret = -ENOMEM;
> +               goto err;
> +       }
> +
> +       route->num_paths = 1;
> +
> +       rdmaoe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr);
> +       rdmaoe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr);
> +
> +       route->path_rec->hop_limit = 2;

Does HopLimit need to be 2 ? Isn't this all subnet local ?

> +       route->path_rec->reversible = 1;
> +       route->path_rec->pkey = cpu_to_be16(0xffff);
> +       route->path_rec->mtu_selector = 2;
> +       route->path_rec->mtu = rdmaoe_get_mtu(addr->dev_addr.src_dev->mtu);
> +       route->path_rec->rate_selector = 2;
> +       route->path_rec->rate = rdmaoe_get_rate(addr->dev_addr.src_dev);
> +       route->path_rec->packet_life_time_selector = 2;
> +       route->path_rec->packet_life_time = RDMAOE_PACKET_LIFETIME;
> +
> +       work->old_state = CMA_ROUTE_QUERY;
> +       work->new_state = CMA_ROUTE_RESOLVED;
> +       if (!route->path_rec->mtu || !route->path_rec->rate) {
> +               work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
> +               work->event.status = -1;
> +       } else {
> +               work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
> +               work->event.status = 0;
> +       }
> +
> +       queue_work(cma_wq, &work->work);
> +
> +       return 0;
> +
> +err:
> +       kfree(work);
> +       return ret;
> +}
> +
>  int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
>  {
>        struct rdma_id_private *id_priv;
> @@ -1744,6 +1824,9 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
>        case RDMA_TRANSPORT_IWARP:
>                ret = cma_resolve_iw_route(id_priv, timeout_ms);
>                break;
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ret = cma_resolve_rdmaoe_route(id_priv);
> +               break;
>        default:
>                ret = -ENOSYS;
>                break;
> @@ -2419,6 +2502,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_resolve_ib_udp(id_priv, conn_param);
>                else
> @@ -2532,6 +2616,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
>                                                conn_param->private_data,
> @@ -2593,6 +2678,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                if (cma_is_ud_ps(id->ps))
>                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
>                                                private_data, private_data_len);
> @@ -2624,6 +2710,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
>        switch (rdma_port_get_transport(id->device, id->port_num)) {
>        case RDMA_TRANSPORT_IB:
> +       case RDMA_TRANSPORT_RDMAOE:
>                ret = cma_modify_qp_err(id_priv);
>                if (ret)
>                        goto out;
> @@ -2752,6 +2839,55 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
>        return 0;
>  }
>
> +
> +static void rdmaoe_mcast_work_handler(struct work_struct *work)
> +{
> +       struct rdmaoe_mcast_work *mw = container_of(work, struct rdmaoe_mcast_work, work);
> +       struct cma_multicast *mc = mw->mc;
> +       struct ib_sa_multicast *m = mc->multicast.ib;
> +
> +       mc->multicast.ib->context = mc;
> +       cma_ib_mc_handler(0, m);
> +       kfree(m);
> +       kfree(mw);
> +}
> +
> +static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv,
> +                                    struct cma_multicast *mc)
> +{
> +       struct rdmaoe_mcast_work *work;
> +       struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
> +
> +       if (cma_zero_addr((struct sockaddr *)&mc->addr))
> +               return -EINVAL;
> +
> +       work = kzalloc(sizeof *work, GFP_KERNEL);
> +       if (!work)
> +               return -ENOMEM;
> +
> +       mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
> +       if (!mc->multicast.ib) {
> +               kfree(work);
> +               return -ENOMEM;
> +       }
> +
> +       cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, &mc->multicast.ib->rec.mgid);
> +       mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
> +       if (id_priv->id.ps == RDMA_PS_UDP)
> +               mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
> +       mc->multicast.ib->rec.rate = rdmaoe_get_rate(dev_addr->src_dev);
> +       mc->multicast.ib->rec.hop_limit = 1;

Similar to the unicast comment above, is HopLimit 1 needed for multicast ?

-- Hal

> +       mc->multicast.ib->rec.mtu = rdmaoe_get_mtu(dev_addr->src_dev->mtu);
> +       rdmaoe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
> +       work->id = id_priv;
> +       work->mc = mc;
> +       INIT_WORK(&work->work, rdmaoe_mcast_work_handler);
> +
> +       queue_work(cma_wq, &work->work);
> +
> +       return 0;
> +}
> +
>  int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
>                        void *context)
>  {
> @@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
>        case RDMA_TRANSPORT_IB:
>                ret = cma_join_ib_multicast(id_priv, mc);
>                break;
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ret = cma_rdmaoe_join_multicast(id_priv, mc);
> +               break;
>        default:
>                ret = -ENOSYS;
>                break;
> @@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
>                spin_unlock_irq(&id_priv->lock);
>                kfree(mc);
>        }
> +
>        return ret;
>  }
>  EXPORT_SYMBOL(rdma_join_multicast);
> @@ -2813,7 +2953,9 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
>                                ib_detach_mcast(id->qp,
>                                                &mc->multicast.ib->rec.mgid,
>                                                mc->multicast.ib->rec.mlid);
> -                       ib_sa_free_multicast(mc->multicast.ib);
> +                       if (rdma_port_get_transport(id_priv->cma_dev->device, id_priv->id.port_num) ==
> +                           RDMA_TRANSPORT_IB)
> +                               ib_sa_free_multicast(mc->multicast.ib);
>                        kref_put(&mc->mcref, release_mc);
>                        return;
>                }
> diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
> index 24d9510..c7c9e92 100644
> --- a/drivers/infiniband/core/ucma.c
> +++ b/drivers/infiniband/core/ucma.c
> @@ -553,7 +553,8 @@ static ssize_t ucma_resolve_route(struct ucma_file *file,
>  }
>
>  static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
> -                              struct rdma_route *route)
> +                              struct rdma_route *route,
> +                              enum rdma_transport_type tt)
>  {
>        struct rdma_dev_addr *dev_addr;
>
> @@ -561,10 +562,17 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
>        switch (route->num_paths) {
>        case 0:
>                dev_addr = &route->addr.dev_addr;
> -               ib_addr_get_dgid(dev_addr,
> -                                (union ib_gid *) &resp->ib_route[0].dgid);
> -               ib_addr_get_sgid(dev_addr,
> -                                (union ib_gid *) &resp->ib_route[0].sgid);
> +               if (tt == RDMA_TRANSPORT_IB) {
> +                       ib_addr_get_dgid(dev_addr,
> +                                        (union ib_gid *) &resp->ib_route[0].dgid);
> +                       ib_addr_get_sgid(dev_addr,
> +                                        (union ib_gid *) &resp->ib_route[0].sgid);
> +               } else {
> +                       rdmaoe_mac_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
> +                                        dev_addr->dst_dev_addr);
> +                       rdmaoe_addr_get_sgid(dev_addr,
> +                                        (union ib_gid *) &resp->ib_route[0].sgid);
> +               }
>                resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
>                break;
>        case 2:
> @@ -589,6 +597,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
>        struct ucma_context *ctx;
>        struct sockaddr *addr;
>        int ret = 0;
> +       enum rdma_transport_type tt;
>
>        if (out_len < sizeof(resp))
>                return -ENOSPC;
> @@ -614,9 +623,11 @@ static ssize_t ucma_query_route(struct ucma_file *file,
>
>        resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
>        resp.port_num = ctx->cm_id->port_num;
> -       switch (rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id->port_num)) {
> +       tt = rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id->port_num);
> +       switch (tt) {
>        case RDMA_TRANSPORT_IB:
> -               ucma_copy_ib_route(&resp, &ctx->cm_id->route);
> +       case RDMA_TRANSPORT_RDMAOE:
> +               ucma_copy_ib_route(&resp, &ctx->cm_id->route, tt);
>                break;
>        default:
>                break;
> diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
> index 483057b..66a848e 100644
> --- a/include/rdma/ib_addr.h
> +++ b/include/rdma/ib_addr.h
> @@ -39,6 +39,8 @@
>  #include <linux/netdevice.h>
>  #include <linux/socket.h>
>  #include <rdma/ib_verbs.h>
> +#include <linux/ethtool.h>
> +#include <rdma/ib_pack.h>
>
>  struct rdma_addr_client {
>        atomic_t refcount;
> @@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr,
>        memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid);
>  }
>
> +static inline void rdmaoe_mac_to_ll(union ib_gid *gid, u8 *mac)
> +{
> +       memset(gid->raw, 0, 16);
> +       *((u32 *)gid->raw) = cpu_to_be32(0xfe800000);
> +       gid->raw[12] = 0xfe;
> +       gid->raw[11] = 0xff;
> +       memcpy(gid->raw + 13, mac + 3, 3);
> +       memcpy(gid->raw + 8, mac, 3);
> +       gid->raw[8] ^= 2;
> +}
> +
> +static inline void rdmaoe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
> +                                       union ib_gid *gid)
> +{
> +       rdmaoe_mac_to_ll(gid, dev_addr->src_dev_addr);
> +}
> +
> +static inline enum ib_mtu rdmaoe_get_mtu(int mtu)
> +{
> +       /*
> +        * reduce IB headers from effective RDMAoE MTU. 28 stands for
> +        * atomic header which is the biggest possible header after BTH
> +        */
> +       mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
> +
> +       if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
> +               return IB_MTU_4096;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
> +               return IB_MTU_2048;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
> +               return IB_MTU_1024;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
> +               return IB_MTU_512;
> +       else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
> +               return IB_MTU_256;
> +       else
> +               return 0;
> +}
> +
> +static inline int rdmaoe_get_rate(struct net_device *dev)
> +{
> +       struct ethtool_cmd cmd;
> +
> +       if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
> +           dev->ethtool_ops->get_settings(dev, &cmd))
> +               return IB_RATE_PORT_CURRENT;
> +
> +       if (cmd.speed >= 40000)
> +               return IB_RATE_40_GBPS;
> +       else if (cmd.speed >= 30000)
> +               return IB_RATE_30_GBPS;
> +       else if (cmd.speed >= 20000)
> +               return IB_RATE_20_GBPS;
> +       else if (cmd.speed >= 10000)
> +               return IB_RATE_10_GBPS;
> +       else
> +               return IB_RATE_PORT_CURRENT;
> +}
> +
> +static inline int rdma_link_local_addr(struct in6_addr *addr)
> +{
> +       if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
> +           addr->s6_addr32[1] == 0)
> +               return 1;
> +       else
> +               return 0;
> +}
> +
> +static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
> +{
> +       memcpy(mac, &addr->s6_addr[8], 3);
> +       memcpy(mac + 3, &addr->s6_addr[13], 3);
> +       mac[0] ^= 2;
> +}
> +
> +static inline int rdma_is_multicast_addr(struct in6_addr *addr)
> +{
> +       return addr->s6_addr[0] == 0xff ? 1 : 0;
> +}
> +
> +static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
> +{
> +       memset(mac, 0xff, 6);
> +}
> +
>  #endif /* IB_ADDR_H */
> --
> 1.6.3.3
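
For reference, a minimal userspace sketch (not part of the patch) of the EUI-64 style
MAC -> IPv6 link-local GID mapping that rdmaoe_mac_to_ll() above performs; the helper
name mac_to_ll_gid() and the sample MAC are illustrative only:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Same steps as the patch's rdmaoe_mac_to_ll(): fe80::/64 prefix, the OUI with
 * the universal/local bit flipped, the ff:fe filler, then the NIC-specific bytes. */
static void mac_to_ll_gid(uint8_t gid[16], const uint8_t mac[6])
{
	uint32_t prefix = htonl(0xfe800000);

	memset(gid, 0, 16);
	memcpy(gid, &prefix, 4);		/* fe80:0000:... */
	memcpy(gid + 8, mac, 3);		/* OUI */
	gid[8] ^= 2;				/* flip universal/local bit */
	gid[11] = 0xff;				/* EUI-64 filler */
	gid[12] = 0xfe;
	memcpy(gid + 13, mac + 3, 3);		/* device-specific bytes */
}

int main(void)
{
	uint8_t mac[6] = { 0x00, 0x02, 0xc9, 0xab, 0xcd, 0xef };	/* sample MAC */
	uint8_t gid[16];
	int i;

	mac_to_ll_gid(gid, mac);
	for (i = 0; i < 16; i += 2)
		printf("%02x%02x%c", gid[i], gid[i + 1], i == 14 ? '\n' : ':');
	/* prints fe80:0000:0000:0000:0202:c9ff:feab:cdef */
	return 0;
}

Multicast GIDs, per the patch, do not go through this mapping: rdma_get_mcast_mac()
simply maps any multicast GID to the all-FF broadcast MAC.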