[ofa-general] [PATCHv4 06/10] ib_core: CMA device binding

Sean Hefty sean.hefty at intel.com
Mon Aug 10 12:31:19 PDT 2009


>@@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private
>*id_priv,
> {
> 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
> 	int ret;
>+	u16 pkey;
>+
>+        if (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num)
>==

nit: It looks like the if is indented by spaces, instead of a tab.

>+static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv)
>+{
>+	struct rdma_route *route = &id_priv->id.route;
>+	struct rdma_addr *addr = &route->addr;
>+	struct cma_work *work;
>+	int ret;
>+	struct sockaddr_in *src_addr = (struct sockaddr_in *)&route-
>>addr.src_addr;
>+	struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route-
>>addr.dst_addr;
>+
>+	if (src_addr->sin_family != dst_addr->sin_family)
>+		return -EINVAL;
>+
>+	work = kzalloc(sizeof *work, GFP_KERNEL);
>+	if (!work)
>+		return -ENOMEM;
>+
>+	work->id = id_priv;
>+	INIT_WORK(&work->work, cma_work_handler);
>+
>+	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
>+	if (!route->path_rec) {
>+		ret = -ENOMEM;
>+		goto err;
>+	}
>+
>+	route->num_paths = 1;
>+
>+	rdmaoe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr);
>+	rdmaoe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr);
>+
>+	route->path_rec->hop_limit = 2;
>+	route->path_rec->reversible = 1;
>+	route->path_rec->pkey = cpu_to_be16(0xffff);
>+	route->path_rec->mtu_selector = 2;
>+	route->path_rec->mtu = rdmaoe_get_mtu(addr->dev_addr.src_dev->mtu);
>+	route->path_rec->rate_selector = 2;
>+	route->path_rec->rate = rdmaoe_get_rate(addr->dev_addr.src_dev);
>+	route->path_rec->packet_life_time_selector = 2;
>+	route->path_rec->packet_life_time = RDMAOE_PACKET_LIFETIME;
>+
>+	work->old_state = CMA_ROUTE_QUERY;
>+	work->new_state = CMA_ROUTE_RESOLVED;
>+	if (!route->path_rec->mtu || !route->path_rec->rate) {
>+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
>+		work->event.status = -1;

Any reason not to fail immediately here and leave the id state unchanged?

>+	} else {
>+		work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
>+		work->event.status = 0;
>+	}
>+
>+	queue_work(cma_wq, &work->work);
>+
>+	return 0;
>+
>+err:
>+	kfree(work);
>+	return ret;
>+}
>+
> int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
> {
> 	struct rdma_id_private *id_priv;
>@@ -1744,6 +1824,9 @@ int rdma_resolve_route(struct rdma_cm_id *id, int
>timeout_ms)
> 	case RDMA_TRANSPORT_IWARP:
> 		ret = cma_resolve_iw_route(id_priv, timeout_ms);
> 		break;
>+	case RDMA_TRANSPORT_RDMAOE:
>+		ret = cma_resolve_rdmaoe_route(id_priv);
>+		break;
> 	default:
> 		ret = -ENOSYS;
> 		break;
>@@ -2419,6 +2502,7 @@ int rdma_connect(struct rdma_cm_id *id, struct
>rdma_conn_param *conn_param)
>
> 	switch (rdma_port_get_transport(id->device, id->port_num)) {
> 	case RDMA_TRANSPORT_IB:
>+	case RDMA_TRANSPORT_RDMAOE:
> 		if (cma_is_ud_ps(id->ps))
> 			ret = cma_resolve_ib_udp(id_priv, conn_param);
> 		else
>@@ -2532,6 +2616,7 @@ int rdma_accept(struct rdma_cm_id *id, struct
>rdma_conn_param *conn_param)
>
> 	switch (rdma_port_get_transport(id->device, id->port_num)) {
> 	case RDMA_TRANSPORT_IB:
>+	case RDMA_TRANSPORT_RDMAOE:
> 		if (cma_is_ud_ps(id->ps))
> 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
> 						conn_param->private_data,
>@@ -2593,6 +2678,7 @@ int rdma_reject(struct rdma_cm_id *id, const void
>*private_data,
>
> 	switch (rdma_port_get_transport(id->device, id->port_num)) {
> 	case RDMA_TRANSPORT_IB:
>+	case RDMA_TRANSPORT_RDMAOE:
> 		if (cma_is_ud_ps(id->ps))
> 			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
> 						private_data, private_data_len);
>@@ -2624,6 +2710,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
>
> 	switch (rdma_port_get_transport(id->device, id->port_num)) {
> 	case RDMA_TRANSPORT_IB:
>+	case RDMA_TRANSPORT_RDMAOE:
> 		ret = cma_modify_qp_err(id_priv);
> 		if (ret)
> 			goto out;
>@@ -2752,6 +2839,55 @@ static int cma_join_ib_multicast(struct rdma_id_private
>*id_priv,
> 	return 0;
> }
>
>+
>+static void rdmaoe_mcast_work_handler(struct work_struct *work)
>+{
>+	struct rdmaoe_mcast_work *mw = container_of(work, struct
>rdmaoe_mcast_work, work);
>+	struct cma_multicast *mc = mw->mc;
>+	struct ib_sa_multicast *m = mc->multicast.ib;
>+
>+	mc->multicast.ib->context = mc;
>+	cma_ib_mc_handler(0, m);
>+	kfree(m);
>+	kfree(mw);
>+}
>+
>+static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv,
>+				     struct cma_multicast *mc)
>+{
>+	struct rdmaoe_mcast_work *work;
>+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
>+
>+	if (cma_zero_addr((struct sockaddr *)&mc->addr))
>+		return -EINVAL;
>+
>+	work = kzalloc(sizeof *work, GFP_KERNEL);
>+	if (!work)
>+		return -ENOMEM;
>+
>+	mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
>+	if (!mc->multicast.ib) {
>+		kfree(work);
>+		return -ENOMEM;
>+	}

nit: I'd prefer to goto a common cleanup area to make it easier to add changes
in the future. 

>+
>+	cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, &mc->multicast.ib-
>>rec.mgid);
>+	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
>+	if (id_priv->id.ps == RDMA_PS_UDP)
>+		mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
>+	mc->multicast.ib->rec.rate = rdmaoe_get_rate(dev_addr->src_dev);
>+	mc->multicast.ib->rec.hop_limit = 1;
>+	mc->multicast.ib->rec.mtu = rdmaoe_get_mtu(dev_addr->src_dev->mtu);

Do we need to check the rate/mtu here, like in resolve route?  Or should we be
good since we could successfully resolve the route?  Actually, can we just read
the data from the path record that gets stored with the id?

>+	rdmaoe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
>+	work->id = id_priv;
>+	work->mc = mc;
>+	INIT_WORK(&work->work, rdmaoe_mcast_work_handler);
>+
>+	queue_work(cma_wq, &work->work);
>+
>+	return 0;
>+}
>+
> int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
> 			void *context)
> {
>@@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct
>sockaddr *addr,
> 	case RDMA_TRANSPORT_IB:
> 		ret = cma_join_ib_multicast(id_priv, mc);
> 		break;
>+	case RDMA_TRANSPORT_RDMAOE:
>+		ret = cma_rdmaoe_join_multicast(id_priv, mc);
>+		break;
> 	default:
> 		ret = -ENOSYS;
> 		break;
>@@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct
>sockaddr *addr,
> 		spin_unlock_irq(&id_priv->lock);
> 		kfree(mc);
> 	}
>+
> 	return ret;
> }
> EXPORT_SYMBOL(rdma_join_multicast);
>@@ -2813,7 +2953,9 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct
>sockaddr *addr)
> 				ib_detach_mcast(id->qp,
> 						&mc->multicast.ib->rec.mgid,
> 						mc->multicast.ib->rec.mlid);
>-			ib_sa_free_multicast(mc->multicast.ib);
>+			if (rdma_port_get_transport(id_priv->cma_dev->device,
>id_priv->id.port_num) ==
>+			    RDMA_TRANSPORT_IB)
>+				ib_sa_free_multicast(mc->multicast.ib);
> 			kref_put(&mc->mcref, release_mc);
> 			return;
> 		}
>diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
>index 24d9510..c7c9e92 100644
>--- a/drivers/infiniband/core/ucma.c
>+++ b/drivers/infiniband/core/ucma.c
>@@ -553,7 +553,8 @@ static ssize_t ucma_resolve_route(struct ucma_file *file,
> }
>
> static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
>-			       struct rdma_route *route)
>+			       struct rdma_route *route,
>+			       enum rdma_transport_type tt)
> {
> 	struct rdma_dev_addr *dev_addr;
>
>@@ -561,10 +562,17 @@ static void ucma_copy_ib_route(struct
>rdma_ucm_query_route_resp *resp,
> 	switch (route->num_paths) {
> 	case 0:
> 		dev_addr = &route->addr.dev_addr;
>-		ib_addr_get_dgid(dev_addr,
>-				 (union ib_gid *) &resp->ib_route[0].dgid);
>-		ib_addr_get_sgid(dev_addr,
>-				 (union ib_gid *) &resp->ib_route[0].sgid);
>+		if (tt == RDMA_TRANSPORT_IB) {
>+			ib_addr_get_dgid(dev_addr,
>+					 (union ib_gid *)
&resp->ib_route[0].dgid);
>+			ib_addr_get_sgid(dev_addr,
>+					 (union ib_gid *)
&resp->ib_route[0].sgid);
>+		} else {
>+			rdmaoe_mac_to_ll((union ib_gid *)
&resp->ib_route[0].dgid,
>+					 dev_addr->dst_dev_addr);
>+			rdmaoe_addr_get_sgid(dev_addr,
>+					 (union ib_gid *)
&resp->ib_route[0].sgid);
>+		}
> 		resp->ib_route[0].pkey =
cpu_to_be16(ib_addr_get_pkey(dev_addr));
> 		break;
> 	case 2:
>@@ -589,6 +597,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
> 	struct ucma_context *ctx;
> 	struct sockaddr *addr;
> 	int ret = 0;
>+	enum rdma_transport_type tt;
>
> 	if (out_len < sizeof(resp))
> 		return -ENOSPC;
>@@ -614,9 +623,11 @@ static ssize_t ucma_query_route(struct ucma_file *file,
>
> 	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
> 	resp.port_num = ctx->cm_id->port_num;
>-	switch (rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id-
>>port_num)) {
>+	tt = rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id->port_num);
>+	switch (tt) {
> 	case RDMA_TRANSPORT_IB:
>-		ucma_copy_ib_route(&resp, &ctx->cm_id->route);
>+	case RDMA_TRANSPORT_RDMAOE:
>+		ucma_copy_ib_route(&resp, &ctx->cm_id->route, tt);

It seems simpler to just add a new call ucma_copy_rdmaoe_route, rather than
merging those two transports into a single copy function that then branches
based on the transport.

> 		break;
> 	default:
> 		break;
>diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
>index 483057b..66a848e 100644
>--- a/include/rdma/ib_addr.h
>+++ b/include/rdma/ib_addr.h
>@@ -39,6 +39,8 @@
> #include <linux/netdevice.h>
> #include <linux/socket.h>
> #include <rdma/ib_verbs.h>
>+#include <linux/ethtool.h>
>+#include <rdma/ib_pack.h>
>
> struct rdma_addr_client {
> 	atomic_t refcount;
>@@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct rdma_dev_addr
>*dev_addr,
> 	memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid);
> }
>
>+static inline void rdmaoe_mac_to_ll(union ib_gid *gid, u8 *mac)
>+{
>+	memset(gid->raw, 0, 16);
>+	*((u32 *)gid->raw) = cpu_to_be32(0xfe800000);
>+	gid->raw[12] = 0xfe;
>+	gid->raw[11] = 0xff;
>+	memcpy(gid->raw + 13, mac + 3, 3);
>+	memcpy(gid->raw + 8, mac, 3);
>+	gid->raw[8] ^= 2;
>+}
>+
>+static inline void rdmaoe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
>+					union ib_gid *gid)
>+{
>+	rdmaoe_mac_to_ll(gid, dev_addr->src_dev_addr);
>+}
>+
>+static inline enum ib_mtu rdmaoe_get_mtu(int mtu)
>+{
>+	/*
>+	 * reduce IB headers from effective RDMAoE MTU. 28 stands for
>+	 * atomic header which is the biggest possible header after BTH
>+	 */
>+	mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
>+
>+	if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
>+		return IB_MTU_4096;
>+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
>+		return IB_MTU_2048;
>+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
>+		return IB_MTU_1024;
>+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
>+		return IB_MTU_512;
>+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
>+		return IB_MTU_256;
>+	else
>+		return 0;
>+}
>+
>+static inline int rdmaoe_get_rate(struct net_device *dev)
>+{
>+	struct ethtool_cmd cmd;
>+
>+	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
>+	    dev->ethtool_ops->get_settings(dev, &cmd))
>+		return IB_RATE_PORT_CURRENT;
>+
>+	if (cmd.speed >= 40000)
>+		return IB_RATE_40_GBPS;
>+	else if (cmd.speed >= 30000)
>+		return IB_RATE_30_GBPS;
>+	else if (cmd.speed >= 20000)
>+		return IB_RATE_20_GBPS;
>+	else if (cmd.speed >= 10000)
>+		return IB_RATE_10_GBPS;
>+	else
>+		return IB_RATE_PORT_CURRENT;
>+}
>+
>+static inline int rdma_link_local_addr(struct in6_addr *addr)
>+{
>+	if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
>+	    addr->s6_addr32[1] == 0)
>+		return 1;
>+	else
>+		return 0;
>+}

just replace the 'if' with 'return'

>+
>+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
>+{
>+	memcpy(mac, &addr->s6_addr[8], 3);
>+	memcpy(mac + 3, &addr->s6_addr[13], 3);
>+	mac[0] ^= 2;
>+}
>+
>+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
>+{
>+	return addr->s6_addr[0] == 0xff ? 1 : 0;
>+}
>+
>+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
>+{
>+	memset(mac, 0xff, 6);
>+}

I don't think we want all of these inline, in particular rdmaoe_mac_to_ll,
rdmaoe_get_mtu , rdmaoe_get_rate.




More information about the general mailing list