[ofa-general] [PATCHv4 06/10] ib_core: CMA device binding
Eli Cohen
eli at mellanox.co.il
Wed Aug 5 01:29:29 PDT 2009
Add support for RDMAoE device binding and IP --> GID resolution. Path resolving
and multicast joining are implemented within cma.c by filling the responses and
pushing the callbacks to the cma work queue. IP->GID resolution always yield
IPv6 link local addresses - remote GIDs are derived from the destination MAC
address of the remote port. Multicast GIDs are always mapped to broadcast MAC
(all FFs). Some helper functions are added to ib_addr.h.
Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
drivers/infiniband/core/cma.c | 150 ++++++++++++++++++++++++++++++++++++++-
drivers/infiniband/core/ucma.c | 25 +++++--
include/rdma/ib_addr.h | 87 +++++++++++++++++++++++
3 files changed, 251 insertions(+), 11 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 866ff7f..8f5675b 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -58,6 +58,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define RDMAOE_PACKET_LIFETIME 18
static void cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device);
@@ -174,6 +175,12 @@ struct cma_ndev_work {
struct rdma_cm_event event;
};
+struct rdmaoe_mcast_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct cma_multicast *mc;
+};
+
union cma_ip_addr {
struct in6_addr ip6;
struct {
@@ -348,6 +355,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv)
case RDMA_TRANSPORT_IWARP:
iw_addr_get_sgid(dev_addr, &gid);
break;
+ case RDMA_TRANSPORT_RDMAOE:
+ rdmaoe_addr_get_sgid(dev_addr, &gid);
+ break;
default:
return -ENODEV;
}
@@ -576,10 +586,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
int ret;
+ u16 pkey;
+
+ if (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num) ==
+ RDMA_TRANSPORT_IB)
+ pkey = ib_addr_get_pkey(dev_addr);
+ else
+ pkey = 0xffff;
ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
- ib_addr_get_pkey(dev_addr),
- &qp_attr->pkey_index);
+ pkey, &qp_attr->pkey_index);
if (ret)
return ret;
@@ -609,6 +625,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
id_priv = container_of(id, struct rdma_id_private, id);
switch (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
else
@@ -836,7 +853,9 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
mc = container_of(id_priv->mc_list.next,
struct cma_multicast, list);
list_del(&mc->list);
- ib_sa_free_multicast(mc->multicast.ib);
+ if (rdma_port_get_transport(id_priv->cma_dev->device, id_priv->id.port_num) ==
+ RDMA_TRANSPORT_IB)
+ ib_sa_free_multicast(mc->multicast.ib);
kref_put(&mc->mcref, release_mc);
}
}
@@ -855,6 +874,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
mutex_unlock(&lock);
switch (rdma_port_get_transport(id_priv->id.device, id_priv->id.port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
ib_destroy_cm_id(id_priv->cm_id.ib);
break;
@@ -1512,6 +1532,7 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
if (id->device) {
switch (rdma_port_get_transport(id->device, id->port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
ret = cma_ib_listen(id_priv);
if (ret)
goto err;
@@ -1727,6 +1748,65 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
return 0;
}
+static int cma_resolve_rdmaoe_route(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct rdma_addr *addr = &route->addr;
+ struct cma_work *work;
+ int ret;
+ struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
+ struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
+
+ if (src_addr->sin_family != dst_addr->sin_family)
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+
+ route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ route->num_paths = 1;
+
+ rdmaoe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr);
+ rdmaoe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr);
+
+ route->path_rec->hop_limit = 2;
+ route->path_rec->reversible = 1;
+ route->path_rec->pkey = cpu_to_be16(0xffff);
+ route->path_rec->mtu_selector = 2;
+ route->path_rec->mtu = rdmaoe_get_mtu(addr->dev_addr.src_dev->mtu);
+ route->path_rec->rate_selector = 2;
+ route->path_rec->rate = rdmaoe_get_rate(addr->dev_addr.src_dev);
+ route->path_rec->packet_life_time_selector = 2;
+ route->path_rec->packet_life_time = RDMAOE_PACKET_LIFETIME;
+
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ if (!route->path_rec->mtu || !route->path_rec->rate) {
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = -1;
+ } else {
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ work->event.status = 0;
+ }
+
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+err:
+ kfree(work);
+ return ret;
+}
+
int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
{
struct rdma_id_private *id_priv;
@@ -1744,6 +1824,9 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
case RDMA_TRANSPORT_IWARP:
ret = cma_resolve_iw_route(id_priv, timeout_ms);
break;
+ case RDMA_TRANSPORT_RDMAOE:
+ ret = cma_resolve_rdmaoe_route(id_priv);
+ break;
default:
ret = -ENOSYS;
break;
@@ -2419,6 +2502,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
switch (rdma_port_get_transport(id->device, id->port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
if (cma_is_ud_ps(id->ps))
ret = cma_resolve_ib_udp(id_priv, conn_param);
else
@@ -2532,6 +2616,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
switch (rdma_port_get_transport(id->device, id->port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
if (cma_is_ud_ps(id->ps))
ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
conn_param->private_data,
@@ -2593,6 +2678,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
switch (rdma_port_get_transport(id->device, id->port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
if (cma_is_ud_ps(id->ps))
ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
private_data, private_data_len);
@@ -2624,6 +2710,7 @@ int rdma_disconnect(struct rdma_cm_id *id)
switch (rdma_port_get_transport(id->device, id->port_num)) {
case RDMA_TRANSPORT_IB:
+ case RDMA_TRANSPORT_RDMAOE:
ret = cma_modify_qp_err(id_priv);
if (ret)
goto out;
@@ -2752,6 +2839,55 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
return 0;
}
+
+static void rdmaoe_mcast_work_handler(struct work_struct *work)
+{
+ struct rdmaoe_mcast_work *mw = container_of(work, struct rdmaoe_mcast_work, work);
+ struct cma_multicast *mc = mw->mc;
+ struct ib_sa_multicast *m = mc->multicast.ib;
+
+ mc->multicast.ib->context = mc;
+ cma_ib_mc_handler(0, m);
+ kfree(m);
+ kfree(mw);
+}
+
+static int cma_rdmaoe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct rdmaoe_mcast_work *work;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+
+ if (cma_zero_addr((struct sockaddr *)&mc->addr))
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+ if (!mc->multicast.ib) {
+ kfree(work);
+ return -ENOMEM;
+ }
+
+ cma_set_mgid(id_priv, (struct sockaddr *)&mc->addr, &mc->multicast.ib->rec.mgid);
+ mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+ mc->multicast.ib->rec.rate = rdmaoe_get_rate(dev_addr->src_dev);
+ mc->multicast.ib->rec.hop_limit = 1;
+ mc->multicast.ib->rec.mtu = rdmaoe_get_mtu(dev_addr->src_dev->mtu);
+ rdmaoe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
+ work->id = id_priv;
+ work->mc = mc;
+ INIT_WORK(&work->work, rdmaoe_mcast_work_handler);
+
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+}
+
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
void *context)
{
@@ -2782,6 +2918,9 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
case RDMA_TRANSPORT_IB:
ret = cma_join_ib_multicast(id_priv, mc);
break;
+ case RDMA_TRANSPORT_RDMAOE:
+ ret = cma_rdmaoe_join_multicast(id_priv, mc);
+ break;
default:
ret = -ENOSYS;
break;
@@ -2793,6 +2932,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
spin_unlock_irq(&id_priv->lock);
kfree(mc);
}
+
return ret;
}
EXPORT_SYMBOL(rdma_join_multicast);
@@ -2813,7 +2953,9 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
ib_detach_mcast(id->qp,
&mc->multicast.ib->rec.mgid,
mc->multicast.ib->rec.mlid);
- ib_sa_free_multicast(mc->multicast.ib);
+ if (rdma_port_get_transport(id_priv->cma_dev->device, id_priv->id.port_num) ==
+ RDMA_TRANSPORT_IB)
+ ib_sa_free_multicast(mc->multicast.ib);
kref_put(&mc->mcref, release_mc);
return;
}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 24d9510..c7c9e92 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -553,7 +553,8 @@ static ssize_t ucma_resolve_route(struct ucma_file *file,
}
static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
- struct rdma_route *route)
+ struct rdma_route *route,
+ enum rdma_transport_type tt)
{
struct rdma_dev_addr *dev_addr;
@@ -561,10 +562,17 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
switch (route->num_paths) {
case 0:
dev_addr = &route->addr.dev_addr;
- ib_addr_get_dgid(dev_addr,
- (union ib_gid *) &resp->ib_route[0].dgid);
- ib_addr_get_sgid(dev_addr,
- (union ib_gid *) &resp->ib_route[0].sgid);
+ if (tt == RDMA_TRANSPORT_IB) {
+ ib_addr_get_dgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].dgid);
+ ib_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ } else {
+ rdmaoe_mac_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
+ dev_addr->dst_dev_addr);
+ rdmaoe_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ }
resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
break;
case 2:
@@ -589,6 +597,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
struct ucma_context *ctx;
struct sockaddr *addr;
int ret = 0;
+ enum rdma_transport_type tt;
if (out_len < sizeof(resp))
return -ENOSPC;
@@ -614,9 +623,11 @@ static ssize_t ucma_query_route(struct ucma_file *file,
resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
resp.port_num = ctx->cm_id->port_num;
- switch (rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id->port_num)) {
+ tt = rdma_port_get_transport(ctx->cm_id->device, ctx->cm_id->port_num);
+ switch (tt) {
case RDMA_TRANSPORT_IB:
- ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ case RDMA_TRANSPORT_RDMAOE:
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route, tt);
break;
default:
break;
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 483057b..66a848e 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -39,6 +39,8 @@
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <rdma/ib_verbs.h>
+#include <linux/ethtool.h>
+#include <rdma/ib_pack.h>
struct rdma_addr_client {
atomic_t refcount;
@@ -157,4 +159,89 @@ static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr,
memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid);
}
+static inline void rdmaoe_mac_to_ll(union ib_gid *gid, u8 *mac)
+{
+ memset(gid->raw, 0, 16);
+ *((u32 *)gid->raw) = cpu_to_be32(0xfe800000);
+ gid->raw[12] = 0xfe;
+ gid->raw[11] = 0xff;
+ memcpy(gid->raw + 13, mac + 3, 3);
+ memcpy(gid->raw + 8, mac, 3);
+ gid->raw[8] ^= 2;
+}
+
+static inline void rdmaoe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ rdmaoe_mac_to_ll(gid, dev_addr->src_dev_addr);
+}
+
+static inline enum ib_mtu rdmaoe_get_mtu(int mtu)
+{
+ /*
+ * reduce IB headers from effective RDMAoE MTU. 28 stands for
+ * atomic header which is the biggest possible header after BTH
+ */
+ mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
+
+ if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
+ return IB_MTU_4096;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
+ return IB_MTU_2048;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
+ return IB_MTU_1024;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
+ return IB_MTU_512;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
+ return IB_MTU_256;
+ else
+ return 0;
+}
+
+static inline int rdmaoe_get_rate(struct net_device *dev)
+{
+ struct ethtool_cmd cmd;
+
+ if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
+ dev->ethtool_ops->get_settings(dev, &cmd))
+ return IB_RATE_PORT_CURRENT;
+
+ if (cmd.speed >= 40000)
+ return IB_RATE_40_GBPS;
+ else if (cmd.speed >= 30000)
+ return IB_RATE_30_GBPS;
+ else if (cmd.speed >= 20000)
+ return IB_RATE_20_GBPS;
+ else if (cmd.speed >= 10000)
+ return IB_RATE_10_GBPS;
+ else
+ return IB_RATE_PORT_CURRENT;
+}
+
+static inline int rdma_link_local_addr(struct in6_addr *addr)
+{
+ if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
+ addr->s6_addr32[1] == 0)
+ return 1;
+ else
+ return 0;
+}
+
+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
+{
+ memcpy(mac, &addr->s6_addr[8], 3);
+ memcpy(mac + 3, &addr->s6_addr[13], 3);
+ mac[0] ^= 2;
+}
+
+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
+{
+ return addr->s6_addr[0] == 0xff ? 1 : 0;
+}
+
+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
+{
+ memset(mac, 0xff, 6);
+}
+
#endif /* IB_ADDR_H */
--
1.6.3.3
More information about the general
mailing list