[openib-general] Re: [PATCH] [ib_addr] generalize address to RDMA device translation
Tom Tucker
tom at opengridcomputing.com
Tue Jan 3 11:35:22 PST 2006
Sean:
This is a good start towards generalizing the IP address translation
service.
I think there is quite a bit of "hair" in this area that is "Part 2" of
the iWARP integration -- after the integration of the current stuff into
the trunk. I was going to introduce some of this "hair" at the face to
face in Sonoma, but we should probably get it going now.
There are three areas that need attention:
- ARP resolution
- ROUTE changes
- Path MTU changes
ARP Resolve
The iWARP side needs to be able to resolve an IP address to an Ethernet
address. Today this is not done for iWARP and it works because the
AMSO1100 does this itself in the hardware. Other iWARP devices probably
don't. This means that the logic in ib_at needs to be extended on the
iWARP side to call neigh_event_send (instead of arp_send) to resolve an
IP to an Ethernet address. The current method of calling arp_send
directly and "sniffing" for arp replies is probably not the best way to
go long term. It would be better to register for neighbor update events
(new mechanism) and be notified when the neighbor entry gets resolved.
This is better for two reasons: 1) it doesn't duplicate code already in
Linux, and 2) unlike IB, Ethernet MAC addresses may change for the next
hop while the connection is still active. The provider needs to know
this so it's hardware ARP tables can be updated.
ROUTE Changes
Two obvious cases, 1) the next hop changes due to normal network least-
cost routing, and 2) the user changes a route manually. Both events
would require the iWARP provider to be notified (via an event again) and
update its hardware
PathMTU
The new route to the remote peer has a hop with a smaller MTU than we're
currently using. Ouch! All my packets are going to be dropped until I
reduce my path MTU. The provider can't know unless he is either
filtering all ICMP traffic himself ("evil") or is notified via an event
("nice").
So all this said, my little brain had imagined this logic going in and
around the ib_at module in a wonderfully crafted bit of algorithmic art
-- once I figured out how to do it all ;-)
It sounds like you're beating the same bushes. How would you like to
proceed?
Tom
On Tue, 2006-01-03 at 10:13 -0800, Sean Hefty wrote:
> The following patch changes the ib_addr interface to make it more generic.
> The interface now translates IP addresses directly into source and destination
> device and broadcast addresses. The CMA is then responsible for interpreting
> these addresses as GIDs/PKey or MAC addresses. The intent is that this will
> simplify integrating support for other RDMA devices in the CMA.
>
> I'd like to get some feedback from the iWarp community on whether this approach
> works for them, or if different/additional changes are needed.
>
> Signed-off-by: Sean Hefty <sean.hefty at intel.com>
>
>
> Index: include/rdma/rdma_cm.h
> ===================================================================
> --- include/rdma/rdma_cm.h (revision 4651)
> +++ include/rdma/rdma_cm.h (working copy)
> @@ -68,9 +68,7 @@ struct rdma_addr {
> struct sockaddr dst_addr;
> u8 dst_pad[sizeof(struct sockaddr_in6) -
> sizeof(struct sockaddr)];
> - union {
> - struct ib_addr ibaddr;
> - } addr;
> + struct rdma_dev_addr dev_addr;
> };
>
> struct rdma_route {
> Index: include/rdma/ib_addr.h
> ===================================================================
> --- include/rdma/ib_addr.h (revision 4654)
> +++ include/rdma/ib_addr.h (working copy)
> @@ -32,26 +32,28 @@
>
> #include <linux/in.h>
> #include <linux/in6.h>
> +#include <linux/netdevice.h>
> #include <linux/socket.h>
> #include <rdma/ib_verbs.h>
>
> extern struct workqueue_struct *rdma_wq;
>
> -struct ib_addr {
> - union ib_gid sgid;
> - union ib_gid dgid;
> - u16 pkey;
> +struct rdma_dev_addr {
> + unsigned char src_dev_addr[MAX_ADDR_LEN];
> + unsigned char dst_dev_addr[MAX_ADDR_LEN];
> + unsigned char broadcast[MAX_ADDR_LEN];
> + enum ib_node_type dev_type;
> };
>
> /**
> - * ib_translate_addr - Translate a local IP address to an Infiniband GID and
> - * PKey.
> + * rdma_translate_ip - Translate a local IP address to an RDMA hardware
> + * address.
> */
> -int ib_translate_addr(struct sockaddr *addr, union ib_gid *gid, u16 *pkey);
> +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr);
>
> /**
> - * ib_resolve_addr - Resolve source and destination IP addresses to
> - * Infiniband network addresses.
> + * rdma_resolve_ip - Resolve source and destination IP addresses to
> + * RDMA hardware addresses.
> * @src_addr: An optional source address to use in the resolution. If a
> * source address is not provided, a usable address will be returned via
> * the callback.
> @@ -64,13 +66,13 @@ int ib_translate_addr(struct sockaddr *a
> * or been canceled. A status of 0 indicates success.
> * @context: User-specified context associated with the call.
> */
> -int ib_resolve_addr(struct sockaddr *src_addr, struct sockaddr *dst_addr,
> - struct ib_addr *addr, int timeout_ms,
> +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr,
> + struct rdma_dev_addr *addr, int timeout_ms,
> void (*callback)(int status, struct sockaddr *src_addr,
> - struct ib_addr *addr, void *context),
> + struct rdma_dev_addr *addr, void *context),
> void *context);
>
> -void ib_addr_cancel(struct ib_addr *addr);
> +void rdma_addr_cancel(struct rdma_dev_addr *addr);
>
> static inline int ip_addr_size(struct sockaddr *addr)
> {
> @@ -78,5 +80,38 @@ static inline int ip_addr_size(struct so
> sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
> }
>
> +static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
> +{
> + return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];
> +}
> +
> +static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
> +{
> + dev_addr->broadcast[8] = pkey >> 8;
> + dev_addr->broadcast[9] = (unsigned char) pkey;
> +}
> +
> +static inline union ib_gid* ib_addr_get_sgid(struct rdma_dev_addr *dev_addr)
> +{
> + return (union ib_gid *) (dev_addr->src_dev_addr + 4);
> +}
> +
> +static inline void ib_addr_set_sgid(struct rdma_dev_addr *dev_addr,
> + union ib_gid *gid)
> +{
> + memcpy(dev_addr->src_dev_addr + 4, gid, sizeof *gid);
> +}
> +
> +static inline union ib_gid* ib_addr_get_dgid(struct rdma_dev_addr *dev_addr)
> +{
> + return (union ib_gid *) (dev_addr->dst_dev_addr + 4);
> +}
> +
> +static inline void ib_addr_set_dgid(struct rdma_dev_addr *dev_addr,
> + union ib_gid *gid)
> +{
> + memcpy(dev_addr->dst_dev_addr + 4, gid, sizeof *gid);
> +}
> +
> #endif /* IB_ADDR_H */
>
> Index: core/addr.c
> ===================================================================
> --- core/addr.c (revision 4654)
> +++ core/addr.c (working copy)
> @@ -42,10 +42,10 @@ struct addr_req {
> struct list_head list;
> struct sockaddr src_addr;
> struct sockaddr dst_addr;
> - struct ib_addr *addr;
> + struct rdma_dev_addr *addr;
> void *context;
> void (*callback)(int status, struct sockaddr *src_addr,
> - struct ib_addr *addr, void *context);
> + struct rdma_dev_addr *addr, void *context);
> unsigned long timeout;
> int status;
> };
> @@ -58,26 +58,39 @@ static DECLARE_WORK(work, process_req, N
> struct workqueue_struct *rdma_wq;
> EXPORT_SYMBOL(rdma_wq);
>
> -static u16 addr_get_pkey(struct net_device *dev)
> +static int copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
> + unsigned char *dst_dev_addr)
> {
> - return ((u16)dev->broadcast[8] << 8) | (u16)dev->broadcast[9];
> + switch (dev->type) {
> + case ARPHRD_INFINIBAND:
> + dev_addr->dev_type = IB_NODE_CA;
> + break;
> + default:
> + return -EADDRNOTAVAIL;
> + }
> +
> + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
> + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
> + if (dst_dev_addr)
> + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
> + return 0;
> }
>
> -int ib_translate_addr(struct sockaddr *addr, union ib_gid *gid, u16 *pkey)
> +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
> {
> struct net_device *dev;
> u32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
> + int ret;
>
> dev = ip_dev_find(ip);
> if (!dev)
> return -EADDRNOTAVAIL;
>
> - *gid = *(union ib_gid *) (dev->dev_addr + 4);
> - *pkey = addr_get_pkey(dev);
> + ret = copy_addr(dev_addr, dev, NULL);
> dev_put(dev);
> - return 0;
> + return ret;
> }
> -EXPORT_SYMBOL(ib_translate_addr);
> +EXPORT_SYMBOL(rdma_translate_ip);
>
> static void set_timeout(unsigned long time)
> {
> @@ -127,7 +140,7 @@ static void addr_send_arp(struct sockadd
>
> static int addr_resolve_remote(struct sockaddr_in *src_in,
> struct sockaddr_in *dst_in,
> - struct ib_addr *addr)
> + struct rdma_dev_addr *addr)
> {
> u32 src_ip = src_in->sin_addr.s_addr;
> u32 dst_ip = dst_in->sin_addr.s_addr;
> @@ -159,10 +172,7 @@ static int addr_resolve_remote(struct so
> src_in->sin_addr.s_addr = rt->rt_src;
> }
>
> - addr->sgid = *(union ib_gid *) (neigh->dev->dev_addr + 4);
> - addr->dgid = *(union ib_gid *) (neigh->ha + 4);
> - addr->pkey = addr_get_pkey(neigh->dev);
> -
> + ret = copy_addr(addr, neigh->dev, neigh->ha);
> err2:
> neigh_release(neigh);
> err1:
> @@ -212,12 +222,12 @@ static void process_req(void *data)
>
> static int addr_resolve_local(struct sockaddr_in *src_in,
> struct sockaddr_in *dst_in,
> - struct ib_addr *addr)
> + struct rdma_dev_addr *addr)
> {
> struct net_device *dev;
> u32 src_ip = src_in->sin_addr.s_addr;
> u32 dst_ip = dst_in->sin_addr.s_addr;
> - int ret = 0;
> + int ret;
>
> dev = ip_dev_find(dst_ip);
> if (!dev)
> @@ -226,25 +236,21 @@ static int addr_resolve_local(struct soc
> if (!src_ip) {
> src_in->sin_family = dst_in->sin_family;
> src_in->sin_addr.s_addr = dst_ip;
> - addr->sgid = *(union ib_gid *) (dev->dev_addr + 4);
> - addr->pkey = addr_get_pkey(dev);
> + ret = copy_addr(addr, dev, dev->dev_addr);
> } else {
> - ret = ib_translate_addr((struct sockaddr *)src_in,
> - &addr->sgid, &addr->pkey);
> - if (ret)
> - goto out;
> + ret = rdma_translate_ip((struct sockaddr *)src_in, addr);
> + if (!ret)
> + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
> }
>
> - addr->dgid = *(union ib_gid *) (dev->dev_addr + 4);
> -out:
> dev_put(dev);
> return ret;
> }
>
> -int ib_resolve_addr(struct sockaddr *src_addr, struct sockaddr *dst_addr,
> - struct ib_addr *addr, int timeout_ms,
> +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr,
> + struct rdma_dev_addr *addr, int timeout_ms,
> void (*callback)(int status, struct sockaddr *src_addr,
> - struct ib_addr *addr, void *context),
> + struct rdma_dev_addr *addr, void *context),
> void *context)
> {
> struct sockaddr_in *src_in, *dst_in;
> @@ -287,9 +293,9 @@ int ib_resolve_addr(struct sockaddr *src
> }
> return ret;
> }
> -EXPORT_SYMBOL(ib_resolve_addr);
> +EXPORT_SYMBOL(rdma_resolve_ip);
>
> -void ib_addr_cancel(struct ib_addr *addr)
> +void rdma_addr_cancel(struct rdma_dev_addr *addr)
> {
> struct addr_req *req, *temp_req;
>
> @@ -306,7 +312,7 @@ void ib_addr_cancel(struct ib_addr *addr
> }
> up(&mutex);
> }
> -EXPORT_SYMBOL(ib_addr_cancel);
> +EXPORT_SYMBOL(rdma_addr_cancel);
>
> static int addr_arp_recv(struct sk_buff *skb, struct net_device *dev,
> struct packet_type *pkt, struct net_device *orig_dev)
> Index: core/cma.c
> ===================================================================
> --- core/cma.c (revision 4655)
> +++ core/cma.c (working copy)
> @@ -213,12 +213,14 @@ static void cma_detach_from_dev(struct r
> id_priv->cma_dev = NULL;
> }
>
> -static int cma_acquire_ib_dev(struct rdma_id_private *id_priv,
> - union ib_gid *gid)
> +static int cma_acquire_ib_dev(struct rdma_id_private *id_priv)
> {
> struct cma_device *cma_dev;
> + union ib_gid *gid;
> int ret = -ENODEV;
>
> + gid = ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr);
> +
> down(&mutex);
> list_for_each_entry(cma_dev, &dev_list, list) {
> ret = ib_find_cached_gid(cma_dev->device, gid,
> @@ -232,6 +234,16 @@ static int cma_acquire_ib_dev(struct rdm
> return ret;
> }
>
> +static int cma_acquire_dev(struct rdma_id_private *id_priv)
> +{
> + switch (id_priv->id.route.addr.dev_addr.dev_type) {
> + case IB_NODE_CA:
> + return cma_acquire_ib_dev(id_priv);
> + default:
> + return -ENODEV;
> + }
> +}
> +
> static void cma_deref_id(struct rdma_id_private *id_priv)
> {
> if (atomic_dec_and_test(&id_priv->refcount))
> @@ -272,11 +284,13 @@ EXPORT_SYMBOL(rdma_create_id);
> static int cma_init_ib_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
> {
> struct ib_qp_attr qp_attr;
> - struct ib_addr *ibaddr = &id_priv->id.route.addr.addr.ibaddr;
> + struct rdma_dev_addr *dev_addr;
> int ret;
>
> + dev_addr = &id_priv->id.route.addr.dev_addr;
> ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
> - ibaddr->pkey, &qp_attr.pkey_index);
> + ib_addr_get_pkey(dev_addr),
> + &qp_attr.pkey_index);
> if (ret)
> return ret;
>
> @@ -520,7 +534,7 @@ static void cma_cancel_addr(struct rdma_
> {
> switch (id_priv->id.device->node_type) {
> case IB_NODE_CA:
> - ib_addr_cancel(&id_priv->id.route.addr.addr.ibaddr);
> + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
> break;
> default:
> break;
> @@ -760,9 +774,10 @@ static struct rdma_id_private* cma_new_i
> if (rt->num_paths == 2)
> rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
>
> - rt->addr.addr.ibaddr.sgid = rt->path_rec[0].sgid;
> - rt->addr.addr.ibaddr.dgid = rt->path_rec[0].dgid;
> - rt->addr.addr.ibaddr.pkey = be16_to_cpu(rt->path_rec[0].pkey);
> + ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
> + ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
> + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
> + rt->addr.dev_addr.dev_type = IB_NODE_CA;
>
> id_priv = container_of(id, struct rdma_id_private, id);
> id_priv->state = CMA_CONNECT;
> @@ -791,7 +806,7 @@ static int cma_req_handler(struct ib_cm_
> }
>
> atomic_inc(&conn_id->dev_remove);
> - ret = cma_acquire_ib_dev(conn_id, &conn_id->id.route.path_rec[0].sgid);
> + ret = cma_acquire_ib_dev(conn_id);
> if (ret) {
> ret = -ENODEV;
> cma_release_remove(conn_id);
> @@ -1028,13 +1043,13 @@ out:
>
> static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
> {
> - struct ib_addr *addr = &id_priv->id.route.addr.addr.ibaddr;
> + struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
> struct ib_sa_path_rec path_rec;
>
> memset(&path_rec, 0, sizeof path_rec);
> - path_rec.sgid = addr->sgid;
> - path_rec.dgid = addr->dgid;
> - path_rec.pkey = cpu_to_be16(addr->pkey);
> + path_rec.sgid = *ib_addr_get_sgid(addr);
> + path_rec.dgid = *ib_addr_get_dgid(addr);
> + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
> path_rec.numb_path = 1;
>
> id_priv->query_id = ib_sa_path_rec_get(id_priv->id.device,
> @@ -1079,6 +1094,8 @@ EXPORT_SYMBOL(rdma_resolve_route);
> static int cma_bind_loopback(struct rdma_id_private *id_priv)
> {
> struct cma_device *cma_dev;
> + union ib_gid *gid;
> + u16 pkey;
> int ret;
>
> down(&mutex);
> @@ -1088,16 +1105,16 @@ static int cma_bind_loopback(struct rdma
> }
>
> cma_dev = list_entry(dev_list.next, struct cma_device, list);
> - ret = ib_get_cached_gid(cma_dev->device, 1, 0,
> - &id_priv->id.route.addr.addr.ibaddr.sgid);
> + gid = ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr);
> + ret = ib_get_cached_gid(cma_dev->device, 1, 0, gid);
> if (ret)
> goto out;
>
> - ret = ib_get_cached_pkey(cma_dev->device, 1, 0,
> - &id_priv->id.route.addr.addr.ibaddr.pkey);
> + ret = ib_get_cached_pkey(cma_dev->device, 1, 0, &pkey);
> if (ret)
> goto out;
>
> + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
> id_priv->id.port_num = 1;
> cma_attach_to_dev(id_priv, cma_dev);
> out:
> @@ -1106,7 +1123,7 @@ out:
> }
>
> static void addr_handler(int status, struct sockaddr *src_addr,
> - struct ib_addr *ibaddr, void *context)
> + struct rdma_dev_addr *dev_addr, void *context)
> {
> struct rdma_id_private *id_priv = context;
> enum rdma_cm_event_type event;
> @@ -1116,7 +1133,7 @@ static void addr_handler(int status, str
> if (!id_priv->cma_dev) {
> old_state = CMA_IDLE;
> if (!status)
> - status = cma_acquire_ib_dev(id_priv, &ibaddr->sgid);
> + status = cma_acquire_dev(id_priv);
> } else
> old_state = CMA_ADDR_BOUND;
>
> @@ -1169,6 +1186,7 @@ static int cma_resolve_loopback(struct r
> struct sockaddr *src_addr, enum cma_state state)
> {
> struct work_struct *work;
> + struct rdma_dev_addr *dev_addr;
> int ret;
>
> work = kmalloc(sizeof *work, GFP_KERNEL);
> @@ -1179,8 +1197,8 @@ static int cma_resolve_loopback(struct r
> ret = cma_bind_loopback(id_priv);
> if (ret)
> goto err;
> - id_priv->id.route.addr.addr.ibaddr.dgid =
> - id_priv->id.route.addr.addr.ibaddr.sgid;
> + dev_addr = &id_priv->id.route.addr.dev_addr;
> + ib_addr_set_dgid(dev_addr, ib_addr_get_sgid(dev_addr));
> if (!src_addr || cma_any_addr(src_addr))
> src_addr = &id_priv->id.route.addr.dst_addr;
> memcpy(&id_priv->id.route.addr.src_addr, src_addr,
> @@ -1217,8 +1235,8 @@ int rdma_resolve_addr(struct rdma_cm_id
> if (cma_loopback_addr(dst_addr))
> ret = cma_resolve_loopback(id_priv, src_addr, expected_state);
> else
> - ret = ib_resolve_addr(src_addr, dst_addr,
> - &id->route.addr.addr.ibaddr,
> + ret = rdma_resolve_ip(src_addr, dst_addr,
> + &id->route.addr.dev_addr,
> timeout_ms, addr_handler, id_priv);
> if (ret)
> goto err;
> @@ -1234,7 +1252,7 @@ EXPORT_SYMBOL(rdma_resolve_addr);
> int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
> {
> struct rdma_id_private *id_priv;
> - struct ib_addr *ibaddr;
> + struct rdma_dev_addr *dev_addr;
> int ret;
>
> if (addr->sa_family != AF_INET)
> @@ -1249,10 +1267,10 @@ int rdma_bind_addr(struct rdma_cm_id *id
> } else if (cma_loopback_addr(addr)) {
> ret = cma_bind_loopback(id_priv);
> } else {
> - ibaddr = &id->route.addr.addr.ibaddr;
> - ret = ib_translate_addr(addr, &ibaddr->sgid, &ibaddr->pkey);
> + dev_addr = &id->route.addr.dev_addr;
> + ret = rdma_translate_ip(addr, dev_addr);
> if (!ret)
> - ret = cma_acquire_ib_dev(id_priv, &ibaddr->sgid);
> + ret = cma_acquire_dev(id_priv);
> }
>
> if (ret)
> Index: core/ucma.c
> ===================================================================
> --- core/ucma.c (revision 4651)
> +++ core/ucma.c (working copy)
> @@ -419,17 +419,17 @@ static ssize_t ucma_resolve_route(struct
> static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
> struct rdma_route *route)
> {
> - struct ib_addr *ibaddr;
> + struct rdma_dev_addr *dev_addr;
>
> resp->num_paths = route->num_paths;
> switch (route->num_paths) {
> case 0:
> - ibaddr = &route->addr.addr.ibaddr;
> - memcpy(&resp->ib_route[0].dgid, ibaddr->dgid.raw,
> - sizeof ibaddr->dgid);
> - memcpy(&resp->ib_route[0].sgid, ibaddr->sgid.raw,
> - sizeof ibaddr->sgid);
> - resp->ib_route[0].pkey = cpu_to_be16(ibaddr->pkey);
> + dev_addr = &route->addr.dev_addr;
> + memcpy(&resp->ib_route[0].dgid, ib_addr_get_dgid(dev_addr),
> + sizeof(union ib_gid));
> + memcpy(&resp->ib_route[0].sgid, ib_addr_get_sgid(dev_addr),
> + sizeof(union ib_gid));
> + resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
> break;
> case 2:
> ib_copy_path_rec_to_user(&resp->ib_route[1],
>
>
More information about the general
mailing list