[ofa-general] [PATCHv2] make ipoib broadcast scope configurable

Rolf Manderscheid rvm at obsidianresearch.com
Thu Aug 30 17:03:00 PDT 2007


An IPoIB subnet on an IB fabric that spans multiple IB subnets can't use
link-local scope in multicast GIDs.  This patch takes the scope from the
link-level broadcast address of the IPoIB device when mapping to an IB
multicast address, and makes the scope in the link-level broadcast address
configurable through a new "broadcast_scope" sysfs attribute (writable only
while the interface is down).  Since the mapping routines now have the
broadcast address available, they can also set the P_Key in the MGID.  This
cleans up some assignments made to the MGID after the mapping.

Signed-off-by: Rolf Manderscheid <rvm at obsidianresearch.com>

---

I was probably a little overzealous in my application of the one-idea-per-patch
rule.  Hal pointed out that Roland already suggested a sysfs attribute or ethtool
hook to make the scope configurable.  This patch replaces the one entitled
"use scope from ipoib device link-level broadcast address in MGIDs"; I hope
changing the title doesn't confuse things.  There is at least one blemish:
the mapping for the IPv6 all-nodes address is done rather early, so if the
broadcast scope is changed, the mapping must also be changed (that's done in
the code around the FIXME below).  I don't see a way around that except to have
IPv6 delay the join of the all-nodes group until after the link is brought up
(definitely a separate patch :-).

    Rolf

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9ffb998..b3d832c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2560,11 +2560,9 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
 		/* IPv6 address is an SA assigned MGID. */
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
-		ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
 		if (id_priv->id.ps == RDMA_PS_UDP)
 			mc_map[7] = 0x01;	/* Use RDMA CM signature */
-		mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
-		mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
 		*mgid = *(union ib_gid *) (mc_map + 4);
 	}
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 894b1dc..d2f5bb5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1017,6 +1017,43 @@ static ssize_t show_pkey(struct device *dev,
 }
 static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
 
+static ssize_t show_bcast_scope(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+	return sprintf(buf, "0x%x\n", priv->dev->broadcast[5] & 0xF);
+}
+
+static ssize_t set_bcast_scope(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+	int scope;
+
+	if (priv->dev->flags & IFF_UP)
+		return -EBUSY;
+
+	if (sscanf(buf, "%i", &scope) != 1)
+		return -EINVAL;
+
+	switch (scope) {
+	case 0x2: /* link-local */
+	case 0x5: /* site-local */
+	case 0x8: /* organization-local */
+	case 0xE: /* global */
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	priv->dev->broadcast[5] &= ~0xF;
+	priv->dev->broadcast[5] |= scope;
+	return count;
+}
+static DEVICE_ATTR(broadcast_scope, S_IWUSR | S_IRUGO, show_bcast_scope, set_bcast_scope);
+
 static ssize_t create_child(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t count)
@@ -1138,6 +1175,8 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto sysfs_failed;
 	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
 		goto sysfs_failed;
+	if (device_create_file(&priv->dev->dev, &dev_attr_broadcast_scope))
+		goto sysfs_failed;
 
 	return priv->dev;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index aae3670..8088afc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -808,9 +808,11 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 
 		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
 
-		/* Add in the P_Key */
-		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
-		mgid.raw[5] = priv->pkey & 0xff;
+		/* FIXME: ipv6 maps the all-nodes multicast group at device creation,
+		   so the mapping can change if the broadcast_scope is changed.  If
+		   the ipv6 core can delay joining the all-nodes group until after
+		   the link is brought up, then this can go away: */
+		mgid.raw[1] = (mgid.raw[1] & ~0xF) | (priv->dev->broadcast[5] & 0xF);
 
 		mcast = __ipoib_mcast_find(dev, &mgid);
 		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 3ec7d07..03faeb0 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -268,18 +268,21 @@ static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf)
 	buf[0] = 0x00;
 }
 
-static inline void ipv6_ib_mc_map(struct in6_addr *addr, char *buf)
+static inline void ipv6_ib_mc_map(const struct in6_addr *addr,
+				  const unsigned char *broadcast, char *buf)
 {
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x60;		/* IPv6 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];	/* P_Key */
+	buf[9]  = broadcast[9];
 	memcpy(buf + 10, addr->s6_addr + 6, 10);
 }
 #endif
diff --git a/include/net/ip.h b/include/net/ip.h
index abf2820..a85342c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -266,20 +266,22 @@ static inline void ip_eth_mc_map(__be32 naddr, char *buf)
  *	Leave P_Key as 0 to be filled in by driver.
  */
 
-static inline void ip_ib_mc_map(__be32 naddr, char *buf)
+static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
 {
 	__u32 addr;
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	addr    = ntohl(naddr);
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x40;		/* IPv4 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];		/* P_Key */
+	buf[9]  = broadcast[9];
 	buf[10] = 0;
 	buf[11] = 0;
 	buf[12] = 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 9ab9d53..feb643c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -214,7 +214,7 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 		ip_tr_mc_map(addr, haddr);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ip_ib_mc_map(addr, haddr);
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
 	default:
 		if (dir) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0358e60..8275625 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -308,7 +308,7 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
 		ipv6_arcnet_mc_map(addr, buf);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ipv6_ib_mc_map(addr, buf);
+		ipv6_ib_mc_map(addr, dev->broadcast, buf);
 		return 0;
 	default:
 		if (dir) {



More information about the general mailing list