[ofa-general] [RFC] IPoIB-UD S/G 4K MTU support against ofa-1.3-rc2

Shirley Ma mashirle at us.ibm.com
Sat Jan 26 13:42:15 PST 2008


This patch is built against OFED-1.3-rc2. A one-node touch test has been
done; I am sending it out for early review while more testing is in
progress, and I will fold your comments into that testing immediately. I
will break this patch into several smaller ones for the 2.6.25 submission
later.

This patch allows an IPoIB-UD MTU of up to 4092 bytes (4K - IPOIB_ENCAP_LEN)
when the HCA supports a 4K IB MTU. The S/G buffer allocation APIs used by
IPoIB-CM mode have been made generic so that IPoIB-UD and IPoIB-CM can share
the S/G code. When PAGE_SIZE is greater than or equal to IPOIB_UD_BUF_SIZE
plus the padding bytes needed to align the IP header, only one buffer is
needed for a 4K MTU receive allocation; otherwise the receive buffer is
split across two S/G entries.
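
For reference, here is a small stand-alone sketch (not part of the patch) of
the sizing arithmetic that the IPOIB_UD_* macros below implement; PAGE_SIZE,
IB_GRH_BYTES and ALIGN are redefined locally only so the example compiles
outside the kernel tree:

#include <stdio.h>

#define PAGE_SIZE	4096
#define IB_GRH_BYTES	40
#define IPOIB_ENCAP_LEN	4
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

#define IPOIB_UD_MTU(ib_mtu)		((ib_mtu) - IPOIB_ENCAP_LEN)
#define IPOIB_UD_BUF_SIZE(ib_mtu)	((ib_mtu) + IB_GRH_BYTES)
#define IPOIB_UD_HEAD_SIZE(ib_mtu)	(IPOIB_UD_BUF_SIZE(ib_mtu) % PAGE_SIZE)
#define IPOIB_UD_RX_SG(ib_mtu)		(ALIGN(IPOIB_UD_BUF_SIZE(ib_mtu), PAGE_SIZE) / PAGE_SIZE)

int main(void)
{
	int ib_mtu = 4096;	/* 4K-capable HCA */

	/* 4096 + 40 = 4136 bytes: a 40-byte head plus one full page,
	 * i.e. two S/G entries on a 4K-page system.  With 64K pages the
	 * whole buffer fits in the head and only one entry is needed. */
	printf("link mtu %d, buf %d, head %d, sg entries %d\n",
	       IPOIB_UD_MTU(ib_mtu), IPOIB_UD_BUF_SIZE(ib_mtu),
	       IPOIB_UD_HEAD_SIZE(ib_mtu), IPOIB_UD_RX_SG(ib_mtu));
	return 0;
}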

A node's IPoIB link MTU is the minimum of the administratively configured
MTU (set through ifconfig) and the IPoIB default broadcast group MTU. When
the Subnet Manager creates the default broadcast group at start-up, the
subnet's IPoIB link MTU is the default broadcast group MTU: a node whose IB
MTU is smaller than this value cannot join the IPoIB subnet, while a node
whose IB MTU is equal to or greater than this value joins the subnet and
uses this value as its IPoIB link MTU. If the Subnet Manager does not create
the default broadcast group at start-up, the first node brought up on the
subnet creates the default IPoIB broadcast group based on its negotiation
with the Subnet Manager.
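
A similarly stand-alone sketch (illustrative values only, not part of the
patch; the driver itself uses min() in ipoib_mcast_join_task) of how the
link MTU falls out of the two limits above:

#include <stdio.h>

#define IPOIB_ENCAP_LEN		4
#define IPOIB_UD_MTU(ib_mtu)	((ib_mtu) - IPOIB_ENCAP_LEN)

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

int main(void)
{
	int admin_mtu = 4092;			/* set via ifconfig */
	int mcast_mtu = IPOIB_UD_MTU(2048);	/* broadcast group created at 2K */

	/* The smaller of the two limits wins: 2044 in this example. */
	printf("IPoIB link MTU = %d\n", min_int(mcast_mtu, admin_mtu));
	return 0;
}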

Signed-off-by: Shirley Ma <xma at us.ibm.com>
---
diff -urpN ipoib-orig/ipoib_cm.c ipoib-4kmtu/ipoib_cm.c
--- ipoib-orig/ipoib_cm.c	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib_cm.c	2008-01-26 23:52:42.000000000 -0600
@@ -72,17 +72,6 @@ static struct ib_send_wr ipoib_cm_rx_dra
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event);
 
-static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
-				  u64 mapping[IPOIB_CM_RX_SG])
-{
-	int i;
-
-	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
-
-	for (i = 0; i < frags; ++i)
-		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
-}
-
 static int ipoib_cm_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -97,8 +86,9 @@ static int ipoib_cm_post_receive(struct 
 	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-				      priv->cm.srq_ring[id].mapping);
+		ipoib_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+				   IPOIB_CM_HEAD_SIZE,
+				   priv->cm.srq_ring[id].mapping);
 		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
 		priv->cm.srq_ring[id].skb = NULL;
 	}
@@ -106,57 +96,6 @@ static int ipoib_cm_post_receive(struct 
 	return ret;
 }
 
-static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
-					     u64 mapping[IPOIB_CM_RX_SG])
-{
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct sk_buff *skb;
-	int i;
-
-	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
-	if (unlikely(!skb))
-		return NULL;
-
-	/*
-	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
-	 * IP header to a multiple of 16.
-	 */
-	skb_reserve(skb, 12);
-
-	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
-				       DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
-
-	for (i = 0; i < frags; i++) {
-		struct page *page = alloc_page(GFP_ATOMIC);
-
-		if (!page)
-			goto partial_error;
-		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
-
-	mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
-						 0, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
-			goto partial_error;
-	}
-
-	priv->cm.srq_ring[id].skb = skb;
-	return skb;
-
-partial_error:
-
-	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
-
-	for (; i > 0; --i)
-		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
-
-	dev_kfree_skb_any(skb);
-	return NULL;
-}
-
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
 {
 	struct ib_send_wr *bad_wr;
@@ -367,38 +306,6 @@ static int ipoib_cm_rx_handler(struct ib
 		return 0;
 	}
 }
-/* Adjust length of skb with fragments to match received data */
-static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
-			  unsigned int length, struct sk_buff *toskb)
-{
-	int i, num_frags;
-	unsigned int size;
-
-	/* put header into skb */
-	size = min(length, hdr_space);
-	skb->tail += size;
-	skb->len += size;
-	length -= size;
-
-	num_frags = skb_shinfo(skb)->nr_frags;
-	for (i = 0; i < num_frags; i++) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		if (length == 0) {
-			/* don't need this page */
-			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
-			--skb_shinfo(skb)->nr_frags;
-		} else {
-			size = min(length, (unsigned) PAGE_SIZE);
-
-			frag->size = size;
-			skb->data_len += size;
-			skb->truesize += size;
-			skb->len += size;
-			length -= size;
-		}
-	}
-}
 
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
@@ -453,7 +360,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
-	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+	newskb = ipoib_alloc_rx_skb(dev, wr_id, frags, IPOIB_CM_HEAD_SIZE, 12, mapping);
 	if (unlikely(!newskb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
@@ -464,7 +371,9 @@ void ipoib_cm_handle_rx_wc(struct net_de
 		goto repost;
 	}
 
-	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
+	priv->cm.srq_ring[wr_id].skb = newskb;
+	ipoib_dma_unmap_rx(priv, frags, IPOIB_CM_HEAD_SIZE, 
+			   priv->cm.srq_ring[wr_id].mapping);
 	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
@@ -1334,8 +1243,10 @@ int ipoib_cm_dev_init(struct net_device 
 	priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
-					   priv->cm.srq_ring[i].mapping)) {
+		priv->cm.srq_ring[i].skb = ipoib_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
+							       IPOIB_CM_HEAD_SIZE, 12,
+					 		       priv->cm.srq_ring[i].mapping);
+		if (!priv->cm.srq_ring[i].skb) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			ipoib_cm_dev_cleanup(dev);
 			return -ENOMEM;
@@ -1370,8 +1281,9 @@ void ipoib_cm_dev_cleanup(struct net_dev
 		return;
 	for (i = 0; i < ipoib_recvq_size; ++i)
 		if (priv->cm.srq_ring[i].skb) {
-			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-					      priv->cm.srq_ring[i].mapping);
+			ipoib_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+					   IPOIB_CM_HEAD_SIZE,
+					   priv->cm.srq_ring[i].mapping);
 			dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
 			priv->cm.srq_ring[i].skb = NULL;
 		}
diff -urpN ipoib-orig/ipoib.h ipoib-4kmtu/ipoib.h
--- ipoib-orig/ipoib.h	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib.h	2008-01-26 21:28:03.000000000 -0600
@@ -56,10 +56,9 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE         = 2048,
-	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN 	  = 4,
+	IPOIB_MAX_IB_MTU	  = 4096, /* max ib device payload is 4096 */
+	IPOIB_UD_MAX_RX_SG	  = ALIGN(IPOIB_MAX_IB_MTU + IB_GRH_BYTES + 4, PAGE_SIZE) / PAGE_SIZE,  /* padding to align IP header */
 
 	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
 	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
@@ -141,7 +140,7 @@ struct ipoib_mcast {
 
 struct ipoib_rx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[MAX_SKB_FRAGS + 1];
 };
 
 struct ipoib_tx_buf {
@@ -281,14 +280,9 @@ struct ipoib_cm_tx {
 	struct ib_wc         ibwc[IPOIB_NUM_WC];
 };
 
-struct ipoib_cm_rx_buf {
-	struct sk_buff *skb;
-	u64 mapping[IPOIB_CM_RX_SG];
-};
-
 struct ipoib_cm_dev_priv {
 	struct ib_srq  	       *srq;
-	struct ipoib_cm_rx_buf *srq_ring;
+	struct ipoib_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
 	struct list_head        passive_ids;   /* state: LIVE */
 	struct list_head        rx_error_list; /* state: ERROR */
@@ -391,6 +385,9 @@ struct ipoib_dev_priv {
 	struct dentry *path_dentry;
 #endif
 	struct ipoib_ethtool_st etool;
+	unsigned int   max_ib_mtu;
+	struct ib_sge        rx_sge[IPOIB_UD_MAX_RX_SG];
+	struct ib_recv_wr    rx_wr;
 };
 
 struct ipoib_ah {
@@ -487,6 +484,14 @@ int ipoib_ib_dev_stop(struct net_device 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
 
+void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, int head_size,
+			u64 *mapping);
+void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
+		   unsigned int length, struct sk_buff *toskb);
+struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev,
+				   int id, int frags, int head_size,
+				   int pad, u64 *mapping);
+
 void ipoib_mcast_join_task(struct work_struct *work);
 void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb);
 
@@ -536,6 +541,11 @@ void ipoib_drain_cq(struct net_device *d
 
 void ipoib_set_ethtool_ops(struct net_device *dev);
 
+#define IPOIB_UD_MTU(ib_mtu)		((ib_mtu) - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	((ib_mtu) + IB_GRH_BYTES) /* padding to align IP header */
+#define IPOIB_UD_HEAD_SIZE(ib_mtu)	(IPOIB_UD_BUF_SIZE(ib_mtu) % PAGE_SIZE)
+#define IPOIB_UD_RX_SG(ib_mtu)	(ALIGN(IPOIB_UD_BUF_SIZE(ib_mtu), PAGE_SIZE) / PAGE_SIZE)
+
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
 #define IPOIB_FLAGS_RC          0x80
diff -urpN ipoib-orig/ipoib_ib.c ipoib-4kmtu/ipoib_ib.c
--- ipoib-orig/ipoib_ib.c	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib_ib.c	2008-01-26 22:48:41.000000000 -0600
@@ -90,63 +90,118 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static int ipoib_ib_post_receive(struct net_device *dev, int id)
+/* Adjust length of skb with fragments to match received data */
+void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
+			  unsigned int length, struct sk_buff *toskb)
 {
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_sge list;
-	struct ib_recv_wr param;
-	struct ib_recv_wr *bad_wr;
-	int ret;
+	int i, num_frags;
+	unsigned int size;
 
-	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
-	list.lkey     = priv->mr->lkey;
-
-	param.next    = NULL;
-	param.wr_id   = id | IPOIB_OP_RECV;
-	param.sg_list = &list;
-	param.num_sge = 1;
+	/* put header into skb */
+	size = min(length, hdr_space);
+	skb->tail += size;
+	skb->len += size;
+	length -= size;
 
-	ret = ib_post_recv(priv->qp, &param, &bad_wr);
-	if (unlikely(ret)) {
-		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
-		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
-		dev_kfree_skb_any(priv->rx_ring[id].skb);
-		priv->rx_ring[id].skb = NULL;
+	num_frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < num_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		if (length == 0) {
+			/* don't need this page */
+			skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
+			--skb_shinfo(skb)->nr_frags;
+		} else {
+			size = min(length, (unsigned) PAGE_SIZE);
+
+			frag->size = size;
+			skb->data_len += size;
+			skb->truesize += size;
+			skb->len += size;
+			length -= size;
+		}
 	}
+}
 
-	return ret;
+void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, int head_size, u64 *mapping)
+{
+	int i;
+	ib_dma_unmap_single(priv->ca, mapping[0], head_size,
+			    DMA_FROM_DEVICE);
+	for (i = 0; i < frags; i++)
+		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE,
+				    DMA_FROM_DEVICE);
 }
 
-static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
+struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id, int frags,
+				   int head_size, int pad, u64 *mapping)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
-	u64 addr;
+	int i;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb)
-		return -ENOMEM;
+	skb = dev_alloc_skb(head_size + pad);
+	if (unlikely(!skb))
+		return NULL;
 
 	/*
-	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
-	 * header.  So we need 4 more bytes to get to 48 and align the
+	 * The caller passes enough padding ('pad') to align the
 	 * IP header to a multiple of 16.
 	 */
-	skb_reserve(skb, 4);
+	skb_reserve(skb, pad);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, head_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
 		dev_kfree_skb_any(skb);
-		return -EIO;
+		return NULL;
 	}
 
-	priv->rx_ring[id].skb     = skb;
-	priv->rx_ring[id].mapping = addr;
+	for (i = 0; i < frags; i++) {
+		struct page *page = alloc_page(GFP_ATOMIC);
 
-	return 0;
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
+
+		mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
+						 0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
+			goto partial_error;
+	}
+
+	return skb;
+
+partial_error:
+
+	ib_dma_unmap_single(priv->ca, mapping[0], head_size, DMA_FROM_DEVICE);
+
+	for (; i > 0; --i)
+		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
+
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int ret, i;
+
+	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;
+	for (i = 0; i < IPOIB_UD_RX_SG(priv->max_ib_mtu); ++i)
+		priv->rx_sge[i].addr = priv->rx_ring[id].mapping[i];
+	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+		ipoib_dma_unmap_rx(priv, IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1,
+				   IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), priv->rx_ring[id].mapping);
+		dev_kfree_skb_any(priv->rx_ring[id].skb);
+		priv->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -154,13 +209,24 @@ static int ipoib_ib_post_receives(struct
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int i;
 
+	for (i = 0; i < IPOIB_UD_RX_SG(priv->max_ib_mtu); ++i)
+		priv->rx_sge[i].lkey = priv->mr->lkey;
+	priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu);
+	for (i = 0; i < IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1; ++i)
+		priv->rx_sge[i+1].length = PAGE_SIZE;
+	priv->rx_wr.num_sge = IPOIB_UD_RX_SG(priv->max_ib_mtu);
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (ipoib_alloc_rx_skb(dev, i)) {
-			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+		priv->rx_ring[i].skb = ipoib_alloc_rx_skb(dev, i, IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1,
+							  IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), 4,
+							  priv->rx_ring[i].mapping);
+		if (!priv->rx_ring[i].skb)
 			return -ENOMEM;
-		}
 		if (ipoib_ib_post_receive(dev, i)) {
 			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+			ipoib_dev_cleanup(dev);
 			return -EIO;
 		}
 	}
@@ -172,9 +238,9 @@ static void ipoib_ib_handle_rx_wc(struct
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *newskb;
+	u64 mapping[IPOIB_UD_RX_SG(priv->max_ib_mtu)];
 	struct ipoib_header *header;
-	u64 addr;
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -186,15 +252,15 @@ static void ipoib_ib_handle_rx_wc(struct
 	}
 
 	skb  = priv->rx_ring[wr_id].skb;
-	addr = priv->rx_ring[wr_id].mapping;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_dma_unmap_rx(priv, IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1,
+				   IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), 
+				   priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -211,17 +277,23 @@ static void ipoib_ib_handle_rx_wc(struct
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
-	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+	newskb = ipoib_alloc_rx_skb(dev, wr_id, IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1,
+				    IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), 4, mapping);
+	if (unlikely(!newskb)) {
+		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
 		++priv->stats.rx_dropped;
 		goto repost;
 	}
+	priv->rx_ring[wr_id].skb = newskb;
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ipoib_dma_unmap_rx(priv, IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1,
+			   IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), priv->rx_ring[wr_id].mapping);
+	memcpy(priv->rx_ring[wr_id].mapping, mapping, IPOIB_UD_RX_SG(priv->max_ib_mtu) * sizeof *mapping);
 
-	skb_put(skb, wc->byte_len);
+	skb_put_frags(skb, IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), wc->byte_len, newskb);
 	skb_pull(skb, IB_GRH_BYTES);
 
 	header = (struct ipoib_header *)skb->data;
@@ -692,10 +764,10 @@ int ipoib_ib_dev_stop(struct net_device 
 				rx_req = &priv->rx_ring[i];
 				if (!rx_req->skb)
 					continue;
-				ib_dma_unmap_single(priv->ca,
-						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
-						    DMA_FROM_DEVICE);
+				ipoib_dma_unmap_rx(priv,
+						   IPOIB_UD_RX_SG(priv->max_ib_mtu) - 1, 
+						   IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu), 
+						   priv->rx_ring[i].mapping);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
 			}
diff -urpN ipoib-orig/ipoib_main.c ipoib-4kmtu/ipoib_main.c
--- ipoib-orig/ipoib_main.c	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib_main.c	2008-01-26 21:28:03.000000000 -0600
@@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) {
 		return -EINVAL;
 	}
 
@@ -1053,10 +1053,6 @@ static void ipoib_setup(struct net_devic
 		set_bit(IPOIB_FLAG_HW_CSUM, &priv->flags);
 	}
 
-	/* MTU will be reset when mcast join happens */
-	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
-
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	netif_carrier_off(dev);
@@ -1208,6 +1204,7 @@ static struct net_device *ipoib_add_port
 					 struct ib_device *hca, u8 port)
 {
 	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1218,6 +1215,18 @@ static struct net_device *ipoib_add_port
 
 	priv->dev->features |= NETIF_F_HIGHDMA;
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n", 
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu	 = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff -urpN ipoib-orig/ipoib_multicast.c ipoib-4kmtu/ipoib_multicast.c
--- ipoib-orig/ipoib_multicast.c	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib_multicast.c	2008-01-26 21:28:03.000000000 -0600
@@ -567,9 +567,7 @@ void ipoib_mcast_join_task(struct work_s
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
-
+	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 
diff -urpN ipoib-orig/ipoib_verbs.c ipoib-4kmtu/ipoib_verbs.c
--- ipoib-orig/ipoib_verbs.c	2008-01-26 20:52:49.000000000 -0600
+++ ipoib-4kmtu/ipoib_verbs.c	2008-01-26 21:28:03.000000000 -0600
@@ -150,7 +150,7 @@ int ipoib_transport_dev_init(struct net_
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
 			.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
-			.max_recv_sge = 1
+			.max_recv_sge = IPOIB_UD_RX_SG(priv->max_ib_mtu) 
 		},
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
 		.qp_type     = IB_QPT_UD,
@@ -208,6 +208,16 @@ int ipoib_transport_dev_init(struct net_
 	priv->tx_wr.sg_list 	= priv->tx_sge;
 	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
 
+	priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE(priv->max_ib_mtu);
+	priv->rx_sge[0].lkey = priv->mr->lkey;
+	for (i = 1; i < IPOIB_UD_RX_SG(priv->max_ib_mtu); ++i) {
+		priv->rx_sge[i].lkey = priv->mr->lkey;
+		priv->rx_sge[i].length = PAGE_SIZE;
+	}
+	priv->rx_wr.num_sge = IPOIB_UD_RX_SG(priv->max_ib_mtu);
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
 	return 0;
 
 out_free_cq:




