[PATCH, please test] IPoIB: recycle RX bufs (was: [openib-general] Re: ib0: ipoib_ib_post_receive failed for buf 111 ib0: failed to allocate receive buffer)

Roland Dreier rolandd at cisco.com
Thu Oct 13 16:14:46 PDT 2005


    Roland> My plan is to change the receive handling of IPoIB
    Roland> slightly, so that if it can't allocate a new receive
    Roland> buffer, it reposts the old buffer and drops the packet it
    Roland> just received.

Here's a patch that changes IPoIB to use this scheme.  This should make
the driver much more robust when the system runs low on memory for
GFP_ATOMIC allocations.
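
The core of the change is in the RX completion handler: we try to
allocate and map a replacement skb first, and only if that succeeds do
we unmap the old buffer and pass it up the stack; otherwise the packet
is dropped and the old buffer stays in the ring.  Roughly (a simplified
sketch of the logic in the patch below, with the error/flush handling
omitted):

	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
		/* No memory for a replacement: drop this packet and
		 * keep the old skb and DMA mapping in rx_ring[wr_id]. */
		++priv->stats.rx_dropped;
	} else {
		/* Replacement is in place; unmap and hand the received
		 * skb to the network stack as before. */
		dma_unmap_single(priv->ca->dma_device, addr,
				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
		skb_put(skb, wc->byte_len);
		/* ... normal IPoIB receive processing ... */
	}

	/* In either case repost the buffer, so the RX ring stays full. */
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n",
			   wr_id);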

I'd appreciate it if people could stress test and benchmark this.  It
works well for me, but I'm wondering if this patch has any effect on
performance (either better or worse).

Helen, it would be especially interesting if you could run your test
with this patch and without increasing min_free_kbytes, since you are
able to reproduce GFP_ATOMIC failures.  I'd be curious to know what
you see in /sys/class/net/ib0/statistics/rx_dropped after running the test.

Thanks,
  Roland

--- infiniband/ulp/ipoib/ipoib_main.c	(revision 3707)
+++ infiniband/ulp/ipoib/ipoib_main.c	(working copy)
@@ -729,7 +729,7 @@ int ipoib_dev_init(struct net_device *de
 
 	/* Allocate RX/TX "rings" to hold queued skbs */
 
-	priv->rx_ring =	kmalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf),
+	priv->rx_ring =	kmalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf),
 				GFP_KERNEL);
 	if (!priv->rx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
@@ -737,9 +737,9 @@ int ipoib_dev_init(struct net_device *de
 		goto out;
 	}
 	memset(priv->rx_ring, 0,
-	       IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf));
+	       IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf));
 
-	priv->tx_ring = kmalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf),
+	priv->tx_ring = kmalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf),
 				GFP_KERNEL);
 	if (!priv->tx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
@@ -747,7 +747,7 @@ int ipoib_dev_init(struct net_device *de
 		goto out_rx_ring_cleanup;
 	}
 	memset(priv->tx_ring, 0,
-	       IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf));
+	       IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf));
 
 	/* priv->tx_head & tx_tail are already 0 */
 
--- infiniband/ulp/ipoib/ipoib.h	(revision 3726)
+++ infiniband/ulp/ipoib/ipoib.h	(working copy)
@@ -100,7 +100,12 @@ struct ipoib_pseudoheader {
 
 struct ipoib_mcast;
 
-struct ipoib_buf {
+struct ipoib_rx_buf {
+	struct sk_buff *skb;
+	dma_addr_t	mapping;
+};
+
+struct ipoib_tx_buf {
 	struct sk_buff *skb;
 	DECLARE_PCI_UNMAP_ADDR(mapping)
 };
@@ -150,14 +155,14 @@ struct ipoib_dev_priv {
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
 
-	struct ipoib_buf *rx_ring;
+	struct ipoib_rx_buf *rx_ring;
 
-	spinlock_t        tx_lock;
-	struct ipoib_buf *tx_ring;
-	unsigned          tx_head;
-	unsigned          tx_tail;
-	struct ib_sge     tx_sge;
-	struct ib_send_wr tx_wr;
+	spinlock_t           tx_lock;
+	struct ipoib_tx_buf *tx_ring;
+	unsigned             tx_head;
+	unsigned             tx_tail;
+	struct ib_sge        tx_sge;
+	struct ib_send_wr    tx_wr;
 
 	struct ib_wc ibwc[IPOIB_NUM_WC];
 
--- infiniband/ulp/ipoib/ipoib_ib.c	(revision 3726)
+++ infiniband/ulp/ipoib/ipoib_ib.c	(working copy)
@@ -95,57 +95,65 @@ void ipoib_free_ah(struct kref *kref)
 	}
 }
 
-static inline int ipoib_ib_receive(struct ipoib_dev_priv *priv,
-				   unsigned int wr_id,
-				   dma_addr_t addr)
-{
-	struct ib_sge list = {
-		.addr    = addr,
-		.length  = IPOIB_BUF_SIZE,
-		.lkey    = priv->mr->lkey,
-	};
-	struct ib_recv_wr param = {
-		.wr_id 	    = wr_id | IPOIB_OP_RECV,
-		.sg_list    = &list,
-		.num_sge    = 1,
-	};
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sge list;
+	struct ib_recv_wr param;
 	struct ib_recv_wr *bad_wr;
+	int ret;
+
+	list.addr     = priv->rx_ring[id].mapping;
+	list.length   = IPOIB_BUF_SIZE;
+	list.lkey     = priv->mr->lkey;
+
+	param.next    = NULL;
+	param.wr_id   = id | IPOIB_OP_RECV;
+	param.sg_list = &list;
+	param.num_sge = 1;
+
+	ret = ib_post_recv(priv->qp, &param, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+		dma_unmap_single(priv->ca->dma_device,
+				 priv->rx_ring[id].mapping,
+				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		dev_kfree_skb_any(priv->rx_ring[id].skb);
+		priv->rx_ring[id].skb = NULL;
+	}
 
-	return ib_post_recv(priv->qp, &param, &bad_wr);
+	return ret;
 }
 
-static int ipoib_ib_post_receive(struct net_device *dev, int id)
+static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
 	dma_addr_t addr;
-	int ret;
 
 	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb) {
-		ipoib_warn(priv, "failed to allocate receive buffer\n");
-
-		priv->rx_ring[id].skb = NULL;
+	if (!skb)
 		return -ENOMEM;
-	}
-	skb_reserve(skb, 4);	/* 16 byte align IP header */
-	priv->rx_ring[id].skb = skb;
+
+	/*
+	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
+	 * header.  So we need 4 more bytes to get to 48 and align the
+	 * IP header to a multiple of 16.
+	 */
+	skb_reserve(skb, 4);
+
 	addr = dma_map_single(priv->ca->dma_device,
 			      skb->data, IPOIB_BUF_SIZE,
 			      DMA_FROM_DEVICE);
-	pci_unmap_addr_set(&priv->rx_ring[id], mapping, addr);
-
-	ret = ipoib_ib_receive(priv, id, addr);
-	if (ret) {
-		ipoib_warn(priv, "ipoib_ib_receive failed for buf %d (%d)\n",
-			   id, ret);
-		dma_unmap_single(priv->ca->dma_device, addr,
-				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(addr))) {
 		dev_kfree_skb_any(skb);
-		priv->rx_ring[id].skb = NULL;
+		return -EIO;
 	}
 
-	return ret;
+	priv->rx_ring[id].skb     = skb;
+	priv->rx_ring[id].mapping = addr;
+
+	return 0;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -154,6 +162,10 @@ static int ipoib_ib_post_receives(struct
 	int i;
 
 	for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+		if (ipoib_alloc_rx_skb(dev, i)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+			return -ENOMEM;
+		}
 		if (ipoib_ib_post_receive(dev, i)) {
 			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
 			return -EIO;
@@ -176,28 +188,36 @@ static void ipoib_ib_handle_wc(struct ne
 		wr_id &= ~IPOIB_OP_RECV;
 
 		if (wr_id < IPOIB_RX_RING_SIZE) {
-			struct sk_buff *skb = priv->rx_ring[wr_id].skb;
-
-			priv->rx_ring[wr_id].skb = NULL;
+			struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
+			dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
 
-			dma_unmap_single(priv->ca->dma_device,
-					 pci_unmap_addr(&priv->rx_ring[wr_id],
-							mapping),
-					 IPOIB_BUF_SIZE,
-					 DMA_FROM_DEVICE);
-
-			if (wc->status != IB_WC_SUCCESS) {
+			if (unlikely(wc->status != IB_WC_SUCCESS)) {
 				if (wc->status != IB_WC_WR_FLUSH_ERR)
 					ipoib_warn(priv, "failed recv event "
 						   "(status=%d, wrid=%d vend_err %x)\n",
 						   wc->status, wr_id, wc->vendor_err);
+				dma_unmap_single(priv->ca->dma_device, addr,
+						 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
 				dev_kfree_skb_any(skb);
+				priv->rx_ring[wr_id].skb = NULL;
 				return;
 			}
 
+			/*
+			 * If we can't allocate a new RX buffer, dump
+			 * this packet and reuse the old buffer.
+			 */
+			if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+				++priv->stats.rx_dropped;
+				goto repost;
+			}
+
 			ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 				       wc->byte_len, wc->slid);
 
+			dma_unmap_single(priv->ca->dma_device, addr,
+					 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+
 			skb_put(skb, wc->byte_len);
 			skb_pull(skb, IB_GRH_BYTES);
 
@@ -220,8 +240,8 @@ static void ipoib_ib_handle_wc(struct ne
 				dev_kfree_skb_any(skb);
 			}
 
-			/* repost receive */
-			if (ipoib_ib_post_receive(dev, wr_id))
+		repost:
+			if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
 				ipoib_warn(priv, "ipoib_ib_post_receive failed "
 					   "for buf %d\n", wr_id);
 		} else
@@ -229,7 +249,7 @@ static void ipoib_ib_handle_wc(struct ne
 				   wr_id);
 
 	} else {
-		struct ipoib_buf *tx_req;
+		struct ipoib_tx_buf *tx_req;
 		unsigned long flags;
 
 		if (wr_id >= IPOIB_TX_RING_SIZE) {
@@ -302,7 +322,7 @@ void ipoib_send(struct net_device *dev, 
 		struct ipoib_ah *address, u32 qpn)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	dma_addr_t addr;
 
 	if (skb->len > dev->mtu + INFINIBAND_ALEN) {
@@ -468,7 +488,7 @@ int ipoib_ib_dev_stop(struct net_device 
 	struct ib_qp_attr qp_attr;
 	int attr_mask;
 	unsigned long begin;
-	struct ipoib_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	int i;
 
 	/* Kill the existing QP and allocate a new one */


