[ofa-general] socket buffer accounting with UDP/ipoib

Thu Jul 5 14:34:35 PDT 2007

Copying small packets into a new skb is actually a fairly
well-established optimization to avoid the overhead of allocating a
new skb.  For example look for RX_COPY_THRESHOLD in tg3 or copybreak
in e1000.  So this approach makes sense to me.  However, a few
comments about your patch:

 > +#define SKB_LEN_THOLD 150

150 is probably not the right value; the cost of copying half a
cacheline is probably nearly the same as a full cacheline, so this
should probably be a multiple of 64 (or at least 32, since I don't
know of any arch with smaller than 32-byte cachelines).

With that said I don't know what the right value is here.  256 seems
to be a popular choice; I guess it is system-dependent but I don't
think it makes sense to add yet another knob to adjust this.

 > +		if (skb->len < SKB_LEN_THOLD) {
 > +			nskb = dev_alloc_skb(skb->len);
 > +			if (!nskb) {
 > +				ipoib_warn(priv, "failed to allocate skb\n");
 > +				return;
 > +			}
 > +			memcpy(nskb->data, skb->data, skb->len);

should be skb_copy_from_linear_data()

 > +			skb_put(nskb, skb->len);
 > +			nskb->protocol = skb->protocol;
 > +			dev_kfree_skb_any(skb);

and there's no point in freeing the old skb... we should repost it to
the receive queue instead.

 > +			skb = nskb;
 > +		}

And I think we would want something similar for ipoib_cm.c too.

Your patch also made me look again at how we handle packets the HCA
replicates back to us... there's no reason to free the skb and
allocate a new one; we could just repost the same skb again.  So the
patch below seems like it might help multicast senders.  What do
people think about putting this into 2.6.23?

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 8404f05..1094488 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -197,6 +197,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	}
 
 	/*
+	 * Drop packets that this interface sent, ie multicast packets
+	 * that the HCA has replicated.
+	 */
+	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
+		goto repost;
+
+	/*
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
@@ -213,24 +220,18 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
 
-	if (wc->slid != priv->local_lid ||
-	    wc->src_qp != priv->qp->qp_num) {
-		skb->protocol = ((struct ipoib_header *) skb->data)->proto;
-		skb_reset_mac_header(skb);
-		skb_pull(skb, IPOIB_ENCAP_LEN);
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	skb_reset_mac_header(skb);
+	skb_pull(skb, IPOIB_ENCAP_LEN);
 
-		dev->last_rx = jiffies;
-		++priv->stats.rx_packets;
-		priv->stats.rx_bytes += skb->len;
+	dev->last_rx = jiffies;
+	++priv->stats.rx_packets;
+	priv->stats.rx_bytes += skb->len;
 
-		skb->dev = dev;
-		/* XXX get correct PACKET_ type here */
-		skb->pkt_type = PACKET_HOST;
-		netif_receive_skb(skb);
-	} else {
-		ipoib_dbg_data(priv, "dropping loopback packet\n");
-		dev_kfree_skb_any(skb);
-	}
+	skb->dev = dev;
+	/* XXX get correct PACKET_ type here */
+	skb->pkt_type = PACKET_HOST;
+	netif_receive_skb(skb);
 
 repost:
 	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))