[ofa-general] socket buffer accounting with UDP/ipoib

Eli Cohen eli at mellanox.co.il
Thu Jul 5 06:55:22 PDT 2007


In UDP tests we have been running here, I noticed that when using a high
rate of UDP packets over ipoib, packets are sometimes dropped.
Investigating further, I found that the packets are dropped because
the socket buffer is exhausted and we fail in the following code:

net/core/sock.c

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}


In the condition above, skb->truesize is about the same as the size
allocated for the skb; for small packets, this charges the socket
far more than the packet data actually consumes.

I used the following patch to improve things in this regard: it copies
small packets into smaller skbs before passing them up the stack. I am
not saying this is the best way to handle this, but I would like to hear
opinions as to how we should address this problem.

Index: connectx_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- connectx_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-07-05 16:54:56.000000000 +0300
+++ connectx_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-07-05 17:10:32.000000000 +0300
@@ -50,6 +50,8 @@
 		 "Enable data path debug tracing if > 0");
 #endif
 
+#define SKB_LEN_THOLD 150
+
 static DEFINE_MUTEX(pkey_mutex);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -169,7 +171,7 @@
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *nskb;
 	u64 addr;
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
@@ -223,6 +225,19 @@
 		++priv->stats.rx_packets;
 		priv->stats.rx_bytes += skb->len;
 
+		if (skb->len < SKB_LEN_THOLD) {
+			nskb = dev_alloc_skb(skb->len);
+			if (!nskb) {
+				ipoib_warn(priv, "failed to allocate skb\n");
+				return;
+			}
+			memcpy(nskb->data, skb->data, skb->len);
+			skb_put(nskb, skb->len);
+			nskb->protocol = skb->protocol;
+			dev_kfree_skb_any(skb);
+			skb = nskb;
+		}
+
 		skb->dev = dev;
 		/* XXX get correct PACKET_ type here */
 		skb->pkt_type = PACKET_HOST;
@@ -350,7 +365,6 @@
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int n, i;
 
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 	do {
 		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
@@ -363,6 +377,7 @@
 				ipoib_ib_handle_tx_wc(dev, wc);
 		}
 	} while (n == IPOIB_NUM_WC);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 }
 #endif
 




More information about the general mailing list