[ofa-general] IPOIB CM (NOSRQ) [PATCH V9] patch

Sean Hefty mshefty at ichips.intel.com
Tue Oct 9 15:51:27 PDT 2007


I skipped over most of the code restructuring comments and focused mainly 
on design issues.  (Although code restructuring patches tend not to 
be written or easily accepted unless they fix a bug, and I would 
personally like to see at least some of the ones previously mentioned 
addressed before this code is merged.  The ones listed below should be 
trivial to incorporate before merging.)

> This version incorporates some of Sean's comments, especially
> relating to locking.
> 
> Sean's comments regarding module parameters, code restructure, 
> ipoib_cm_rx state and the like will require more discussion and 
> subsequent testing. They will be addressed with additional set 
> of patches later on.
> 
> This patch has been tested with linux-2.6.23-rc5 derived from Roland's
> for-2.6.24 git tree on ppc64 machines using IBM HCA.
> 
> Signed-off-by: Pradeep Satyanarayana <pradeeps at linux.vnet.ibm.com>
> --- 
> 
> --- a/linux-2.6.23-rc5/drivers/infiniband/ulp/ipoib/ipoib.h	2007-07-31 12:14:30.000000000 -0500
> +++ b/linux-2.6.23-rc5/drivers/infiniband/ulp/ipoib/ipoib.h	2007-09-18 14:31:07.000000000 -0500
> @@ -95,11 +95,13 @@ enum {
>  	IPOIB_MCAST_FLAG_ATTACHED = 3,
>  };
>  
> +#define CM_PACKET_SIZE (ALIGN(IPOIB_CM_MTU, PAGE_SIZE))
>  #define	IPOIB_OP_RECV   (1ul << 31)
> +
>  #ifdef CONFIG_INFINIBAND_IPOIB_CM
> -#define	IPOIB_CM_OP_SRQ (1ul << 30)
> +#define	IPOIB_CM_OP_RECV (1ul << 30)
>  #else
> -#define	IPOIB_CM_OP_SRQ (0)
> +#define	IPOIB_CM_OP_RECV (0)
>  #endif
>  
>  /* structs */
> @@ -166,11 +168,14 @@ enum ipoib_cm_state {
>  };
>  
>  struct ipoib_cm_rx {
> -	struct ib_cm_id     *id;
> -	struct ib_qp        *qp;
> -	struct list_head     list;
> -	struct net_device   *dev;
> -	unsigned long        jiffies;
> +	struct ib_cm_id     	*id;
> +	struct ib_qp        	*qp;
> +	struct ipoib_cm_rx_buf  *rx_ring; /* Used by no srq only */
> +	struct list_head     	 list;
> +	struct net_device   	*dev;
> +	unsigned long        	 jiffies;
> +	u32                      index; /* wr_ids are distinguished by index
> +					 * to identify the QP -no srq only */
>  	enum ipoib_cm_state  state;
>  };
>  
> @@ -215,6 +220,8 @@ struct ipoib_cm_dev_priv {
>  	struct ib_wc            ibwc[IPOIB_NUM_WC];
>  	struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
>  	struct ib_recv_wr       rx_wr;
> +	struct ipoib_cm_rx	**rx_index_table; /* See ipoib_cm_dev_init()
> +						   *for usage of this element */
>  };
>  
>  /*
> @@ -438,6 +445,7 @@ void ipoib_drain_cq(struct net_device *d
>  /* We don't support UC connections at the moment */
>  #define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
>  
> +extern int max_rc_qp;
>  static inline int ipoib_cm_admin_enabled(struct net_device *dev)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> --- a/linux-2.6.23-rc5/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-07-31 12:14:30.000000000 -0500
> +++ b/linux-2.6.23-rc5/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-09-18 17:04:06.000000000 -0500
> @@ -49,6 +49,18 @@ MODULE_PARM_DESC(cm_data_debug_level,
>  
>  #include "ipoib.h"
>  
> +int max_rc_qp = 128;
> +static int max_recv_buf = 1024; /* Default is 1024 MB */
> +
> +module_param_named(nosrq_max_rc_qp, max_rc_qp, int, 0444);
> +MODULE_PARM_DESC(nosrq_max_rc_qp, "Max number of no srq RC QPs supported; must be a power of 2");

I thought you were going to remove the power of 2 restriction.

And to re-start this discussion, I think we should separate the maximum 
number of QPs from whether we use SRQ, and let the QP type (UD, UC, RC) 
be controllable.  Smaller clusters may perform better without using 
SRQ, even if it is available.  And supporting UC versus RC seems like it 
should only take a few additional lines of code.

> +module_param_named(max_receive_buffer, max_recv_buf, int, 0644);
> +MODULE_PARM_DESC(max_receive_buffer, "Max Receive Buffer Size in MB");

Based on your response to my feedback, it sounds like the only reason 
we're keeping this parameter around is in case the admin sets some of 
the other values (max QPs, message size, RQ size) incorrectly.

I agree with Roland that we need to come up with the correct user 
interface here, and I'm not convinced that what we have is the most 
adaptable choice for where the code could go.  What about replacing the 
2 proposed parameters with these 3 (rough sketch below)?

qp_type - ud, uc, rc
use_srq - yes/no (default if available)
max_conn_qp - uc or rc limit
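
For concreteness, something like this is what I have in mind -- just a
sketch, with the names, types and defaults being my own suggestion
rather than anything taken from the patch:

static char *qp_type = "rc";    /* "ud", "uc" or "rc" */
static int use_srq = 1;         /* fall back to no-SRQ when unavailable */
static int max_conn_qp = 128;   /* cap on connected (UC/RC) QPs */

module_param(qp_type, charp, 0444);
MODULE_PARM_DESC(qp_type, "Transport used for the data path: ud, uc or rc");
module_param(use_srq, int, 0444);
MODULE_PARM_DESC(use_srq, "Use a shared receive queue if the HCA supports one");
module_param(max_conn_qp, int, 0444);
MODULE_PARM_DESC(max_conn_qp, "Maximum number of connected (UC/RC) QPs");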

> +
> +static atomic_t current_rc_qp = ATOMIC_INIT(0); /* Active number of RC QPs for no srq */
> +
> +#define NOSRQ_INDEX_MASK      (max_rc_qp -1)

Just reserve the lower bits of the wr_id for the rx_table index to avoid 
the power of 2 restriction.
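
I.e. keep the layout you already have (ring slot in the upper 32 bits),
but give the index field a fixed width instead of deriving the mask
from max_rc_qp -- roughly (the 16-bit width here is only illustrative):

#define NOSRQ_INDEX_MASK	((1UL << 16) - 1)

	wr_id = ((u64) ring_slot << 32) | index;   /* encode */

	index = wr_id & NOSRQ_INDEX_MASK;          /* decode works for any max_rc_qp */
	ring_slot = wr_id >> 32;

max_rc_qp then only needs to be range-checked when an index is
allocated, and it can be any value.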

>  #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
>  
>  #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
> @@ -81,20 +93,21 @@ static void ipoib_cm_dma_unmap_rx(struct
>  		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
>  }
>  
> -static int ipoib_cm_post_receive(struct net_device *dev, int id)
> +static int post_receive_srq(struct net_device *dev, u64 id)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ib_recv_wr *bad_wr;
>  	int i, ret;
>  
> -	priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
> +	priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_RECV;
>  
>  	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
>  		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
>  
>  	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
>  	if (unlikely(ret)) {
> -		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
> +		ipoib_warn(priv, "post srq failed for buf %lld (%d)\n",
> +			   (unsigned long long)id, ret);
>  		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
>  				      priv->cm.srq_ring[id].mapping);
>  		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
> @@ -104,12 +117,47 @@ static int ipoib_cm_post_receive(struct 
>  	return ret;
>  }
>  
> -static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
> +static int post_receive_nosrq(struct net_device *dev, u64 id)
> +{
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +	struct ib_recv_wr *bad_wr;
> +	int i, ret;
> +	u32 index;
> +	u32 wr_id;
> +	struct ipoib_cm_rx *rx_ptr;
> +
> +	index = id  & NOSRQ_INDEX_MASK;
> +	wr_id = id >> 32;
> +
> +	rx_ptr = priv->cm.rx_index_table[index];
> +
> +	priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_RECV;
> +
> +	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
> +		priv->cm.rx_sge[i].addr = rx_ptr->rx_ring[wr_id].mapping[i];
> +
> +	ret = ib_post_recv(rx_ptr->qp, &priv->cm.rx_wr, &bad_wr);
> +	if (unlikely(ret)) {
> +		ipoib_warn(priv, "post recv failed for buf %d (%d)\n",
> +			   wr_id, ret);
> +		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
> +				      rx_ptr->rx_ring[wr_id].mapping);
> +		dev_kfree_skb_any(rx_ptr->rx_ring[wr_id].skb);
> +		rx_ptr->rx_ring[wr_id].skb = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, u64 id,
> +					     int frags,
>  					     u64 mapping[IPOIB_CM_RX_SG])
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct sk_buff *skb;
>  	int i;
> +	struct ipoib_cm_rx *rx_ptr;
> +	u32 index, wr_id;
>  
>  	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
>  	if (unlikely(!skb))
> @@ -141,7 +189,14 @@ static struct sk_buff *ipoib_cm_alloc_rx
>  			goto partial_error;
>  	}
>  
> -	priv->cm.srq_ring[id].skb = skb;
> +	if (priv->cm.srq)
> +		priv->cm.srq_ring[id].skb = skb;
> +	else {
> +		index = id  & NOSRQ_INDEX_MASK;
> +		wr_id = id >> 32;
> +		rx_ptr = priv->cm.rx_index_table[index];
> +		rx_ptr->rx_ring[wr_id].skb = skb;
> +	}
>  	return skb;
>  
>  partial_error:
> @@ -203,11 +258,14 @@ static struct ib_qp *ipoib_cm_create_rx_
>  		.recv_cq = priv->cq,
>  		.srq = priv->cm.srq,
>  		.cap.max_send_wr = 1, /* For drain WR */
> +		.cap.max_recv_wr = ipoib_recvq_size + 1,
>  		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
>  		.sq_sig_type = IB_SIGNAL_ALL_WR,
>  		.qp_type = IB_QPT_RC,
>  		.qp_context = p,
>  	};
> +	if (!priv->cm.srq)
> +		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;

We can still toss this check and just set max_recv_sge unconditionally 
in the initializer above.

>  	return ib_create_qp(priv->pd, &attr);
>  }
>  
> @@ -281,12 +339,131 @@ static int ipoib_cm_send_rep(struct net_
>  	rep.private_data_len = sizeof data;
>  	rep.flow_control = 0;
>  	rep.rnr_retry_count = req->rnr_retry_count;
> -	rep.srq = 1;
>  	rep.qp_num = qp->qp_num;
>  	rep.starting_psn = psn;
> +	rep.srq = !!priv->cm.srq;
>  	return ib_send_cm_rep(cm_id, &rep);
>  }
>  
> +static void init_context_and_add_list(struct ib_cm_id *cm_id,
> +				    struct ipoib_cm_rx *p,
> +				    struct ipoib_dev_priv *priv)
> +{
> +	cm_id->context = p;
> +	p->jiffies = jiffies;
> +	spin_lock_irq(&priv->lock);
> +	if (list_empty(&priv->cm.passive_ids))
> +		queue_delayed_work(ipoib_workqueue,
> +				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
> +	if (priv->cm.srq) {
> +		/* Add this entry to passive ids list head, but do not re-add
> +		 * it if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush
> +		 * list.
> +		 */
> +		if (p->state == IPOIB_CM_RX_LIVE)
> +			list_move(&p->list, &priv->cm.passive_ids);
> +	}
> +	spin_unlock_irq(&priv->lock);
> +}
> +
> +static int allocate_and_post_rbuf_nosrq(struct ib_cm_id *cm_id,
> +					struct ipoib_cm_rx *p, unsigned psn)
> +{
> +	struct net_device *dev = cm_id->context;
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +	int ret;
> +	u32 index;
> +	u64 i, recv_mem_used;
> +
> +	/* In the SRQ case there is a common rx buffer called the srq_ring.
> +	 * However, for the no srq case we create an rx_ring for every
> +	 * struct ipoib_cm_rx.
> +	 */
> +	p->rx_ring = kzalloc(ipoib_recvq_size * sizeof *p->rx_ring, GFP_KERNEL);
> +	if (!p->rx_ring) {
> +		printk(KERN_WARNING "Failed to allocate rx_ring for 0x%x\n",
> +		       p->qp->qp_num);
> +		return -ENOMEM;
> +	}
> +
> +	spin_lock_irq(&priv->lock);
> +	list_add(&p->list, &priv->cm.passive_ids);
> +	spin_unlock_irq(&priv->lock);
> +
> +	init_context_and_add_list(cm_id, p, priv);
> +	spin_lock_irq(&priv->lock);

Just to avoid any possible races, how about holding the lock throughout 
and removing the lock acquisition from init_context_and_add_list()?
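
Roughly along these lines (sketch only, assuming init_context_and_add_list()
is changed to expect the caller to already hold priv->lock):

	spin_lock_irq(&priv->lock);
	list_add(&p->list, &priv->cm.passive_ids);
	init_context_and_add_list(cm_id, p, priv);

	for (index = 0; index < max_rc_qp; index++)
		if (priv->cm.rx_index_table[index] == NULL)
			break;

	/* limit checks, priv->cm.rx_index_table[index] = p, p->index = index */
	spin_unlock_irq(&priv->lock);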

> +
> +	for (index = 0; index < max_rc_qp; index++)
> +		if (priv->cm.rx_index_table[index] == NULL)
> +			break;
> +
> +	recv_mem_used = (u64)ipoib_recvq_size *
> +			(u64)atomic_inc_return(&current_rc_qp) * CM_PACKET_SIZE;
> +	if ((index == max_rc_qp) ||
> +	    (recv_mem_used >= max_recv_buf * (1ul << 20))) {
> +		spin_unlock_irq(&priv->lock);
> +		ipoib_warn(priv, "no srq has reached the configurable limit "
> +			   "of either %d RC QPs or, max recv buf size of "
> +			   "0x%x MB\n", max_rc_qp, max_recv_buf);
> +
> +		/* We send a REJ to the remote side indicating that we
> +		 * have no more free RC QPs and leave it to the remote side
> +		 * to take appropriate action. This should leave the
> +		 * current set of QPs unaffected and any subsequent REQs
> +		 * will be able to use RC QPs if they are available.
> +		 */
> +		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
> +		ret = -EINVAL;
> +		goto err_alloc_and_post;
> +	}
> +
> +	priv->cm.rx_index_table[index] = p;
> +
> +	/* We will subsequently use this stored pointer while freeing
> +	 * resources in stale task
> +	 */
> +	p->index = index;
> +	spin_unlock_irq(&priv->lock);
> +
> +	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
> +	if (ret) {
> +		ipoib_warn(priv, "ipoib_cm_modify_rx_qp() failed %d\n", ret);
> +		ipoib_cm_dev_cleanup(dev);
> +		goto err_alloc_and_post;
> +	}
> +
> +	for (i = 0; i < ipoib_recvq_size; ++i) {
> +		if (!ipoib_cm_alloc_rx_skb(dev, i << 32 | index,
> +					   IPOIB_CM_RX_SG - 1,
> +					   p->rx_ring[i].mapping)) {
> +			ipoib_warn(priv, "failed to allocate receive "
> +				   "buffer %d\n", (int)i);
> +			ipoib_cm_dev_cleanup(dev);
> +			ret = -ENOMEM;
> +			goto err_alloc_and_post;
> +		}
> +
> +		ret = post_receive_nosrq(dev, i << 32 | index);
> +		if (ret) {
> +			ipoib_warn(priv, "post_receive_nosrq "
> +				   "failed for  buf %lld\n", (unsigned long long)i);
> +			ipoib_cm_dev_cleanup(dev);
> +			ret = -EIO;
> +			goto err_alloc_and_post;
> +		}
> +	}
> +
> +	return 0;
> +
> +err_alloc_and_post:
> +	atomic_dec(&current_rc_qp);
> +	kfree(p->rx_ring);
> +	spin_lock_irq(&priv->lock);
> +	list_del_init(&p->list);
> +	spin_unlock_irq(&priv->lock);
> +	return ret;
> +}
> +
>  static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
>  {
>  	struct net_device *dev = cm_id->context;
> @@ -301,9 +478,6 @@ static int ipoib_cm_req_handler(struct i
>  		return -ENOMEM;
>  	p->dev = dev;
>  	p->id = cm_id;
> -	cm_id->context = p;
> -	p->state = IPOIB_CM_RX_LIVE;
> -	p->jiffies = jiffies;
>  	INIT_LIST_HEAD(&p->list);
>  
>  	p->qp = ipoib_cm_create_rx_qp(dev, p);
> @@ -313,19 +487,21 @@ static int ipoib_cm_req_handler(struct i
>  	}
>  
>  	psn = random32() & 0xffffff;
> -	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
> -	if (ret)
> -		goto err_modify;
> +	if (!priv->cm.srq) {
> +		ret = allocate_and_post_rbuf_nosrq(cm_id, p, psn);
> +		if (ret)
> +			goto err_modify;
> +	} else {
> +		p->rx_ring = NULL;
> +		ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
> +		if (ret)
> +			goto err_modify;
> +	}
>  
> -	spin_lock_irq(&priv->lock);
> -	queue_delayed_work(ipoib_workqueue,
> -			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
> -	/* Add this entry to passive ids list head, but do not re-add it
> -	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
> -	p->jiffies = jiffies;
> -	if (p->state == IPOIB_CM_RX_LIVE)
> -		list_move(&p->list, &priv->cm.passive_ids);
> -	spin_unlock_irq(&priv->lock);
> +	if (priv->cm.srq) {
> +		p->state = IPOIB_CM_RX_LIVE;
> +		init_context_and_add_list(cm_id, p, priv);
> +	}

Merge this if() statement with the if() immediately above it.
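
I.e. something like:

	if (!priv->cm.srq) {
		ret = allocate_and_post_rbuf_nosrq(cm_id, p, psn);
		if (ret)
			goto err_modify;
	} else {
		p->rx_ring = NULL;
		ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
		if (ret)
			goto err_modify;
		p->state = IPOIB_CM_RX_LIVE;
		init_context_and_add_list(cm_id, p, priv);
	}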

>  
>  	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
>  	if (ret) {
> @@ -398,29 +574,60 @@ static void skb_put_frags(struct sk_buff
>  	}
>  }
>  
> -void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
> +static void timer_check_srq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p)
> +{
> +	unsigned long flags;
> +
> +	if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
> +		spin_lock_irqsave(&priv->lock, flags);
> +		p->jiffies = jiffies;
> +		/* Move this entry to list head, but do
> +		 * not re-add it if it has been removed.
> +		 */
> +		if (p->state == IPOIB_CM_RX_LIVE)
> +			list_move(&p->list, &priv->cm.passive_ids);
> +		spin_unlock_irqrestore(&priv->lock, flags);
> +	}
> +}
> +
> +static void timer_check_nosrq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p)
> +{
> +	unsigned long flags;
> +
> +	if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
> +		spin_lock_irqsave(&priv->lock, flags);
> +		p->jiffies = jiffies;
> +		/* Move this entry to list head, but do
> +		 * not re-add it if it has been removed. */
> +		if (!list_empty(&p->list))
> +			list_move(&p->list, &priv->cm.passive_ids);
> +		spin_unlock_irqrestore(&priv->lock, flags);
> +	}
> +}
> +
> +void handle_rx_wc_srq(struct net_device *dev, struct ib_wc *wc)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> -	unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
> +	u64 wr_id = wc->wr_id & ~IPOIB_CM_OP_RECV;
>  	struct sk_buff *skb, *newskb;
>  	struct ipoib_cm_rx *p;
>  	unsigned long flags;
>  	u64 mapping[IPOIB_CM_RX_SG];
> -	int frags;
> +	int frags, ret;
>  
> -	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
> -		       wr_id, wc->status);
> +	ipoib_dbg_data(priv, "cm recv completion: id %lld, status: %d\n",
> +		       (unsigned long long)wr_id, wc->status);
>  
>  	if (unlikely(wr_id >= ipoib_recvq_size)) {
> -		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
> +		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_RECV)) {
>  			spin_lock_irqsave(&priv->lock, flags);
>  			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
>  			ipoib_cm_start_rx_drain(priv);
>  			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
>  			spin_unlock_irqrestore(&priv->lock, flags);
>  		} else
> -			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
> -				   wr_id, ipoib_recvq_size);
> +			ipoib_warn(priv, "cm recv completion event with wrid %lld (> %d)\n",
> +				   (unsigned long long)wr_id, ipoib_recvq_size);
>  		return;
>  	}
>  
> @@ -428,23 +635,15 @@ void ipoib_cm_handle_rx_wc(struct net_de
>  
>  	if (unlikely(wc->status != IB_WC_SUCCESS)) {
>  		ipoib_dbg(priv, "cm recv error "
> -			   "(status=%d, wrid=%d vend_err %x)\n",
> -			   wc->status, wr_id, wc->vendor_err);
> +			   "(status=%d, wrid=%lld vend_err %x)\n",
> +			   wc->status, (unsigned long long)wr_id, wc->vendor_err);
>  		++priv->stats.rx_dropped;
> -		goto repost;
> +		goto repost_srq;
>  	}
>  
>  	if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
>  		p = wc->qp->qp_context;
> -		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
> -			spin_lock_irqsave(&priv->lock, flags);
> -			p->jiffies = jiffies;
> -			/* Move this entry to list head, but do not re-add it
> -			 * if it has been moved out of list. */
> -			if (p->state == IPOIB_CM_RX_LIVE)
> -				list_move(&p->list, &priv->cm.passive_ids);
> -			spin_unlock_irqrestore(&priv->lock, flags);
> -		}
> +		timer_check_srq(priv, p);
>  	}
>  
>  	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
> @@ -456,13 +655,109 @@ void ipoib_cm_handle_rx_wc(struct net_de
>  		 * If we can't allocate a new RX buffer, dump
>  		 * this packet and reuse the old buffer.
>  		 */
> -		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
> +		ipoib_dbg(priv, "failed to allocate receive buffer %lld\n",
> +			  (unsigned long long)wr_id);
> +		++priv->stats.rx_dropped;
> +		goto repost_srq;
> +	}
> +
> +	ipoib_cm_dma_unmap_rx(priv, frags,
> +			      priv->cm.srq_ring[wr_id].mapping);
> +	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping,
> +	       (frags + 1) * sizeof *mapping);
> +	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
> +		       wc->byte_len, wc->slid);
> +
> +	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
> +
> +	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
> +	skb_reset_mac_header(skb);
> +	skb_pull(skb, IPOIB_ENCAP_LEN);
> +
> +	dev->last_rx = jiffies;
> +	++priv->stats.rx_packets;
> +	priv->stats.rx_bytes += skb->len;
> +
> +	skb->dev = dev;
> +	/* XXX get correct PACKET_ type here */
> +	skb->pkt_type = PACKET_HOST;
> +	netif_receive_skb(skb);
> +
> +repost_srq:
> +	ret = post_receive_srq(dev, wr_id);
> +
> +	if (unlikely(ret))
> +		ipoib_warn(priv, "post_receive_srq failed for buf %lld\n",
> +			   (unsigned long long)wr_id);
> +
> +}
> +
> +static void handle_rx_wc_nosrq(struct net_device *dev, struct ib_wc *wc)
> +{
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +	struct sk_buff *skb, *newskb;
> +	u64 mapping[IPOIB_CM_RX_SG], wr_id = wc->wr_id >> 32;
> +	u32 index;
> +	struct ipoib_cm_rx *rx_ptr;
> +	int frags, ret;
> +
> +	ipoib_dbg_data(priv, "cm recv completion: id %lld, status: %d\n",
> +		       (unsigned long long)wr_id, wc->status);
> +
> +	if (unlikely(wr_id >= ipoib_recvq_size)) {
> +		ipoib_warn(priv, "cm recv completion event with wrid %lld (> %d)\n",
> +				   (unsigned long long)wr_id, ipoib_recvq_size);
> +		return;
> +	}
> +
> +	index = (wc->wr_id & ~IPOIB_CM_OP_RECV) & NOSRQ_INDEX_MASK;
> +
> +	/* This is the only place where rx_ptr could be a NULL - could
> +	 * have just received a packet from a connection that has become
> +	 * stale and so is going away. We will simply drop the packet and
> +	 * let the remote end handle the dropped packet.
> +	 * In the timer_check() function below, p->jiffies is updated and
> +	 * hence the connection will not be stale after that.
> +	 */
> +	rx_ptr = priv->cm.rx_index_table[index];
> +	if (unlikely(!rx_ptr)) {
> +		ipoib_warn(priv, "Received packet from a connection "
> +			   "that is going away. Remote end will handle it.\n");
> +		return;
> +	}

I thought we could remove this check and the comment above it.  It's 
misleading to keep them around.

- Sean


