[ofa-general] IPOIB CM (NOSRQ)[PATCH V2] patch for review

Pradeep Satyanarayana pradeep at us.ibm.com
Wed Apr 18 17:56:44 PDT 2007


Here is a second version of the IPOIB_CM_NOSRQ patch for review. This patch will benefit adapters that do not support shared receive queues (SRQs).

This patch incorporates the previous review comments:
- #ifdefs removed; a single binary drives HCAs that do and do not support SRQs
- avoids linear traversal through a list of QPs (see the wr_id sketch after this list)
- extraneous code removed
- compile-time selection removed
- no HTML version as part of this patch
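Since there is no SRQ to demultiplex receive completions, the NOSRQ path packs everything needed to locate a buffer into the 64-bit wr_id: the low bits carry an index into rx_index_ring[] (bounded by NOSRQ_INDEX_MASK), bit 29 (IPOIB_CM_OP_NOSRQ) marks the completion as a no-SRQ receive, and the high 32 bits carry the slot within that QP's private rx_ring. This is what lets ipoib_cm_handle_rx_wc() go from wc->wr_id straight to the right struct ipoib_cm_rx instead of walking a list of QPs. A minimal userspace sketch of the encoding follows; the constants mirror the patch (widened to 64 bits for this sketch) and the two helper functions are illustrative only, not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Constants mirroring the patch (ipoib.h), widened for this sketch */
#define IPOIB_CM_OP_NOSRQ     (1ull << 29)
#define NOSRQ_INDEX_RING_SIZE 1024
#define NOSRQ_INDEX_MASK      0x00000000000003ffull

/* Illustrative helper: build the 64-bit wr_id that
 * ipoib_cm_post_receive() posts on the NOSRQ path. */
static uint64_t nosrq_pack_wr_id(uint32_t slot, uint32_t index)
{
        return ((uint64_t) slot << 32) | index | IPOIB_CM_OP_NOSRQ;
}

/* Illustrative helper: recover the ring slot and the rx_index_ring[]
 * index from a completion, as ipoib_cm_handle_rx_wc() does. */
static void nosrq_unpack_wr_id(uint64_t wr_id, uint32_t *slot, uint32_t *index)
{
        *slot  = (uint32_t) (wr_id >> 32);
        *index = (uint32_t) ((wr_id & ~IPOIB_CM_OP_NOSRQ) & NOSRQ_INDEX_MASK);
}

int main(void)
{
        uint32_t slot, index;

        nosrq_unpack_wr_id(nosrq_pack_wr_id(5, 42), &slot, &index);
        printf("slot=%u index=%u\n", slot, index); /* prints slot=5 index=42 */
        return 0;
}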

This patch has been tested with linux-2.6.21-rc5 and -rc7, with Topspin and IBM HCAs on ppc64 machines. I have run netperf between two IBM HCAs and between two Topspin HCAs, as well as between an IBM and a Topspin HCA.

Note 1: I made an interesting discovery when I ran netperf between a Topspin and an IBM HCA: I started to see the IB_WC_RETRY_EXC_ERR error upon send completion. This may have been due to the difference in processing speeds of the two HCAs. It was rectified by setting retry_count to a non-zero value in ipoib_cm_send_req(). I had to do this in spite of the comment --> /* RFC draft warns against retries */

Can someone point me to where this comment is in the RFC? I would like to understand the reasoning.

Note 2: The IB_WC_RETRY_EXC_ERR error is not seen when the two HCAs are of the same type.
Note 3: Another small patch (not included in this one) is needed in the ehca driver for it to work on the IBM HCAs.

Signed-off-by: Pradeep Satyanarayana <pradeep at us.ibm.com>
---
--- linux-2.6.21-rc5.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-04-02 17:44:58.000000000 -0700
+++ linux-2.6.21-rc5/drivers/infiniband/ulp/ipoib/ipoib.h	2007-04-03 10:59:54.000000000 -0700
@@ -99,6 +99,12 @@ enum {
 #define                 IPOIB_OP_RECV   (1ul << 31)
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 #define                 IPOIB_CM_OP_SRQ (1ul << 30)
+#define                 IPOIB_CM_OP_NOSRQ (1ul << 29)
+
+/* These two go hand in hand */
+#define                 NOSRQ_INDEX_RING_SIZE 1024
+#define                 NOSRQ_INDEX_MASK      0x00000000000003ff
+
 #else
 #define                 IPOIB_CM_OP_SRQ (0)
 #endif
@@ -136,9 +142,11 @@ struct ipoib_cm_data {
 struct ipoib_cm_rx {
                 struct ib_cm_id     *id;
                 struct ib_qp        *qp;
+                struct ipoib_cm_rx_buf *rx_ring;
                 struct list_head     list;
                 struct net_device   *dev;
                 unsigned long        jiffies;
+                u32                  index;
 };
 
 struct ipoib_cm_tx {
@@ -177,6 +185,7 @@ struct ipoib_cm_dev_priv {
                 struct ib_wc            ibwc[IPOIB_NUM_WC];
                 struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
                 struct ib_recv_wr       rx_wr;
+                struct ipoib_cm_rx      **rx_index_ring;
 };
 
 /*
--- linux-2.6.21-rc5.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-04-02 17:44:58.000000000 -0700
+++ linux-2.6.21-rc5/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-04-18 16:23:12.000000000 -0700
@@ -76,35 +76,73 @@ static void ipoib_cm_dma_unmap_rx(struct
                                 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
-static int ipoib_cm_post_receive(struct net_device *dev, int id)
+static int ipoib_cm_post_receive(struct net_device *dev, u64 id)
 {
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
                 struct ib_recv_wr *bad_wr;
                 int i, ret;
+                u32 index;
+                u64 wr_id;
+                struct ipoib_cm_rx *rx_ptr;
+                unsigned long flags;
 
-                priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+                if (priv->cm.srq) {
+                                priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; /* Check id val */
 
-                for (i = 0; i < IPOIB_CM_RX_SG; ++i)
-                                priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+                                for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+                                                priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+                                ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+                                if (unlikely(ret)) {
+                                                ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+                                                ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+                                                                      priv->cm.srq_ring[id].mapping);
+                                                dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+                                                priv->cm.srq_ring[id].skb = NULL;
+                                }
+                } else {
+                                index = id & NOSRQ_INDEX_MASK;
+                                wr_id = id >> 32;
 
-                ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
-                if (unlikely(ret)) {
-                                ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-                                ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-                                                      priv->cm.srq_ring[id].mapping);
-                                dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
-                                priv->cm.srq_ring[id].skb = NULL;
-                }
+                                /*
+                                 * There is a slender chance of a race between the stale_task
+                                 * running after a period of inactivity and the receipt of
+                                 * a packet being processed at about the same instant.
+                                 * Hence the lock.
+                                 */
+                                spin_lock_irqsave(&priv->lock, flags);
+                                rx_ptr = priv->cm.rx_index_ring[index];
+                                spin_unlock_irqrestore(&priv->lock, flags);
+
+                                priv->cm.rx_wr.wr_id = wr_id << 32 | index | IPOIB_CM_OP_NOSRQ;
+
+                                for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+                                                priv->cm.rx_sge[i].addr = rx_ptr->rx_ring[wr_id].mapping[i];
+
+                                ret = ib_post_recv(rx_ptr->qp, &priv->cm.rx_wr, &bad_wr);
+                                if (unlikely(ret)) {
+                                                ipoib_warn(priv, "post recv failed for buf %d (%d)\n",
+                                                           wr_id, ret);
+                                                ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+                                                                      rx_ptr->rx_ring[wr_id].mapping);
+                                                dev_kfree_skb_any(rx_ptr->rx_ring[wr_id].skb);
+                                                rx_ptr->rx_ring[wr_id].skb = NULL;
+                                }
+                } /* else NO SRQ */
 
                 return ret;
 }
 
-static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, u64 id, int frags,
                                              u64 mapping[IPOIB_CM_RX_SG])
 {
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
                 struct sk_buff *skb;
                 int i;
+                struct ipoib_cm_rx *rx_ptr;
+                u32 index, wr_id;
+                unsigned long flags;
 
                 skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
                 if (unlikely(!skb))
@@ -123,7 +161,7 @@ static struct sk_buff *ipoib_cm_alloc_rx
                                 return NULL;
                 }
 
                 for (i = 0; i < frags; i++) {
                                 struct page *page = alloc_page(GFP_ATOMIC);
 
                                 if (!page)
@@ -136,7 +174,17 @@ static struct sk_buff *ipoib_cm_alloc_rx
                                                 goto partial_error;
                 }
 
-                priv->cm.srq_ring[id].skb = skb;
+                if (priv->cm.srq)
+                                priv->cm.srq_ring[id].skb = skb;
+                else {
+                                index = id & NOSRQ_INDEX_MASK;
+                                wr_id = id >> 32;
+                                spin_lock_irqsave(&priv->lock, flags);
+                                rx_ptr = priv->cm.rx_index_ring[index];
+                                spin_unlock_irqrestore(&priv->lock, flags);
+
+                                rx_ptr->rx_ring[wr_id].skb = skb;
+                }
                 return skb;
 
 partial_error:
@@ -157,13 +205,20 @@ static struct ib_qp *ipoib_cm_create_rx_
                 struct ib_qp_init_attr attr = {
                                 .send_cq = priv->cq, /* does not matter, we never send anything */
                                 .recv_cq = priv->cq,
-                                .srq = priv->cm.srq,
                                 .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+                                .cap.max_recv_wr = ipoib_recvq_size + 1,
                                 .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+                                .cap.max_recv_sge = IPOIB_CM_RX_SG, /* Is this correct? */
                                 .sq_sig_type = IB_SIGNAL_ALL_WR,
                                 .qp_type = IB_QPT_RC,
                                 .qp_context = p,
                 };
+
+                if (priv->cm.srq)
+                                attr.srq = priv->cm.srq;
+                else
+                                attr.srq = NULL;
+
                 return ib_create_qp(priv->pd, &attr);
 }
 
@@ -217,9 +272,13 @@ static int ipoib_cm_send_rep(struct net_
                 rep.flow_control = 0;
                 rep.rnr_retry_count = req->rnr_retry_count;
                 rep.target_ack_delay = 20; /* FIXME */
-                rep.srq = 1;
                 rep.qp_num = qp->qp_num;
                 rep.starting_psn = psn;
+
+                if (priv->cm.srq)
+                                rep.srq = 1;
+                else
+                                rep.srq = 0;
                 return ib_send_cm_rep(cm_id, &rep);
 }
 
@@ -231,6 +290,8 @@ static int ipoib_cm_req_handler(struct i
                 unsigned long flags;
                 unsigned psn;
                 int ret;
+                u32 qp_num, index;
+                u64 i;
 
                 ipoib_dbg(priv, "REQ arrived\n");
                 p = kzalloc(sizeof *p, GFP_KERNEL);
@@ -244,10 +305,69 @@ static int ipoib_cm_req_handler(struct i
                                 goto err_qp;
                 }
 
-                psn = random32() & 0xffffff;
-                ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
-                if (ret)
-                                goto err_modify;
+                if (priv->cm.srq == NULL) { /* NOSRQ */
+                                qp_num = p->qp->qp_num;
+                                /* Allocate space for the rx_ring here */
+                                p->rx_ring = kzalloc(ipoib_recvq_size * sizeof *p->rx_ring,
+                                                     GFP_KERNEL);
+                                if (p->rx_ring == NULL)
+                                                return -ENOMEM;
+
+                                cm_id->context = p;
+                                p->jiffies = jiffies;
+                                spin_lock_irqsave(&priv->lock, flags);
+                                list_add(&p->list, &priv->cm.passive_ids);
+
+                                /* Find an empty rx_index_ring[] entry */
+                                for (index = 0; index < NOSRQ_INDEX_RING_SIZE; index++)
+                                                if (priv->cm.rx_index_ring[index] == NULL)
+                                                                break;
+
+                                if (index == NOSRQ_INDEX_RING_SIZE) {
+                                                spin_unlock_irqrestore(&priv->lock, flags);
+                                                printk(KERN_WARNING "NOSRQ supports a max of %d RC "
+                                                       "QPs. That limit has now been reached\n",
+                                                       NOSRQ_INDEX_RING_SIZE);
+                                                return -EINVAL;
+                                }
+
+                                /* Store the pointer to retrieve it later using the index */
+                                priv->cm.rx_index_ring[index] = p;
+                                spin_unlock_irqrestore(&priv->lock, flags);
+                                p->index = index;
+
+                                psn = random32() & 0xffffff;
+                                ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
+                                if (ret) {
+                                                ipoib_warn(priv, "ipoib_cm_modify_rx_qp() failed %d\n",
+                                                           ret);
+                                                goto err_modify;
+                                }
+
+                                for (i = 0; i < ipoib_recvq_size; ++i) {
+                                                if (!ipoib_cm_alloc_rx_skb(dev, i << 32 | index,
+                                                                           IPOIB_CM_RX_SG - 1,
+                                                                           p->rx_ring[i].mapping)) {
+                                                                ipoib_warn(priv, "failed to allocate receive "
+                                                                           "buffer %d\n", i);
+                                                                ipoib_cm_dev_cleanup(dev);
+                                                                return -ENOMEM;
+                                                }
+
+                                                if (ipoib_cm_post_receive(dev, i << 32 | index)) {
+                                                                ipoib_warn(priv, "ipoib_ib_post_receive "
+                                                                           "failed for buf %d\n", i);
+                                                                ipoib_cm_dev_cleanup(dev);
+                                                                return -EIO;
+                                                }
+                                }
+                } else { /* SRQ */
+                                p->rx_ring = NULL; /* This is used only by NOSRQ */
+                                psn = random32() & 0xffffff;
+                                ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
+                                if (ret)
+                                                goto err_modify;
+                }
 
                 ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
                 if (ret) {
@@ -255,13 +375,15 @@ static int ipoib_cm_req_handler(struct i
                                 goto err_rep;
                 }
 
-                cm_id->context = p;
-                p->jiffies = jiffies;
-                spin_lock_irqsave(&priv->lock, flags);
-                list_add(&p->list, &priv->cm.passive_ids);
-                spin_unlock_irqrestore(&priv->lock, flags);
+                if (priv->cm.srq) {
+                                cm_id->context = p;
+                                p->jiffies = jiffies;
+                                spin_lock_irqsave(&priv->lock, flags);
+                                list_add(&p->list, &priv->cm.passive_ids);
+                                spin_unlock_irqrestore(&priv->lock, flags);
+                }
                 queue_delayed_work(ipoib_workqueue,
                                    &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
                 return 0;
 
 err_rep:
@@ -344,12 +466,19 @@ static void skb_put_frags(struct sk_buff
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
-                unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
                 struct sk_buff *skb, *newskb;
                 struct ipoib_cm_rx *p;
                 unsigned long flags;
-                u64 mapping[IPOIB_CM_RX_SG];
+                u64 mapping[IPOIB_CM_RX_SG], wr_id;
+                u32 index;
                 int frags;
+                struct ipoib_cm_rx *rx_ptr;
+
+                if (priv->cm.srq)
+                                wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+                else
+                                wr_id = wc->wr_id >> 32;
 
                 ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
                                wr_id, wc->opcode, wc->status);
@@ -360,7 +489,16 @@ void ipoib_cm_handle_rx_wc(struct net_de
                                 return;
                 }
 
-                skb  = priv->cm.srq_ring[wr_id].skb;
+                if (priv->cm.srq)
+                                skb = priv->cm.srq_ring[wr_id].skb;
+                else {
+                                index = (wc->wr_id & ~IPOIB_CM_OP_NOSRQ) & NOSRQ_INDEX_MASK;
+                                spin_lock_irqsave(&priv->lock, flags);
+                                rx_ptr = priv->cm.rx_index_ring[index];
+                                spin_unlock_irqrestore(&priv->lock, flags);
+
+                                skb = rx_ptr->rx_ring[wr_id].skb;
+                } /* NOSRQ */
 
                 if (unlikely(wc->status != IB_WC_SUCCESS)) {
                                 ipoib_dbg(priv, "cm recv error "
@@ -371,7 +509,13 @@ void ipoib_cm_handle_rx_wc(struct net_de
                 }
 
                 if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
-                                p = wc->qp->qp_context;
+                                /* There are no guarantees that wc->qp is not NULL
+                                 * for HCAs that do not support SRQ. */
+                                if (priv->cm.srq == NULL)
+                                                p = rx_ptr;
+                                else
+                                                p = wc->qp->qp_context;
+
                                 if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
                                                 spin_lock_irqsave(&priv->lock, flags);
                                                 p->jiffies = jiffies;
@@ -388,7 +532,11 @@ void ipoib_cm_handle_rx_wc(struct net_de
                 frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
                                    (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
-                newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+                if (priv->cm.srq)
+                                newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+                else
+                                newskb = ipoib_cm_alloc_rx_skb(dev, wr_id << 32 | index, frags,
+                                                               mapping);
                 if (unlikely(!newskb)) {
                                 /*
                                  * If we can't allocate a new RX buffer, dump
@@ -399,13 +547,22 @@ void ipoib_cm_handle_rx_wc(struct net_de
                                 goto repost;
                 }
 
-                ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
-                memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+                if (priv->cm.srq) {
+                                ipoib_cm_dma_unmap_rx(priv, frags,
+                                                      priv->cm.srq_ring[wr_id].mapping);
+                                memcpy(priv->cm.srq_ring[wr_id].mapping, mapping,
+                                       (frags + 1) * sizeof *mapping);
+                } else {
+                                ipoib_cm_dma_unmap_rx(priv, frags,
+                                                      rx_ptr->rx_ring[wr_id].mapping);
+                                memcpy(rx_ptr->rx_ring[wr_id].mapping, mapping,
+                                       (frags + 1) * sizeof *mapping);
+                }
 
                 ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
                                wc->byte_len, wc->slid);
 
                 skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
                 skb->protocol = ((struct ipoib_header *) skb->data)->proto;
                 skb->mac.raw = skb->data;
@@ -418,12 +575,19 @@ void ipoib_cm_handle_rx_wc(struct net_de
                 skb->dev = dev;
                 /* XXX get correct PACKET_ type here */
                 skb->pkt_type = PACKET_HOST;
+
                 netif_rx_ni(skb);
 
 repost:
-                if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
-                                ipoib_warn(priv, "ipoib_cm_post_receive failed "
-                                           "for buf %d\n", wr_id);
+                if (priv->cm.srq) {
+                                if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
+                                                ipoib_warn(priv, "ipoib_cm_post_receive failed "
+                                                           "for buf %d\n", wr_id);
+                } else {
+                                if (unlikely(ipoib_cm_post_receive(dev, wr_id << 32 | index)))
+                                                ipoib_warn(priv, "ipoib_cm_post_receive failed "
+                                                           "for buf %d\n", wr_id);
+                }
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
@@ -432,6 +596,9 @@ static inline int post_send(struct ipoib
                                    u64 addr, int len)
 {
                 struct ib_send_wr *bad_wr;
+                struct ib_qp_attr qp_attr;
+                struct ib_qp_init_attr qp_init_attr;
+                int ret, qp_attr_mask;
 
                 priv->tx_sge.addr             = addr;
                 priv->tx_sge.length           = len;
@@ -613,6 +780,7 @@ void ipoib_cm_dev_stop(struct net_device
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
                 struct ipoib_cm_rx *p;
                 unsigned long flags;
+                int i;
 
                 if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
                                 return;
@@ -621,6 +789,17 @@ void ipoib_cm_dev_stop(struct net_device
                 spin_lock_irqsave(&priv->lock, flags);
                 while (!list_empty(&priv->cm.passive_ids)) {
                                 p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+                                if (priv->cm.srq == NULL) {
+                                                for (i = 0; i < ipoib_recvq_size; ++i)
+                                                                if (p->rx_ring[i].skb) {
+                                                                                ipoib_cm_dma_unmap_rx(priv,
+                                                                                                      IPOIB_CM_RX_SG - 1,
+                                                                                                      p->rx_ring[i].mapping);
+                                                                                dev_kfree_skb_any(p->rx_ring[i].skb);
+                                                                                p->rx_ring[i].skb = NULL;
+                                                                }
+                                                kfree(p->rx_ring);
+                                }
                                 list_del_init(&p->list);
                                 spin_unlock_irqrestore(&priv->lock, flags);
                                 ib_destroy_cm_id(p->id);
@@ -707,9 +886,14 @@ static struct ib_qp *ipoib_cm_create_tx_
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
                 struct ib_qp_init_attr attr = {};
                 attr.recv_cq = priv->cq;
-                attr.srq = priv->cm.srq;
+                if (priv->cm.srq)
+                                attr.srq = priv->cm.srq;
+                else
+                                attr.srq = NULL;
                 attr.cap.max_send_wr = ipoib_sendq_size;
+                attr.cap.max_recv_wr = 1; /* Not in MST code */
                 attr.cap.max_send_sge = 1;
+                attr.cap.max_recv_sge = 1; /* Not in MST code */
                 attr.sq_sig_type = IB_SIGNAL_ALL_WR;
                 attr.qp_type = IB_QPT_RC;
                 attr.send_cq = cq;
@@ -746,10 +930,13 @@ static int ipoib_cm_send_req(struct net_
                 req.responder_resources        = 4;
                 req.remote_cm_response_timeout = 20;
                 req.local_cm_response_timeout  = 20;
-                req.retry_count                = 0; /* RFC draft warns against retries */
-                req.rnr_retry_count            = 0; /* RFC draft warns against retries */
+                req.retry_count                = 6; /* RFC draft warns against retries */
+                req.rnr_retry_count            = 6; /* RFC draft warns against retries */
                 req.max_cm_retries             = 15;
-                req.srq                        = 1;
+                if (priv->cm.srq)
+                                req.srq = 1;
+                else
+                                req.srq = 0;
                 return ib_send_cm_req(id, &req);
 }
 
@@ -1089,6 +1276,7 @@ static void ipoib_cm_stale_task(struct w
                                    cm.stale_task.work);
                 struct ipoib_cm_rx *p;
                 unsigned long flags;
+                int i;
 
                 spin_lock_irqsave(&priv->lock, flags);
                 while (!list_empty(&priv->cm.passive_ids)) {
@@ -1097,6 +1285,19 @@ static void ipoib_cm_stale_task(struct w
                                 p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
                                 if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
                                                 break;
+                                if (priv->cm.srq == NULL) { /* NOSRQ */
+                                                for (i = 0; i < ipoib_recvq_size; ++i)
+                                                                if (p->rx_ring[i].skb) {
+                                                                                ipoib_cm_dma_unmap_rx(priv,
+                                                                                                      IPOIB_CM_RX_SG - 1,
+                                                                                                      p->rx_ring[i].mapping);
+                                                                                dev_kfree_skb_any(p->rx_ring[i].skb);
+                                                                                p->rx_ring[i].skb = NULL;
+                                                                }
+                                                /* Free the rx_ring */
+                                                kfree(p->rx_ring);
+                                                priv->cm.rx_index_ring[p->index] = NULL;
+                                }
                                 list_del_init(&p->list);
                                 spin_unlock_irqrestore(&priv->lock, flags);
                                 ib_destroy_cm_id(p->id);
@@ -1154,13 +1355,9 @@ int ipoib_cm_add_mode_attr(struct net_de
 int ipoib_cm_dev_init(struct net_device *dev)
 {
                 struct ipoib_dev_priv *priv = netdev_priv(dev);
-                struct ib_srq_init_attr srq_init_attr = {
-                                .attr = {
-                                                .max_wr  = ipoib_recvq_size,
-                                                .max_sge = IPOIB_CM_RX_SG
-                                }
-                };
-                int ret, i;
+                struct ib_srq_init_attr srq_init_attr;
+                int ret, i, supports_srq;
+                struct ib_device_attr attr;
 
                 INIT_LIST_HEAD(&priv->cm.passive_ids);
                 INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1172,21 +1369,43 @@ int ipoib_cm_dev_init(struct net_device
 
                 skb_queue_head_init(&priv->cm.skb_queue);
 
-                priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
-                if (IS_ERR(priv->cm.srq)) {
-                                ret = PTR_ERR(priv->cm.srq);
-                                priv->cm.srq = NULL;
-                                return ret;
-                }
+                if ((ret = ib_query_device(priv->ca, &attr)))
+                                return ret;
+                if (attr.max_srq)
+                                supports_srq = 1; /* This device supports SRQ */
+                else
+                                supports_srq = 0;
 
-                priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
-                                            GFP_KERNEL);
-                if (!priv->cm.srq_ring) {
-                                printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
-                                       priv->ca->name, ipoib_recvq_size);
-                                ipoib_cm_dev_cleanup(dev);
-                                return -ENOMEM;
-                }
+                if (supports_srq) {
+                                srq_init_attr.attr.max_wr = ipoib_recvq_size;
+                                srq_init_attr.attr.max_sge = IPOIB_CM_RX_SG;
+
+                                priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+                                if (IS_ERR(priv->cm.srq)) {
+                                                ret = PTR_ERR(priv->cm.srq);
+                                                priv->cm.srq = NULL;
+                                                return ret;
+                                }
+
+                                priv->cm.srq_ring = kzalloc(ipoib_recvq_size *
+                                                            sizeof *priv->cm.srq_ring,
+                                                            GFP_KERNEL);
+                                if (!priv->cm.srq_ring) {
+                                                printk(KERN_WARNING "%s: failed to allocate CM ring "
+                                                       "(%d entries)\n",
+                                                       priv->ca->name, ipoib_recvq_size);
+                                                ipoib_cm_dev_cleanup(dev);
+                                                return -ENOMEM;
+                                }
+                                priv->cm.rx_index_ring = NULL; /* Not needed for SRQ */
+                } else {
+                                priv->cm.srq = NULL;
+                                priv->cm.srq_ring = NULL;
+                                priv->cm.rx_index_ring = kzalloc(NOSRQ_INDEX_RING_SIZE *
+                                                                 sizeof *priv->cm.rx_index_ring,
+                                                                 GFP_KERNEL);
+                }
 
                 for (i = 0; i < IPOIB_CM_RX_SG; ++i)
                                 priv->cm.rx_sge[i].lkey = priv->mr->lkey;
@@ -1198,19 +1417,25 @@ int ipoib_cm_dev_init(struct net_device
                 priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
                 priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
 
-                for (i = 0; i < ipoib_recvq_size; ++i) {
-                                if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
-                                                           priv->cm.srq_ring[i].mapping)) {
-                                                ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
-                                                ipoib_cm_dev_cleanup(dev);
-                                                return -ENOMEM;
-                                }
-                                if (ipoib_cm_post_receive(dev, i)) {
-                                                ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
-                                                ipoib_cm_dev_cleanup(dev);
-                                                return -EIO;
-                                }
-                }
+                /* One can post receive buffers even before the RX QP is created
+                 * only in the SRQ case. Therefore for NOSRQ we skip the rest of
+                 * init and do that in ipoib_cm_req_handler() */
+                if (priv->cm.srq) {
+                                for (i = 0; i < ipoib_recvq_size; ++i) {
+                                                if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
+                                                                           priv->cm.srq_ring[i].mapping)) {
+                                                                ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+                                                                ipoib_cm_dev_cleanup(dev);
+                                                                return -ENOMEM;
+                                                }
+                                                if (ipoib_cm_post_receive(dev, i)) {
+                                                                ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+                                                                ipoib_cm_dev_cleanup(dev);
+                                                                return -EIO;
+                                                }
+                                }
+                } /* if supports SRQ */
 
                 priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
                 return 0;
--- linux-2.6.21-rc5.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-04-02 17:44:58.000000000 -0700
+++ linux-2.6.21-rc5/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-04-03 11:00:26.000000000 -0700
@@ -282,7 +282,7 @@ static void ipoib_ib_handle_tx_wc(struct
 
 static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
 {
-                if (wc->wr_id & IPOIB_CM_OP_SRQ)
+                if ((wc->wr_id & IPOIB_CM_OP_SRQ) || (wc->wr_id & IPOIB_CM_OP_NOSRQ))
                                 ipoib_cm_handle_rx_wc(dev, wc);
                 else if (wc->wr_id & IPOIB_OP_RECV)
                                 ipoib_ib_handle_rx_wc(dev, wc);
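
For reviewers who want to see which path a given adapter would take: the capability the patch consults in ipoib_cm_dev_init() (attr.max_srq from ib_query_device()) has a userspace counterpart in libibverbs. A small sketch, assuming libibverbs is installed (compile with -libverbs); this program is illustrative and not part of the patch:

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
        int num, i;
        struct ibv_device **list = ibv_get_device_list(&num);

        for (i = 0; i < num; ++i) {
                struct ibv_context *ctx = ibv_open_device(list[i]);
                struct ibv_device_attr attr;

                /* Same attribute the patch checks: max_srq == 0 means
                 * the HCA cannot create SRQs, so IPoIB CM would take
                 * the NOSRQ path. */
                if (ctx && !ibv_query_device(ctx, &attr))
                        printf("%s: max_srq=%d -> %s path\n",
                               ibv_get_device_name(list[i]), attr.max_srq,
                               attr.max_srq ? "SRQ" : "NOSRQ");
                if (ctx)
                        ibv_close_device(ctx);
        }
        ibv_free_device_list(list);
        return 0;
}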




Pradeep
pradeep at us.ibm.com