<br><font size=2 face="sans-serif">Roland,</font>
<br>
<br><font size=2 face="sans-serif">Here is the patch. This patch includes:</font>
<br>
<br><font size=2 face="sans-serif">1. separate the single CQ into a send CQ and a recv CQ</font>
<br><font size=2 face="sans-serif">2. increase both send/recv poll NUM_WC
from 4 to 32 </font>
<br><font size=2 face="sans-serif">3. add ____cacheline_aligned_in_smp to tx_ring, rx_ring,
send_ibwc and recv_ibwc</font>
<br><font size=2 face="sans-serif">4. add a tunable poll interval for both
send and recv.</font>
<br>
<br><font size=2 face="sans-serif">Example command line:</font>
<br><font size=2 face="sans-serif"> modprobe
ib_ipoib send_poll_interval=80 recv_poll_interval=10</font>
<br>
<br><font size=2 face="sans-serif">The attachment is the patch file for you to download.
If there are any problems, let me know.</font>
<br>
<br>
<br><font size=2 face="sans-serif">Signed-off-by: Shirley Ma &lt;xma@us.ibm.com&gt;</font>
<br>
<div>
<br><font size=2 face="sans-serif">diff -urN infiniband/ulp/ipoib/ipoib.h
infiniband-cq/ulp/ipoib/ipoib.h<br>
--- infiniband/ulp/ipoib/ipoib.h 2006-04-05
17:43:18.000000000 -0700<br>
+++ infiniband-cq/ulp/ipoib/ipoib.h 2006-04-19
08:40:42.030284464 -0700<br>
@@ -71,7 +71,8 @@<br>
IPOIB_MAX_QUEUE_SIZE
= 8192,<br>
IPOIB_MIN_QUEUE_SIZE
= 2,<br>
<br>
- IPOIB_NUM_WC
= 4,<br>
+ IPOIB_NUM_SEND_WC
= 32,<br>
+ IPOIB_NUM_RECV_WC
= 32,<br>
<br>
IPOIB_MAX_PATH_REC_QUEUE = 3,<br>
IPOIB_MAX_MCAST_QUEUE
= 3,<br>
@@ -151,7 +152,8 @@<br>
u16
pkey;<br>
struct ib_pd
*pd;<br>
struct ib_mr
*mr;<br>
- struct ib_cq
*cq;<br>
+ struct ib_cq
*send_cq;<br>
+ struct ib_cq
*recv_cq;<br>
struct ib_qp
*qp;<br>
u32
qkey;<br>
<br>
@@ -162,16 +164,17 @@<br>
unsigned int admin_mtu;<br>
unsigned int mcast_mtu;<br>
<br>
- struct ipoib_rx_buf *rx_ring;<br>
+ struct ipoib_rx_buf *rx_ring
____cacheline_aligned_in_smp;<br>
<br>
spinlock_t
tx_lock;<br>
- struct ipoib_tx_buf *tx_ring;<br>
+ struct ipoib_tx_buf *tx_ring
____cacheline_aligned_in_smp;
<br>
unsigned
tx_head;<br>
unsigned
tx_tail;</font>
<br><font size=2 face="sans-serif"> struct
ib_sge tx_sge;<br>
struct ib_send_wr tx_wr;<br>
<br>
- struct ib_wc ibwc[IPOIB_NUM_WC];<br>
+ struct ib_wc send_ibwc[IPOIB_NUM_SEND_WC]
____cacheline_aligned_in_smp;<br>
+ struct ib_wc recv_ibwc[IPOIB_NUM_RECV_WC]
____cacheline_aligned_in_smp;<br>
<br>
struct list_head dead_ahs;<br>
<br>
@@ -243,9 +246,13 @@<br>
<br>
extern struct workqueue_struct *ipoib_workqueue;<br>
<br>
+extern int ipoib_send_poll_interval;<br>
+extern int ipoib_recv_poll_interval;<br>
+<br>
/* functions */<br>
<br>
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);<br>
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr);<br>
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr);<br>
<br>
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,<br>
struct
ib_pd *pd, struct ib_ah_attr *attr);<br>
diff -urN infiniband/ulp/ipoib/ipoib_ib.c infiniband-cq/ulp/ipoib/ipoib_ib.c<br>
--- infiniband/ulp/ipoib/ipoib_ib.c 2006-04-05
17:43:18.000000000 -0700<br>
+++ infiniband-cq/ulp/ipoib/ipoib_ib.c 2006-04-19
08:56:40.395590792 -0700<br>
@@ -50,7 +50,6 @@<br>
"Enable
data path debug tracing if > 0");</font>
<br><font size=2 face="sans-serif"> #endif<br>
<br>
-#define IPOIB_OP_RECV
(1ul << 31)<br>
<br>
static DEFINE_MUTEX(pkey_mutex);<br>
<br>
@@ -108,7 +107,7 @@<br>
list.lkey = priv->mr->lkey;<br>
<br>
param.next = NULL;<br>
- param.wr_id = id | IPOIB_OP_RECV;<br>
+ param.wr_id = id;<br>
param.sg_list = &list;<br>
param.num_sge = 1;<br>
<br>
@@ -175,8 +174,8 @@<br>
return 0;<br>
}<br>
<br>
-static void ipoib_ib_handle_wc(struct net_device *dev,<br>
-
struct ib_wc *wc)<br>
+static void ipoib_ib_handle_recv_wc(struct net_device *dev,<br>
+
struct ib_wc *wc)<br>
{<br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
unsigned int wr_id = wc->wr_id;<br>
@@ -184,121 +183,142 @@<br>
ipoib_dbg_data(priv, "called:
id %d, op %d, status: %d\n",<br>
wr_id, wc->opcode, wc->status);<br>
<br>
- if (wr_id & IPOIB_OP_RECV) {<br>
- wr_id
&= ~IPOIB_OP_RECV;<br>
-<br>
- if
(wr_id < ipoib_recvq_size) {<br>
-
struct sk_buff *skb = priv->rx_ring[wr_id].skb;<br>
-
dma_addr_t addr
= priv->rx_ring[wr_id].mapping;<br>
-<br>
-
if (unlikely(wc->status != IB_WC_SUCCESS))
{<br>
-
if
(wc->status != IB_WC_WR_FLUSH_ERR)<br>
-
ipoib_warn(priv, "failed recv event
"</font>
<br><font size=2 face="sans-serif">-
"(status=%d, wrid=%d vend_err
%x)\n",<br>
-
wc->status, wr_id, wc->vendor_err);<br>
-
dma_unmap_single(priv->ca->dma_device,
addr,<br>
-
IPOIB_BUF_SIZE, DMA_FROM_DEVICE);<br>
-
dev_kfree_skb_any(skb);<br>
-
priv->rx_ring[wr_id].skb
= NULL;<br>
-
return;<br>
-
}<br>
-<br>
-
/*<br>
-
* If we can't allocate a new RX buffer,
dump<br>
-
* this packet and reuse the old buffer.<br>
-
*/<br>
-
if (unlikely(ipoib_alloc_rx_skb(dev,
wr_id))) {<br>
-
++priv->stats.rx_dropped;<br>
-
goto
repost;<br>
-
}<br>
-<br>
-
ipoib_dbg_data(priv, "received
%d bytes, SLID 0x%04x\n",<br>
-
wc->byte_len, wc->slid);<br>
<br>
+ if (wr_id < ipoib_recvq_size) {<br>
+ struct
sk_buff *skb = priv->rx_ring[wr_id].skb;<br>
+ dma_addr_t
addr = priv->rx_ring[wr_id].mapping;<br>
+<br>
+ if
(unlikely(wc->status != IB_WC_SUCCESS)) {<br>
+
if (wc->status != IB_WC_WR_FLUSH_ERR)<br>
+
ipoib_warn(priv,
"failed recv event "<br>
+
"(status=%d, wrid=%d vend_err
%x)\n",<br>
+
wc->status, wr_id, wc->vendor_err);<br>
dma_unmap_single(priv->ca->dma_device,
addr,<br>
IPOIB_BUF_SIZE, DMA_FROM_DEVICE);</font>
<br><font size=2 face="sans-serif">+
dev_kfree_skb_any(skb);<br>
+
priv->rx_ring[wr_id].skb = NULL;<br>
+
return;<br>
+ }<br>
<br>
-
skb_put(skb, wc->byte_len);<br>
-
skb_pull(skb, IB_GRH_BYTES);<br>
+ /*<br>
+
* If we can't allocate a new RX buffer, dump<br>
+
* this packet and reuse the old buffer.<br>
+
*/<br>
+ if
(unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {<br>
+
++priv->stats.rx_dropped;<br>
+
goto repost;<br>
+ }<br>
<br>
-
if (wc->slid != priv->local_lid
||<br>
-
wc->src_qp != priv->qp->qp_num)
{<br>
-
skb->protocol
= ((struct ipoib_header *) skb->data)->proto;<br>
-
skb->mac.raw
= skb->data;<br>
-
skb_pull(skb,
IPOIB_ENCAP_LEN);<br>
-<br>
-
dev->last_rx
= jiffies;<br>
-
++priv->stats.rx_packets;<br>
-
priv->stats.rx_bytes
+= skb->len;<br>
-<br>
-
skb->dev
= dev;<br>
-
/*
XXX get correct PACKET_ type here */<br>
-
skb->pkt_type
= PACKET_HOST;<br>
-
netif_rx_ni(skb);<br>
-
} else {<br>
-
ipoib_dbg_data(priv,
"dropping loopback packet\n");<br>
-
dev_kfree_skb_any(skb);<br>
-
}<br>
+ ipoib_dbg_data(priv,
"received %d bytes, SLID 0x%04x\n",<br>
+
wc->byte_len,
wc->slid);<br>
<br>
- repost:<br>
-
if (unlikely(ipoib_ib_post_receive(dev,
wr_id)))</font>
<br><font size=2 face="sans-serif">-
ipoib_warn(priv, "ipoib_ib_post_receive
failed "<br>
-
"for buf %d\n", wr_id);<br>
- }
else<br>
-
ipoib_warn(priv, "completion event
with wrid %d\n",<br>
-
wr_id);<br>
+ dma_unmap_single(priv->ca->dma_device,
addr,<br>
+
IPOIB_BUF_SIZE, DMA_FROM_DEVICE);<br>
<br>
- } else {<br>
- struct
ipoib_tx_buf *tx_req;<br>
- unsigned
long flags;<br>
+ skb_put(skb,
wc->byte_len);<br>
+ skb_pull(skb,
IB_GRH_BYTES);<br>
<br>
- if
(wr_id >= ipoib_sendq_size) {<br>
-
ipoib_warn(priv, "completion event
with wrid %d (> %d)\n",<br>
-
wr_id, ipoib_sendq_size);<br>
-
return;<br>
+ if
(wc->slid != priv->local_lid ||<br>
+
wc->src_qp != priv->qp->qp_num) {<br>
+
skb->protocol = ((struct ipoib_header
*) skb->data)->proto;<br>
+
skb->mac.raw = skb->data;<br>
+
skb_pull(skb, IPOIB_ENCAP_LEN);<br>
+<br>
+
dev->last_rx = jiffies;<br>
+
++priv->stats.rx_packets;<br>
+
priv->stats.rx_bytes += skb->len;<br>
+<br>
+
skb->dev = dev;<br>
+
/* XXX get correct PACKET_ type here
*/<br>
+
skb->pkt_type = PACKET_HOST;<br>
+
netif_rx_ni(skb);<br>
+ }
else {<br>
+
ipoib_dbg_data(priv, "dropping
loopback packet\n");</font>
<br><font size=2 face="sans-serif">+
dev_kfree_skb_any(skb);<br>
}<br>
<br>
- ipoib_dbg_data(priv,
"send complete, wrid %d\n", wr_id);<br>
+ repost:<br>
+ if
(unlikely(ipoib_ib_post_receive(dev, wr_id)))<br>
+
ipoib_warn(priv, "ipoib_ib_post_receive
failed "<br>
+
"for buf %d\n", wr_id);<br>
+ } else<br>
+ ipoib_warn(priv,
"completion event with wrid %d\n",<br>
+
wr_id);<br>
+}<br>
+<br>
+static void ipoib_ib_handle_send_wc(struct net_device *dev,<br>
+
struct ib_wc *wc)<br>
+{<br>
+ struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
+ unsigned int wr_id = wc->wr_id;<br>
+ struct ipoib_tx_buf *tx_req;<br>
+ unsigned long flags;<br>
+<br>
+ ipoib_dbg_data(priv, "called:
id %d, op %d, status: %d\n",<br>
+
wr_id, wc->opcode, wc->status);<br>
<br>
- tx_req
= &priv->tx_ring[wr_id];<br>
+ if (wr_id >= ipoib_sendq_size)
{<br>
+ ipoib_warn(priv,
"completion event with wrid %d (> %d)\n",<br>
+
wr_id, ipoib_sendq_size);<br>
+ return;<br>
+ }<br>
<br>
- dma_unmap_single(priv->ca->dma_device,<br>
-
pci_unmap_addr(tx_req, mapping),<br>
-
tx_req->skb->len,<br>
-
DMA_TO_DEVICE);<br>
+ ipoib_dbg_data(priv, "send complete,
wrid %d\n", wr_id);</font>
<br><font size=2 face="sans-serif"> <br>
- ++priv->stats.tx_packets;<br>
- priv->stats.tx_bytes
+= tx_req->skb->len;<br>
+ tx_req = &priv->tx_ring[wr_id];<br>
<br>
- dev_kfree_skb_any(tx_req->skb);<br>
+ dma_unmap_single(priv->ca->dma_device,<br>
+
pci_unmap_addr(tx_req, mapping),<br>
+
tx_req->skb->len,<br>
+
DMA_TO_DEVICE);<br>
<br>
- spin_lock_irqsave(&priv->tx_lock,
flags);<br>
- ++priv->tx_tail;<br>
- if
(netif_queue_stopped(dev) &&<br>
-
priv->tx_head - priv->tx_tail <= ipoib_sendq_size
>> 1)<br>
-
netif_wake_queue(dev);<br>
- spin_unlock_irqrestore(&priv->tx_lock,
flags);<br>
+ ++priv->stats.tx_packets;<br>
+ priv->stats.tx_bytes += tx_req->skb->len;<br>
<br>
- if
(wc->status != IB_WC_SUCCESS &&<br>
-
wc->status != IB_WC_WR_FLUSH_ERR)<br>
-
ipoib_warn(priv, "failed send event
"<br>
-
"(status=%d, wrid=%d vend_err %x)\n",<br>
-
wc->status, wr_id, wc->vendor_err);<br>
- }<br>
+ dev_kfree_skb_any(tx_req->skb);<br>
+<br>
+ spin_lock_irqsave(&priv->tx_lock,
flags);<br>
+ ++priv->tx_tail;<br>
+ if (netif_queue_stopped(dev) &&<br>
+ priv->tx_head - priv->tx_tail
<= ipoib_sendq_size >> 1)<br>
+ netif_wake_queue(dev);<br>
+ spin_unlock_irqrestore(&priv->tx_lock,
flags);</font>
<br><font size=2 face="sans-serif">+<br>
+ if (wc->status != IB_WC_SUCCESS
&&<br>
+ wc->status != IB_WC_WR_FLUSH_ERR)<br>
+ ipoib_warn(priv,
"failed send event "<br>
+
"(status=%d, wrid=%d vend_err
%x)\n",<br>
+
wc->status, wr_id, wc->vendor_err);<br>
+}<br>
+<br>
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)<br>
+{<br>
+ struct net_device *dev = (struct net_device
*) dev_ptr;<br>
+ struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
+ int n, i;<br>
+<br>
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);<br>
+ udelay(ipoib_send_poll_interval);<br>
+ do {<br>
+ n
= ib_poll_cq(cq, IPOIB_NUM_SEND_WC, priv->send_ibwc);<br>
+ for
(i = 0; i < n; ++i)<br>
+
ipoib_ib_handle_send_wc(dev, priv->send_ibwc
+ i);<br>
+ } while (n == IPOIB_NUM_SEND_WC);<br>
}<br>
<br>
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)<br>
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr)<br>
{<br>
struct net_device *dev = (struct net_device
*) dev_ptr;<br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
int n, i;<br>
<br>
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);<br>
+ udelay(ipoib_recv_poll_interval);<br>
do {<br>
- n
= ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);</font>
<br><font size=2 face="sans-serif">+
n = ib_poll_cq(cq, IPOIB_NUM_RECV_WC,
priv->recv_ibwc);<br>
for
(i = 0; i < n; ++i)<br>
-
ipoib_ib_handle_wc(dev, priv->ibwc
+ i);<br>
- } while (n == IPOIB_NUM_WC);<br>
+
ipoib_ib_handle_recv_wc(dev, priv->recv_ibwc
+ i);<br>
+ } while (n == IPOIB_NUM_RECV_WC);<br>
}<br>
<br>
static inline int post_send(struct ipoib_dev_priv *priv,<br>
diff -urN infiniband/ulp/ipoib/ipoib_main.c infiniband-cq/ulp/ipoib/ipoib_main.c<br>
--- infiniband/ulp/ipoib/ipoib_main.c 2006-04-12
16:43:38.000000000 -0700<br>
+++ infiniband-cq/ulp/ipoib/ipoib_main.c 2006-04-19
08:44:27.192054672 -0700<br>
@@ -56,12 +56,17 @@<br>
<br>
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;<br>
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;<br>
+int ipoib_send_poll_interval __read_mostly = 0;<br>
+int ipoib_recv_poll_interval __read_mostly = 0;<br>
<br>
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);<br>
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send
queue");<br>
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);<br>
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive
queue");</font>
<br><font size=2 face="sans-serif"> <br>
+module_param_named(send_poll_interval, ipoib_send_poll_interval, int,
0444);<br>
+module_param_named(recv_poll_interval, ipoib_recv_poll_interval, int,
0444);<br>
+<br>
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG<br>
int ipoib_debug_level;<br>
<br>
@@ -895,7 +900,7 @@<br>
<br>
kfree(priv->rx_ring);<br>
kfree(priv->tx_ring);<br>
-<br>
+ <br>
priv->rx_ring = NULL;<br>
priv->tx_ring = NULL;<br>
}<br>
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c infiniband-cq/ulp/ipoib/ipoib_verbs.c<br>
--- infiniband/ulp/ipoib/ipoib_verbs.c 2006-04-05
17:43:18.000000000 -0700<br>
+++ infiniband-cq/ulp/ipoib/ipoib_verbs.c 2006-04-12
19:14:41.000000000 -0700<br>
@@ -174,24 +174,35 @@<br>
return
-ENODEV;<br>
}<br>
<br>
- priv->cq = ib_create_cq(priv->ca,
ipoib_ib_completion, NULL, dev,<br>
-
ipoib_sendq_size
+ ipoib_recvq_size + 1);<br>
- if (IS_ERR(priv->cq)) {<br>
- printk(KERN_WARNING
"%s: failed to create CQ\n", ca->name);<br>
+ priv->send_cq = ib_create_cq(priv->ca,
ipoib_ib_send_completion, NULL, dev,<br>
+
ipoib_sendq_size
+ 1);<br>
+ if (IS_ERR(priv->send_cq)) {<br>
+ printk(KERN_WARNING
"%s: failed to create send CQ\n", ca->name);</font>
<br><font size=2 face="sans-serif">
goto out_free_pd;<br>
}<br>
<br>
- if (ib_req_notify_cq(priv->cq,
IB_CQ_NEXT_COMP))<br>
- goto
out_free_cq;<br>
+ if (ib_req_notify_cq(priv->send_cq,
IB_CQ_NEXT_COMP))<br>
+ goto
out_free_send_cq;<br>
+<br>
+<br>
+ priv->recv_cq = ib_create_cq(priv->ca,
ipoib_ib_recv_completion, NULL, dev,<br>
+
ipoib_recvq_size
+ 1);<br>
+ if (IS_ERR(priv->recv_cq)) {<br>
+ printk(KERN_WARNING
"%s: failed to create recv CQ\n", ca->name);<br>
+ goto
out_free_send_cq;<br>
+ }<br>
+<br>
+ if (ib_req_notify_cq(priv->recv_cq,
IB_CQ_NEXT_COMP))<br>
+ goto
out_free_recv_cq;<br>
<br>
priv->mr = ib_get_dma_mr(priv->pd,
IB_ACCESS_LOCAL_WRITE);<br>
if (IS_ERR(priv->mr)) {<br>
printk(KERN_WARNING
"%s: ib_get_dma_mr failed\n", ca->name);<br>
- goto
out_free_cq;<br>
+ goto
out_free_recv_cq;<br>
}<br>
<br>
- init_attr.send_cq = priv->cq;<br>
- init_attr.recv_cq = priv->cq,<br>
+ init_attr.send_cq = priv->send_cq;<br>
+ init_attr.recv_cq = priv->recv_cq,<br>
<br>
priv->qp = ib_create_qp(priv->pd,
&init_attr);<br>
if (IS_ERR(priv->qp)) {<br>
@@ -215,8 +226,11 @@<br>
out_free_mr:<br>
ib_dereg_mr(priv->mr);<br>
<br>
-out_free_cq:<br>
- ib_destroy_cq(priv->cq);</font>
<br><font size=2 face="sans-serif">+out_free_recv_cq:<br>
+ ib_destroy_cq(priv->recv_cq);<br>
+<br>
+out_free_send_cq:<br>
+ ib_destroy_cq(priv->send_cq);<br>
<br>
out_free_pd:<br>
ib_dealloc_pd(priv->pd);<br>
@@ -238,7 +252,10 @@<br>
if (ib_dereg_mr(priv->mr))<br>
ipoib_warn(priv,
"ib_dereg_mr failed\n");<br>
<br>
- if (ib_destroy_cq(priv->cq))<br>
+ if (ib_destroy_cq(priv->send_cq))<br>
+ ipoib_warn(priv,
"ib_cq_destroy failed\n");<br>
+<br>
+ if (ib_destroy_cq(priv->recv_cq))<br>
ipoib_warn(priv,
"ib_cq_destroy failed\n");<br>
<br>
if (ib_dealloc_pd(priv->pd))</font>
<br><font size=2 face="sans-serif"><br>
</font>
<br>
<br>
<br>
<br><font size=2 face="sans-serif">Thanks<br>
Shirley Ma<br>
IBM Linux Technology Center<br>
15300 SW Koll Parkway<br>
Beaverton, OR 97006-6063<br>
Phone(Fax): (503) 578-7638<br>
<br>
</font></div>