[openib-general] [PATCH] IPoIB splitting CQ, increase both send/recv poll NUM_WC & interval
Shirley Ma
xma at us.ibm.com
Wed Apr 19 08:18:30 PDT 2006
Roland,
Here is the patch. This patch includes:
1. separate the single CQ into a send CQ and a recv CQ
2. increase both send/recv poll NUM_WC from 4 to 32
3. add ____cacheline_aligned_in_smp to tx_ring, rx_ring, send_ibwc and recv_ibwc
4. add a tunable poll interval (in microseconds, passed to udelay()) for both send and recv.
Example command line:
modprobe ib_ipoib send_poll_interval=80 recv_poll_interval=10
The patch is also attached as a file for you to download. Let me know if there are any problems.
Signed-off-by: Shirley Ma <xma at us.ibm.com>
diff -urN infiniband/ulp/ipoib/ipoib.h infiniband-cq/ulp/ipoib/ipoib.h
--- infiniband/ulp/ipoib/ipoib.h 2006-04-05 17:43:18.000000000
-0700
+++ infiniband-cq/ulp/ipoib/ipoib.h 2006-04-19 08:40:42.030284464
-0700
@@ -71,7 +71,8 @@
IPOIB_MAX_QUEUE_SIZE = 8192,
IPOIB_MIN_QUEUE_SIZE = 2,
- IPOIB_NUM_WC = 4,
+ IPOIB_NUM_SEND_WC = 32,
+ IPOIB_NUM_RECV_WC = 32,
IPOIB_MAX_PATH_REC_QUEUE = 3,
IPOIB_MAX_MCAST_QUEUE = 3,
@@ -151,7 +152,8 @@
u16 pkey;
struct ib_pd *pd;
struct ib_mr *mr;
- struct ib_cq *cq;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
struct ib_qp *qp;
u32 qkey;
@@ -162,16 +164,17 @@
unsigned int admin_mtu;
unsigned int mcast_mtu;
- struct ipoib_rx_buf *rx_ring;
+ struct ipoib_rx_buf *rx_ring ____cacheline_aligned_in_smp;
spinlock_t tx_lock;
- struct ipoib_tx_buf *tx_ring;
+ struct ipoib_tx_buf *tx_ring ____cacheline_aligned_in_smp;
unsigned tx_head;
unsigned tx_tail;
struct ib_sge tx_sge;
struct ib_send_wr tx_wr;
- struct ib_wc ibwc[IPOIB_NUM_WC];
+ struct ib_wc send_ibwc[IPOIB_NUM_SEND_WC]
____cacheline_aligned_in_smp;
+ struct ib_wc recv_ibwc[IPOIB_NUM_RECV_WC]
____cacheline_aligned_in_smp;
struct list_head dead_ahs;
@@ -243,9 +246,13 @@
extern struct workqueue_struct *ipoib_workqueue;
+extern int ipoib_send_poll_interval;
+extern int ipoib_recv_poll_interval;
+
/* functions */
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr);
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
struct ib_pd *pd, struct ib_ah_attr
*attr);
diff -urN infiniband/ulp/ipoib/ipoib_ib.c
infiniband-cq/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c 2006-04-05 17:43:18.000000000
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_ib.c 2006-04-19 08:56:40.395590792
-0700
@@ -50,7 +50,6 @@
"Enable data path debug tracing if > 0");
#endif
-#define IPOIB_OP_RECV (1ul << 31)
static DEFINE_MUTEX(pkey_mutex);
@@ -108,7 +107,7 @@
list.lkey = priv->mr->lkey;
param.next = NULL;
- param.wr_id = id | IPOIB_OP_RECV;
+ param.wr_id = id;
param.sg_list = &list;
param.num_sge = 1;
@@ -175,8 +174,8 @@
return 0;
}
-static void ipoib_ib_handle_wc(struct net_device *dev,
- struct ib_wc *wc)
+static void ipoib_ib_handle_recv_wc(struct net_device *dev,
+ struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
unsigned int wr_id = wc->wr_id;
@@ -184,121 +183,142 @@
ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
wr_id, wc->opcode, wc->status);
- if (wr_id & IPOIB_OP_RECV) {
- wr_id &= ~IPOIB_OP_RECV;
-
- if (wr_id < ipoib_recvq_size) {
- struct sk_buff *skb = priv->rx_ring[wr_id].skb;
- dma_addr_t addr =
priv->rx_ring[wr_id].mapping;
-
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
- if (wc->status != IB_WC_WR_FLUSH_ERR)
- ipoib_warn(priv, "failed recv
event "
- "(status=%d, wrid=%d
vend_err %x)\n",
- wc->status, wr_id,
wc->vendor_err);
- dma_unmap_single(priv->ca->dma_device,
addr,
- IPOIB_BUF_SIZE,
DMA_FROM_DEVICE);
- dev_kfree_skb_any(skb);
- priv->rx_ring[wr_id].skb = NULL;
- return;
- }
-
- /*
- * If we can't allocate a new RX buffer, dump
- * this packet and reuse the old buffer.
- */
- if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
- ++priv->stats.rx_dropped;
- goto repost;
- }
-
- ipoib_dbg_data(priv, "received %d bytes, SLID
0x%04x\n",
- wc->byte_len, wc->slid);
+ if (wr_id < ipoib_recvq_size) {
+ struct sk_buff *skb = priv->rx_ring[wr_id].skb;
+ dma_addr_t addr = priv->rx_ring[wr_id].mapping;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed recv event "
+ "(status=%d, wrid=%d vend_err
%x)\n",
+ wc->status, wr_id,
wc->vendor_err);
dma_unmap_single(priv->ca->dma_device, addr,
IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ dev_kfree_skb_any(skb);
+ priv->rx_ring[wr_id].skb = NULL;
+ return;
+ }
- skb_put(skb, wc->byte_len);
- skb_pull(skb, IB_GRH_BYTES);
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+ ++priv->stats.rx_dropped;
+ goto repost;
+ }
- if (wc->slid != priv->local_lid ||
- wc->src_qp != priv->qp->qp_num) {
- skb->protocol = ((struct ipoib_header *)
skb->data)->proto;
- skb->mac.raw = skb->data;
- skb_pull(skb, IPOIB_ENCAP_LEN);
-
- dev->last_rx = jiffies;
- ++priv->stats.rx_packets;
- priv->stats.rx_bytes += skb->len;
-
- skb->dev = dev;
- /* XXX get correct PACKET_ type here */
- skb->pkt_type = PACKET_HOST;
- netif_rx_ni(skb);
- } else {
- ipoib_dbg_data(priv, "dropping loopback
packet\n");
- dev_kfree_skb_any(skb);
- }
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
- repost:
- if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
- ipoib_warn(priv, "ipoib_ib_post_receive
failed "
- "for buf %d\n", wr_id);
- } else
- ipoib_warn(priv, "completion event with wrid
%d\n",
- wr_id);
+ dma_unmap_single(priv->ca->dma_device, addr,
+ IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
- } else {
- struct ipoib_tx_buf *tx_req;
- unsigned long flags;
+ skb_put(skb, wc->byte_len);
+ skb_pull(skb, IB_GRH_BYTES);
- if (wr_id >= ipoib_sendq_size) {
- ipoib_warn(priv, "completion event with wrid %d (>
%d)\n",
- wr_id, ipoib_sendq_size);
- return;
+ if (wc->slid != priv->local_lid ||
+ wc->src_qp != priv->qp->qp_num) {
+ skb->protocol = ((struct ipoib_header *)
skb->data)->proto;
+ skb->mac.raw = skb->data;
+ skb_pull(skb, IPOIB_ENCAP_LEN);
+
+ dev->last_rx = jiffies;
+ ++priv->stats.rx_packets;
+ priv->stats.rx_bytes += skb->len;
+
+ skb->dev = dev;
+ /* XXX get correct PACKET_ type here */
+ skb->pkt_type = PACKET_HOST;
+ netif_rx_ni(skb);
+ } else {
+ ipoib_dbg_data(priv, "dropping loopback
packet\n");
+ dev_kfree_skb_any(skb);
}
- ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+ repost:
+ if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+ ipoib_warn(priv, "ipoib_ib_post_receive failed "
+ "for buf %d\n", wr_id);
+ } else
+ ipoib_warn(priv, "completion event with wrid %d\n",
+ wr_id);
+}
+
+static void ipoib_ib_handle_send_wc(struct net_device *dev,
+ struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id;
+ struct ipoib_tx_buf *tx_req;
+ unsigned long flags;
+
+ ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+ wr_id, wc->opcode, wc->status);
- tx_req = &priv->tx_ring[wr_id];
+ if (wr_id >= ipoib_sendq_size) {
+ ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
- dma_unmap_single(priv->ca->dma_device,
- pci_unmap_addr(tx_req, mapping),
- tx_req->skb->len,
- DMA_TO_DEVICE);
+ ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
- ++priv->stats.tx_packets;
- priv->stats.tx_bytes += tx_req->skb->len;
+ tx_req = &priv->tx_ring[wr_id];
- dev_kfree_skb_any(tx_req->skb);
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(tx_req, mapping),
+ tx_req->skb->len,
+ DMA_TO_DEVICE);
- spin_lock_irqsave(&priv->tx_lock, flags);
- ++priv->tx_tail;
- if (netif_queue_stopped(dev) &&
- priv->tx_head - priv->tx_tail <= ipoib_sendq_size >>
1)
- netif_wake_queue(dev);
- spin_unlock_irqrestore(&priv->tx_lock, flags);
+ ++priv->stats.tx_packets;
+ priv->stats.tx_bytes += tx_req->skb->len;
- if (wc->status != IB_WC_SUCCESS &&
- wc->status != IB_WC_WR_FLUSH_ERR)
- ipoib_warn(priv, "failed send event "
- "(status=%d, wrid=%d vend_err %x)\n",
- wc->status, wr_id, wc->vendor_err);
- }
+ dev_kfree_skb_any(tx_req->skb);
+
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ ++priv->tx_tail;
+ if (netif_queue_stopped(dev) &&
+ priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+ netif_wake_queue(dev);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+}
+
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)
+{
+ struct net_device *dev = (struct net_device *) dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int n, i;
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ udelay(ipoib_send_poll_interval);
+ do {
+ n = ib_poll_cq(cq, IPOIB_NUM_SEND_WC, priv->send_ibwc);
+ for (i = 0; i < n; ++i)
+ ipoib_ib_handle_send_wc(dev, priv->send_ibwc + i);
+ } while (n == IPOIB_NUM_SEND_WC);
}
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr)
{
struct net_device *dev = (struct net_device *) dev_ptr;
struct ipoib_dev_priv *priv = netdev_priv(dev);
int n, i;
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ udelay(ipoib_recv_poll_interval);
do {
- n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+ n = ib_poll_cq(cq, IPOIB_NUM_RECV_WC, priv->recv_ibwc);
for (i = 0; i < n; ++i)
- ipoib_ib_handle_wc(dev, priv->ibwc + i);
- } while (n == IPOIB_NUM_WC);
+ ipoib_ib_handle_recv_wc(dev, priv->recv_ibwc + i);
+ } while (n == IPOIB_NUM_RECV_WC);
}
static inline int post_send(struct ipoib_dev_priv *priv,
diff -urN infiniband/ulp/ipoib/ipoib_main.c
infiniband-cq/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c 2006-04-12 16:43:38.000000000
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_main.c 2006-04-19
08:44:27.192054672 -0700
@@ -56,12 +56,17 @@
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+int ipoib_send_poll_interval __read_mostly = 0;
+int ipoib_recv_poll_interval __read_mostly = 0;
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive
queue");
+module_param_named(send_poll_interval, ipoib_send_poll_interval, int,
0444);
+module_param_named(recv_poll_interval, ipoib_recv_poll_interval, int,
0444);
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
@@ -895,7 +900,7 @@
kfree(priv->rx_ring);
kfree(priv->tx_ring);
-
+
priv->rx_ring = NULL;
priv->tx_ring = NULL;
}
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c
infiniband-cq/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c 2006-04-05 17:43:18.000000000
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_verbs.c 2006-04-12
19:14:41.000000000 -0700
@@ -174,24 +174,35 @@
return -ENODEV;
}
- priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
- ipoib_sendq_size + ipoib_recvq_size + 1);
- if (IS_ERR(priv->cq)) {
- printk(KERN_WARNING "%s: failed to create CQ\n",
ca->name);
+ priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_send_completion,
NULL, dev,
+ ipoib_sendq_size + 1);
+ if (IS_ERR(priv->send_cq)) {
+ printk(KERN_WARNING "%s: failed to create send CQ\n",
ca->name);
goto out_free_pd;
}
- if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
- goto out_free_cq;
+ if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+ goto out_free_send_cq;
+
+
+ priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_recv_completion,
NULL, dev,
+ ipoib_recvq_size + 1);
+ if (IS_ERR(priv->recv_cq)) {
+ printk(KERN_WARNING "%s: failed to create recv CQ\n",
ca->name);
+ goto out_free_send_cq;
+ }
+
+ if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+ goto out_free_recv_cq;
priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(priv->mr)) {
printk(KERN_WARNING "%s: ib_get_dma_mr failed\n",
ca->name);
- goto out_free_cq;
+ goto out_free_recv_cq;
}
- init_attr.send_cq = priv->cq;
- init_attr.recv_cq = priv->cq,
+ init_attr.send_cq = priv->send_cq;
+ init_attr.recv_cq = priv->recv_cq,
priv->qp = ib_create_qp(priv->pd, &init_attr);
if (IS_ERR(priv->qp)) {
@@ -215,8 +226,11 @@
out_free_mr:
ib_dereg_mr(priv->mr);
-out_free_cq:
- ib_destroy_cq(priv->cq);
+out_free_recv_cq:
+ ib_destroy_cq(priv->recv_cq);
+
+out_free_send_cq:
+ ib_destroy_cq(priv->send_cq);
out_free_pd:
ib_dealloc_pd(priv->pd);
@@ -238,7 +252,10 @@
if (ib_dereg_mr(priv->mr))
ipoib_warn(priv, "ib_dereg_mr failed\n");
- if (ib_destroy_cq(priv->cq))
+ if (ib_destroy_cq(priv->send_cq))
+ ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+ if (ib_destroy_cq(priv->recv_cq))
ipoib_warn(priv, "ib_cq_destroy failed\n");
if (ib_dealloc_pd(priv->pd))
Thanks
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone(Fax): (503) 578-7638
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060419/a82ac19a/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cq.tune.patch
Type: application/octet-stream
Size: 13025 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060419/a82ac19a/attachment.obj>
More information about the general
mailing list