[ofa-general] [PATCH v9] IB/mlx4: shrinking WQE
Jack Morgenstein
jackm at dev.mellanox.co.il
Tue Dec 4 00:32:38 PST 2007
IB/mlx4: shrinking WQE
ConnectX supports shrinking wqe, such that a single WR can include
multiple units of wqe_shift. This way, WRs can differ in size, and
do not have to be a power of 2 in size, saving memory and speeding up
send WR posting. Unfortunately, if we do this wqe_index field in CQE
can't be used to look up the WR ID anymore, so do this only if
selective signalling is off.
Further, on 32-bit platforms, we can't use vmap to make
the QP buffer virtually contigious. Thus we have to use
constant-sized WRs to make sure a WR is always fully within
a single page-sized chunk.
Finally, we use WR with NOP opcode to avoid wrap-around
in the middle of WR. We set NoErrorCompletion bit to avoid getting
completions with error for NOP WRs. Since NEC is only supported
starting with firmware 2.2.232, we use constant-sized WRs
for older firmware. And, since MLX QPs only support SEND, we use
constant-sized WRs in this case.
When stamping during NOP posting, do stamping following setting of
the NOP wqe valid bit.
Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>
---
changes since v8:
fixed bug in __mlx4_ib_modify_qp: did not reset qp->sq_next_wqe
in the TO_RESET transition.
Found by Mellanox regression testing.
Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c 2007-10-29 11:09:24.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/cq.c 2007-12-04 09:29:39.391728000 +0200
@@ -331,6 +331,12 @@ static int mlx4_ib_poll_one(struct mlx4_
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR;
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+ is_send)) {
+ printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+ return -EINVAL;
+ }
+
if (!*cur_qp ||
(be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
/*
@@ -353,8 +359,10 @@ static int mlx4_ib_poll_one(struct mlx4_
if (is_send) {
wq = &(*cur_qp)->sq;
- wqe_ctr = be16_to_cpu(cqe->wqe_index);
- wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ if (!(*cur_qp)->sq_signal_bits) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ }
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
} else if ((*cur_qp)->ibqp.srq) {
Index: infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/mlx4_ib.h 2007-10-24 18:19:56.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h 2007-12-04 09:29:39.394732000 +0200
@@ -120,6 +120,8 @@ struct mlx4_ib_qp {
u32 doorbell_qpn;
__be32 sq_signal_bits;
+ unsigned sq_next_wqe;
+ int sq_max_wqes_per_wr;
int sq_spare_wqes;
struct mlx4_ib_wq sq;
Index: infiniband/drivers/infiniband/hw/mlx4/qp.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/qp.c 2007-11-01 09:17:50.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/qp.c 2007-12-04 09:29:39.404731000 +0200
@@ -30,6 +30,7 @@
* SOFTWARE.
*/
+#include <linux/log2.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
@@ -96,7 +97,7 @@ static int is_qp0(struct mlx4_ib_dev *de
static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
{
- if (qp->buf.nbufs == 1)
+ if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
return qp->buf.u.direct.buf + offset;
else
return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
@@ -115,16 +116,88 @@ static void *get_send_wqe(struct mlx4_ib
/*
* Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
- * the very first chunk of the WQE.
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When max WR is than or equal to the WQE size,
+ * as an optimization, we can stamp WQE with 0xffffffff,
+ * and skip the very first chunk of the WQE.
*/
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
{
- u32 *wqe = get_send_wqe(qp, n);
+ u32 *wqe;
int i;
+ int s;
+ int ind;
+ void *buf;
+ __be32 stamp;
+
+ s = roundup(size, 1 << qp->sq.wqe_shift);
+ if (qp->sq_max_wqes_per_wr > 1) {
+ for (i = 0; i < s; i += 64) {
+ ind = (i >> qp->sq.wqe_shift) + n;
+ stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+ cpu_to_be32(0xffffffff);
+ buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+ *wqe = stamp;
+ }
+ } else {
+ buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ for (i = 64; i < s; i += 64) {
+ wqe = buf + i;
+ *wqe = 0xffffffff;
+ }
+ }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_inline_seg *inl;
+ void *wqe;
+ int s;
+
+ ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+ if (qp->ibqp.qp_type == IB_QPT_UD) {
+ struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+ struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+ memset(dgram, 0, sizeof *dgram);
+ av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+ s += sizeof(struct mlx4_wqe_datagram_seg);
+ }
+
+ /* Pad the remainder of the WQE with an inline data segment. */
+ if (size > s) {
+ inl = wqe + s;
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ }
+ ctrl->srcrb_flags = 0;
+ ctrl->fence_size = size / 16;
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
- for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
- wqe[i] = 0xffffffff;
+ ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+ unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+ if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+ post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+ ind += s;
+ }
+ return ind;
}
static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -241,6 +314,8 @@ static int set_rq_size(struct mlx4_ib_de
static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
enum ib_qp_type type, struct mlx4_ib_qp *qp)
{
+ int s;
+
/* Sanity check SQ size before proceeding */
if (cap->max_send_wr > dev->dev->caps.max_wqes ||
cap->max_send_sge > dev->dev->caps.max_sq_sg ||
@@ -256,20 +331,69 @@ static int set_kernel_sq_size(struct mlx
cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
return -EINVAL;
- qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
- sizeof (struct mlx4_wqe_data_seg),
- cap->max_inline_data +
- sizeof (struct mlx4_wqe_inline_seg)) +
- send_wqe_overhead(type)));
- qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
- sizeof (struct mlx4_wqe_data_seg);
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type);
/*
- * We need to leave 2 KB + 1 WQE of headroom in the SQ to
- * allow HW to prefetch.
+ * Hermon supports shrinking wqe, such that a single WR can include
+ * multiple units of wqe_shift. This way, WRs can differ in size, and
+ * do not have to be a power of 2 in size, saving memory and speeding up
+ * send WR posting. Unfortunately, if we do this wqe_index field in CQE
+ * can't be used to look up the WR ID anymore, so do this only if
+ * selective signalling is off.
+ *
+ * Further, on 32-bit platforms, we can't use vmap to make
+ * the QP buffer virtually contigious. Thus we have to use
+ * constant-sized WRs to make sure a WR is always fully within
+ * a single page-sized chunk.
+ *
+ * Finally, we use NOP opcode to avoid wrap-around in the middle of WR.
+ * We set NEC bit to avoid getting completions with error for NOP WRs.
+ * Since NEC is only supported starting with firmware 2.2.232,
+ * we use constant-sized WRs for older firmware.
+ *
+ * And, since MLX QPs only support SEND, we use constant-sized WRs in this
+ * case.
+ *
+ * We look for the smallest value of wqe_shift such that the resulting
+ * number of wqes does not exceed device capabilities.
+ *
+ * We set WQE size to at least 64 bytes, this way stamping invalidates each WQE.
*/
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
- qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+ qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+ type != IB_QPT_SMI && type != IB_QPT_GSI)
+ qp->sq.wqe_shift = ilog2(64);
+ else
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+ for (;;) {
+ if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+ qp->sq_max_wqes_per_wr +
+ qp->sq_spare_wqes);
+
+ if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+ break;
+
+ if (qp->sq_max_wqes_per_wr <= 1)
+ return -EINVAL;
+
+ ++qp->sq.wqe_shift;
+ }
+
+ qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+ send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -281,7 +405,8 @@ static int set_kernel_sq_size(struct mlx
qp->sq.offset = 0;
}
- cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_wr = qp->sq.max_post =
+ (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
cap->max_send_sge = qp->sq.max_gs;
/* We don't support inline sends for kernel QPs (yet) */
cap->max_inline_data = 0;
@@ -327,6 +452,12 @@ static int create_qp_common(struct mlx4_
qp->rq.tail = 0;
qp->sq.head = 0;
qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
+
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+ else
+ qp->sq_signal_bits = 0;
err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
if (err)
@@ -417,11 +548,6 @@ static int create_qp_common(struct mlx4_
*/
qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
- if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
- qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
-
qp->mqp.event = mlx4_ib_qp_event;
return 0;
@@ -916,7 +1042,7 @@ static int __mlx4_ib_modify_qp(struct ib
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);
- stamp_send_wqe(qp, i);
+ stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
}
}
@@ -969,6 +1095,7 @@ static int __mlx4_ib_modify_qp(struct ib
qp->rq.tail = 0;
qp->sq.head = 0;
qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
if (!ibqp->srq)
*qp->db.db = 0;
}
@@ -1278,13 +1405,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
unsigned long flags;
int nreq;
int err = 0;
- int ind;
- int size;
+ unsigned ind;
+ int uninitialized_var(stamp);
+ int uninitialized_var(size);
int i;
spin_lock_irqsave(&qp->sq.lock, flags);
- ind = qp->sq.head;
+ ind = qp->sq_next_wqe;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -1300,7 +1428,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
}
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
- qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
ctrl->srcrb_flags =
(wr->send_flags & IB_SEND_SIGNALED ?
@@ -1413,16 +1541,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+ stamp = ind + qp->sq_spare_wqes;
+ ind += DIV_ROUND_UP(size * 16, 1 << qp->sq.wqe_shift);
+
/*
* We can improve latency by not stamping the last
* send queue WQE until after ringing the doorbell, so
* only stamp here if there are still more WQEs to post.
+ *
+ * Same optimization applies to padding with NOP wqe
+ * in case of WQE shrinking (used to prevent wrap-around
+ * in the middle of WR).
*/
- if (wr->next)
- stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
- (qp->sq.wqe_cnt - 1));
+ if (wr->next) {
+ stamp_send_wqe(qp, stamp, size * 16);
+ ind = pad_wraparound(qp, ind);
+ }
- ++ind;
}
out:
@@ -1444,8 +1579,10 @@ out:
*/
mmiowb();
- stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
- (qp->sq.wqe_cnt - 1));
+ stamp_send_wqe(qp, stamp, size * 16);
+
+ ind = pad_wraparound(qp, ind);
+ qp->sq_next_wqe = ind;
}
spin_unlock_irqrestore(&qp->sq.lock, flags);
Index: infiniband/drivers/net/mlx4/alloc.c
===================================================================
--- infiniband.orig/drivers/net/mlx4/alloc.c 2007-11-08 10:01:19.000000000 +0200
+++ infiniband/drivers/net/mlx4/alloc.c 2007-12-04 09:29:39.410730000 +0200
@@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev,
memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
}
+
+ if (BITS_PER_LONG == 64) {
+ struct page **pages;
+ pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+ if (!pages)
+ goto err_free;
+ for (i = 0; i < buf->nbufs; ++i)
+ pages[i] = virt_to_page(buf->u.page_list[i].buf);
+ buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+ if (!buf->u.direct.buf)
+ goto err_free;
+ }
}
return 0;
@@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev,
dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
buf->u.direct.map);
else {
+ if (BITS_PER_LONG == 64)
+ vunmap(buf->u.direct.buf);
+
for (i = 0; i < buf->nbufs; ++i)
if (buf->u.page_list[i].buf)
dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
Index: infiniband/include/linux/mlx4/device.h
===================================================================
--- infiniband.orig/include/linux/mlx4/device.h 2007-10-16 15:50:08.000000000 +0200
+++ infiniband/include/linux/mlx4/device.h 2007-12-04 09:29:39.417729000 +0200
@@ -133,6 +133,11 @@ enum {
MLX4_STAT_RATE_OFFSET = 5
};
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+ return (major << 32) | (minor << 16) | subminor;
+}
+
struct mlx4_caps {
u64 fw_ver;
int num_ports;
@@ -189,7 +194,7 @@ struct mlx4_buf_list {
};
struct mlx4_buf {
- union {
+ struct {
struct mlx4_buf_list direct;
struct mlx4_buf_list *page_list;
} u;
Index: infiniband/include/linux/mlx4/qp.h
===================================================================
--- infiniband.orig/include/linux/mlx4/qp.h 2007-10-16 15:50:08.000000000 +0200
+++ infiniband/include/linux/mlx4/qp.h 2007-12-04 09:29:39.420729000 +0200
@@ -154,7 +154,11 @@ struct mlx4_qp_context {
u32 reserved5[10];
};
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
enum {
+ MLX4_WQE_CTRL_NEC = 1 << 29,
MLX4_WQE_CTRL_FENCE = 1 << 6,
MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
MLX4_WQE_CTRL_SOLICITED = 1 << 1,
More information about the general
mailing list