[openib-general] [PATCH 11/13] [RFC] ipath verbs, part 2
Roland Dreier
rolandd at cisco.com
Fri Dec 16 15:48:55 PST 2005
Second half of ipath verbs
---
drivers/infiniband/hw/ipath/ipath_verbs.c | 2931 +++++++++++++++++++++++++++++
1 files changed, 2931 insertions(+), 0 deletions(-)
3f617d81354835f183e089849cca09e295b2df0a
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 808326e..25d738d 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -3242,3 +3242,2934 @@ static int get_rwqe(struct ipath_qp *qp,
spin_unlock(&rq->lock);
return 1;
}
+
+/*
+ * This is called from ipath_qp_rcv() to process an incomming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ unsigned long flags;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ struct ib_reth *reth;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the
+ * core driver sets the eager header buffer
+ * size to 56 bytes so the last 4 bytes of
+ * the BTH header (PSN) is in the data buffer.
+ */
+ psn = be32_to_cpu(((u32 *) data)[0]);
+ data += sizeof(u32);
+ }
+ /*
+ * The opcode is in the low byte when its in network order
+ * (top byte when in host order).
+ */
+ opcode = *(u8 *) (&ohdr->bth[0]);
+
+ wc.imm_data = 0;
+ wc.wc_flags = 0;
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+
+ /* Compare the PSN verses the expected PSN. */
+ if (unlikely(cmp24(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+ inv:
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ switch (opcode) {
+ case IB_OPCODE_UC_SEND_FIRST:
+ case IB_OPCODE_UC_SEND_ONLY:
+ case IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE:
+ goto send_first;
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_UC_RDMA_WRITE_ONLY:
+ case IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+ goto rdma_first;
+
+ default:
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case IB_OPCODE_UC_SEND_FIRST:
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ if (opcode == IB_OPCODE_UC_SEND_MIDDLE ||
+ opcode == IB_OPCODE_UC_SEND_LAST ||
+ opcode == IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE)
+ break;
+ goto inv;
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ if (opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE ||
+ opcode == IB_OPCODE_UC_RDMA_WRITE_LAST ||
+ opcode == IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE)
+ break;
+ goto inv;
+
+ default:
+ if (opcode == IB_OPCODE_UC_SEND_FIRST ||
+ opcode == IB_OPCODE_UC_SEND_ONLY ||
+ opcode == IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE ||
+ opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ opcode == IB_OPCODE_UC_RDMA_WRITE_ONLY ||
+ opcode == IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE)
+ break;
+ goto inv;
+ }
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case IB_OPCODE_UC_SEND_FIRST:
+ case IB_OPCODE_UC_SEND_ONLY:
+ case IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE:
+ send_first:
+ if (qp->r_reuse_sge) {
+ qp->r_reuse_sge = 0;
+ qp->r_sge = qp->s_rdma_sge;
+ } else if (!get_rwqe(qp, 0)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Save the WQE so we can reuse it in case of an error. */
+ qp->s_rdma_sge = qp->r_sge;
+ qp->r_rcv_len = 0;
+ if (opcode == IB_OPCODE_UC_SEND_ONLY)
+ goto send_last;
+ else if (opcode == IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE)
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ qp->r_reuse_sge = 1;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ qp->r_reuse_sge = 1;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+ send_last_imm:
+ if (has_grh) {
+ wc.imm_data = *(u32 *) data;
+ data += sizeof(u32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case IB_OPCODE_UC_SEND_LAST:
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (ohdr->bth[0] >> 12) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ qp->r_reuse_sge = 1;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_reuse_sge = 1;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* XXX Need to free SGEs */
+ last_imm:
+ copy_sge(&qp->r_sge, data, tlen);
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = qp->remote_qpn;
+ wc.pkey_index = 0;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ ohdr->bth[0] & __constant_cpu_to_be32(1 << 23));
+ break;
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_UC_RDMA_WRITE_ONLY:
+ case IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE: /* consume RWQE */
+ rdma_first:
+ /* RETH comes after BTH */
+ if (!has_grh)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+
+ /* Check rkey */
+ if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, qp->r_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (opcode == IB_OPCODE_UC_RDMA_WRITE_ONLY)
+ goto rdma_last;
+ else if (opcode == IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE)
+ goto rdma_last_imm;
+ /* FALLTHROUGH */
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ rdma_last_imm:
+ /* Get the number of bytes the message was padded by. */
+ pad = (ohdr->bth[0] >> 12) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (qp->r_reuse_sge) {
+ qp->r_reuse_sge = 0;
+ } else if (!get_rwqe(qp, 1)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (has_grh) {
+ wc.imm_data = *(u32 *) data;
+ data += sizeof(u32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.byte_len = 0;
+ goto last_imm;
+
+ case IB_OPCODE_UC_RDMA_WRITE_LAST:
+ rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (ohdr->bth[0] >> 12) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ copy_sge(&qp->r_sge, data, tlen);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ dev->n_pkt_drops++;
+ return;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+done:
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+}
+
+/*
+ * Put this QP on the RNR timeout list for the device.
+ * XXX Use a simple list for now. We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+static void insert_rnr_queue(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (list_empty(&dev->rnrwait))
+ list_add(&qp->timerwait, &dev->rnrwait);
+ else {
+ struct list_head *l = &dev->rnrwait;
+ struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+ timerwait);
+
+ while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+ qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+ l = l->next;
+ if (l->next == &dev->rnrwait)
+ break;
+ nqp = list_entry(l->next, struct ipath_qp, timerwait);
+ }
+ list_add(&qp->timerwait, l);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+/*
+ * This is called from do_uc_send() or do_rc_send() to forward a WQE addressed
+ * to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc)
+{
+ struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+ struct ipath_qp *qp;
+ struct ipath_swqe *wqe;
+ struct ipath_sge *sge;
+ unsigned long flags;
+ u64 sdata;
+
+ qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+ if (!qp) {
+ dev->n_pkt_drops++;
+ return;
+ }
+
+again:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ if (!(state_ops[sqp->state] & IPATH_PROCESS_SEND_OK)) {
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ goto done;
+ }
+
+ /* Get the next send request. */
+ if (sqp->s_last == sqp->s_head) {
+ /* Send work queue is empty. */
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ goto done;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ */
+ wqe = get_swqe_ptr(sqp, sqp->s_last);
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ wc->wc_flags = 0;
+ wc->imm_data = 0;
+
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->imm_data = wqe->wr.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ if (!get_rwqe(qp, 0)) {
+ rnr_nak:
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ if (sqp->s_rnr_retry == 0) {
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto err;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ dev->n_rnr_naks++;
+ sqp->s_rnr_timeout = rnr_table[sqp->s_min_rnr_timer];
+ insert_rnr_queue(sqp);
+ goto done;
+ }
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->imm_data = wqe->wr.imm_data;
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ if (!get_rwqe(qp, 1))
+ goto rnr_nak;
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE:
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_WRITE))) {
+ acc_err:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ err:
+ wc->wr_id = wqe->wr.wr_id;
+ wc->opcode = wc_opcode[wqe->wr.opcode];
+ wc->vendor_err = 0;
+ wc->byte_len = 0;
+ wc->qp_num = sqp->ibqp.qp_num;
+ wc->src_qp = sqp->remote_qpn;
+ wc->pkey_index = 0;
+ wc->slid = sqp->remote_ah_attr.dlid;
+ wc->sl = sqp->remote_ah_attr.sl;
+ wc->dlid_path_bits = 0;
+ wc->port_num = 0;
+ ipath_sqerror_qp(sqp, wc);
+ goto done;
+ }
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!ipath_rkey_ok(dev, &sqp->s_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_READ))) {
+ goto acc_err;
+ }
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, sizeof(u64),
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_ATOMIC))) {
+ goto acc_err;
+ }
+ /* Perform atomic OP and save result. */
+ sdata = wqe->wr.wr.atomic.swap;
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
+ if (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ *(u64 *) qp->r_sge.sge.vaddr =
+ qp->r_atomic_data + sdata;
+ } else if (qp->r_atomic_data == wqe->wr.wr.atomic.compare_add) {
+ *(u64 *) qp->r_sge.sge.vaddr = sdata;
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ *(u64 *) sqp->s_sge.sge.vaddr = qp->r_atomic_data;
+ goto send_comp;
+
+ default:
+ goto done;
+ }
+
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ BUG_ON(len == 0);
+ copy_sge(&qp->r_sge, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
+ wqe->wr.opcode == IB_WR_RDMA_READ)
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc->opcode = IB_WC_RECV;
+ wc->wr_id = qp->r_wr_id;
+ wc->status = IB_WC_SUCCESS;
+ wc->vendor_err = 0;
+ wc->byte_len = wqe->length;
+ wc->qp_num = qp->ibqp.qp_num;
+ wc->src_qp = qp->remote_qpn;
+ /* XXX do we know which pkey matched? Only needed for GSI. */
+ wc->pkey_index = 0;
+ wc->slid = qp->remote_ah_attr.dlid;
+ wc->sl = qp->remote_ah_attr.sl;
+ wc->dlid_path_bits = 0;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+
+ if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ wc->wr_id = wqe->wr.wr_id;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = wc_opcode[wqe->wr.opcode];
+ wc->vendor_err = 0;
+ wc->byte_len = wqe->length;
+ wc->qp_num = sqp->ibqp.qp_num;
+ wc->src_qp = 0;
+ wc->pkey_index = 0;
+ wc->slid = 0;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = 0;
+ ipath_cq_enter(to_icq(sqp->ibqp.send_cq), wc, 0);
+ }
+
+ /* Update s_last now that we are finished with the SWQE */
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (++sqp->s_last >= sqp->s_size)
+ sqp->s_last = 0;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ goto again;
+
+done:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+/*
+ * Flush send work queue.
+ * The QP s_lock should be held.
+ */
+static void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> 24) & 0x1F;
+
+ /*
+ * If credit == 0x1F, credit is invalid and we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == 0x1F) {
+ qp->s_lsn = (u32) -1;
+ } else if (qp->s_lsn != (u32) -1) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & 0xFFFFFF;
+ if (cmp24(credit, qp->s_lsn) > 0)
+ qp->s_lsn = credit;
+ }
+
+ /* Restart sending if it was blocked due to lack of credits. */
+ if (qp->s_cur != qp->s_head &&
+ (qp->s_lsn == (u32) -1 ||
+ cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, qp->s_lsn + 1) <= 0)) {
+ tasklet_schedule(&qp->s_task);
+ }
+}
+
+/*
+ * This is called from ipath_rc_rcv() to process an incomming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ struct ipath_swqe *wqe;
+
+ /*
+ * Remove the QP from the timeout queue (or RNR timeout queue).
+ * If ipath_ib_timer() has already removed it,
+ * it's OK since we hold the QP s_lock and ipath_restart_rc()
+ * just won't find anything to restart if we ACK everything.
+ */
+ spin_lock(&dev->pending_lock);
+ if (qp->timerwait.next != LIST_POISON1)
+ list_del(&qp->timerwait);
+ spin_unlock(&dev->pending_lock);
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and
+ * RDMA write requests and implicitly NAK RDMA read and
+ * atomic requests issued before the NAK'ed request.
+ * The MSN won't include the NAK'ed request but will include
+ * an ACK'ed request(s).
+ */
+ wqe = get_swqe_ptr(qp, qp->s_last);
+
+ /* Nothing is pending to ACK/NAK. */
+ if (qp->s_last == qp->s_tail)
+ return 0;
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while (cmp24(psn, wqe->lpsn) >= 0) {
+ /* If we are ACKing a WQE, the MSN should be >= the SSN. */
+ if (cmp24(aeth, wqe->ssn) < 0)
+ break;
+ /*
+ * If this request is a RDMA read or atomic, and the ACK is
+ * for a later operation, this ACK NAKs the RDMA read or atomic.
+ * In other words, only a RDMA_READ_LAST or ONLY can ACK
+ * a RDMA read and likewise for atomic ops.
+ * Note that the NAK case can only happen if relaxed ordering
+ * is used and requests are sent after an RDMA read
+ * or atomic is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE ||
+ cmp24(wqe->psn, psn) != 0))) {
+ /* The last valid PSN seen is the previous request's. */
+ qp->s_last_psn = wqe->psn - 1;
+ /* Retry this request. */
+ ipath_restart_rc(qp, wqe->psn, &wc);
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ return 0;
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = wc_opcode[wqe->wr.opcode];
+ wc.vendor_err = 0;
+ wc.byte_len = wqe->length;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = qp->remote_qpn;
+ wc.pkey_index = 0;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If we are completing a request which is in the process
+ * of being resent, we can stop resending it since we know
+ * the responder has already seen it.
+ */
+ if (qp->s_last == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->s_psn = wqe->psn;
+ }
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ if (qp->s_last == qp->s_tail)
+ break;
+ }
+
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ dev->n_rc_acks++;
+ /* If this is a partial ACK, reset the retransmit timer. */
+ if (qp->s_last != qp->s_tail) {
+ spin_lock(&dev->pending_lock);
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+ }
+ ipath_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ qp->s_last_psn = psn;
+ return 1;
+
+ case 1: /* RNR NAK */
+ dev->n_rnr_naks++;
+ if (qp->s_rnr_retry == 0) {
+ if (qp->s_last == qp->s_tail)
+ return 0;
+
+ wc.status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+ if (qp->s_last == qp->s_tail)
+ return 0;
+
+ /* The last valid PSN seen is the previous request's. */
+ qp->s_last_psn = wqe->psn - 1;
+
+ /* Restart this request after the RNR timeout. */
+ wqe = get_swqe_ptr(qp, qp->s_last);
+
+ dev->n_rc_resends += (int)qp->s_psn - (int)psn;
+
+ /*
+ * If we are starting the request from the beginning, let the
+ * normal send code handle initialization.
+ */
+ qp->s_cur = qp->s_last;
+ if (cmp24(psn, wqe->psn) <= 0) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->s_psn = wqe->psn;
+ } else {
+ u32 n;
+
+ n = qp->s_cur;
+ for (;;) {
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail) {
+ if (cmp24(psn, qp->s_next_psn) >= 0) {
+ qp->s_cur = n;
+ wqe = get_swqe_ptr(qp, n);
+ }
+ break;
+ }
+ wqe = get_swqe_ptr(qp, n);
+ if (cmp24(psn, wqe->psn) < 0)
+ break;
+ qp->s_cur = n;
+ }
+ qp->s_psn = psn;
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See do_rc_send().
+ */
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+ break;
+
+ default:
+ /*
+ * This case shouldn't happen since its only
+ * one PSN per req.
+ */
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ }
+ }
+
+ qp->s_rnr_timeout = rnr_table[(aeth >> 24) & 0x1F];
+ insert_rnr_queue(qp);
+ return 0;
+
+ case 3: /* NAK */
+ /* The last valid PSN seen is the previous request's. */
+ if (qp->s_last != qp->s_tail)
+ qp->s_last_psn = wqe->psn - 1;
+ switch ((aeth >> 24) & 0x1F) {
+ case 0: /* PSN sequence error */
+ dev->n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * XXX Note that we might get a NAK in the
+ * middle of an RDMA READ response which
+ * terminates the RDMA READ.
+ */
+ if (qp->s_last == qp->s_tail)
+ break;
+
+ if (cmp24(psn, wqe->psn) < 0) {
+ break;
+ }
+ /* Retry the request. */
+ ipath_restart_rc(qp, psn, &wc);
+ break;
+
+ case 1: /* Invalid Request */
+ wc.status = IB_WC_REM_INV_REQ_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ wc.status = IB_WC_REM_ACCESS_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ wc.status = IB_WC_REM_OP_ERR;
+ dev->n_other_naks++;
+ class_b:
+ wc.wr_id = wqe->wr.wr_id;
+ wc.opcode = wc_opcode[wqe->wr.opcode];
+ wc.vendor_err = 0;
+ wc.byte_len = 0;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = qp->remote_qpn;
+ wc.pkey_index = 0;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ ipath_sqerror_qp(qp, &wc);
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ return 0;
+
+ default: /* 2: reserved */
+ reserved:
+ /* Ignore reserved NAK codes. */
+ return 0;
+ }
+}
+
+/*
+ * This is called from ipath_qp_rcv() to process an incomming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ unsigned long flags;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ int diff;
+ struct ib_reth *reth;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the
+ * core driver sets the eager header buffer
+ * size to 56 bytes so the last 4 bytes of
+ * the BTH header (PSN) is in the data buffer.
+ */
+ psn = be32_to_cpu(((u32 *) data)[0]);
+ data += sizeof(u32);
+ }
+ /*
+ * The opcode is in the low byte when its in network order
+ * (top byte when in host order).
+ */
+ opcode = *(u8 *) (&ohdr->bth[0]);
+
+ /*
+ * Process responses (ACKs) before anything else.
+ * Note that the packet sequence number will be for something
+ * in the send work queue rather than the expected receive
+ * packet sequence number. In other words, this QP is the
+ * requester.
+ */
+ if (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
+ opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) {
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses. */
+ if (cmp24(psn, qp->s_next_psn) >= 0) {
+ goto ack_done;
+ }
+
+ /* Ignore duplicate responses. */
+ diff = cmp24(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == IB_OPCODE_RC_ACKNOWLEDGE) {
+ if (!has_grh) {
+ pad = be32_to_cpu(ohdr->u.aeth);
+ } else {
+ pad = be32_to_cpu(((u32 *) data)[0]);
+ data += sizeof(u32);
+ }
+ if ((pad >> 29) == 0) {
+ ipath_get_credit(qp, pad);
+ }
+ }
+ goto ack_done;
+ }
+
+ switch (opcode) {
+ case IB_OPCODE_RC_ACKNOWLEDGE:
+ case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ if (!has_grh) {
+ pad = be32_to_cpu(ohdr->u.aeth);
+ } else {
+ pad = be32_to_cpu(((u32 *) data)[0]);
+ data += sizeof(u32);
+ }
+ if (opcode == IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) {
+ *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data;
+ }
+ if (!do_rc_ack(qp, pad, psn, opcode) ||
+ opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+ goto ack_done;
+ }
+ hdrsize += 4;
+ /*
+ * do_rc_ack() has already checked the PSN so skip
+ * the sequence check.
+ */
+ goto rdma_read;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ /* no AETH, no ACK */
+ if (unlikely(cmp24(psn, qp->s_last_psn + 1) != 0)) {
+ dev->n_rdma_seq++;
+ ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+ goto ack_done;
+ }
+ rdma_read:
+ if (unlikely(qp->s_state !=
+ IB_OPCODE_RC_RDMA_READ_REQUEST))
+ goto ack_done;
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_done;
+ if (unlikely(pmtu >= qp->s_len))
+ goto ack_done;
+ /* We got a response so update the timeout. */
+ if (unlikely(qp->s_last == qp->s_tail ||
+ get_swqe_ptr(qp, qp->s_last)->wr.opcode !=
+ IB_WR_RDMA_READ))
+ goto ack_done;
+ spin_lock(&dev->pending_lock);
+ if (qp->s_rnr_timeout == 0 &&
+ qp->timerwait.next != LIST_POISON1) {
+ list_move_tail(&qp->timerwait,
+ &dev->pending[dev->
+ pending_index]);
+ }
+ spin_unlock(&dev->pending_lock);
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ * XXX Yet another place that affects relaxed
+ * RDMA order since we don't want s_sge modified.
+ */
+ qp->s_len -= pmtu;
+ qp->s_last_psn = psn;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ copy_sge(&qp->s_sge, data, pmtu);
+ return;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+ /* ACKs READ req. */
+ if (unlikely(cmp24(psn, qp->s_last_psn + 1) != 0)) {
+ dev->n_rdma_seq++;
+ ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+ goto ack_done;
+ }
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+ if (unlikely(qp->s_state !=
+ IB_OPCODE_RC_RDMA_READ_REQUEST)) {
+ goto ack_done;
+ }
+ /* Get the number of bytes the message was padded by. */
+ pad = (ohdr->bth[0] >> 12) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for the AETH header (4)
+ * and ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 8))) {
+ /* XXX Need to generate an error CQ entry. */
+ goto ack_done;
+ }
+ tlen -= hdrsize + pad + 8;
+ if (unlikely(tlen != qp->s_len)) {
+ /* XXX Need to generate an error CQ entry. */
+ goto ack_done;
+ }
+ if (!has_grh) {
+ pad = be32_to_cpu(ohdr->u.aeth);
+ } else {
+ pad = be32_to_cpu(((u32 *) data)[0]);
+ data += sizeof(u32);
+ }
+ copy_sge(&qp->s_sge, data, tlen);
+ if (do_rc_ack(qp, pad, psn,
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST)) {
+ /*
+ * Change the state so we contimue
+ * processing new requests.
+ */
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ }
+ goto ack_done;
+ }
+ ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+
+ /* Compute 24 bits worth of difference. */
+ diff = cmp24(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if a RDMA read, atomic, or
+ * NAK is pending though.
+ */
+ spin_lock(&qp->s_lock);
+ if ((qp->s_ack_state >=
+ IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) ||
+ qp->s_nak_state != 0) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_SEND_ONLY;
+ qp->s_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->s_ack_psn = qp->r_psn;
+ goto resched;
+ }
+
+ /*
+ * Handle a duplicate request.
+ * Don't re-execute SEND, RDMA write or atomic op.
+ * Don't NAK errors, just silently drop the duplicate request.
+ * Note that r_sge, r_len, and r_rcv_len may be
+ * in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN
+ * but we can coalesce an outstanding duplicate ACK.
+ * We have to send the earliest so that RDMA reads
+ * can be restarted at the requester's expected PSN.
+ */
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE &&
+ cmp24(psn, qp->s_ack_psn) >= 0) {
+ if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST)
+ qp->s_ack_psn = psn;
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ switch (opcode) {
+ case IB_OPCODE_RC_RDMA_READ_REQUEST:
+ /*
+ * We have to be careful to not change s_rdma_sge
+ * while do_rc_send() is using it and not holding
+ * the s_lock.
+ */
+ if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE &&
+ qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
+ spin_unlock(&qp->s_lock);
+ dev->n_rdma_dup_busy++;
+ goto done;
+ }
+ /* RETH comes after BTH */
+ if (!has_grh)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ qp->s_rdma_len = be32_to_cpu(reth->length);
+ if (qp->s_rdma_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+
+ /*
+ * Address range must be a subset of the
+ * original request and start on pmtu
+ * boundaries.
+ */
+ if (unlikely(!ipath_rkey_ok(dev,
+ &qp->s_rdma_sge,
+ qp->s_rdma_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_READ)))
+ {
+ goto done;
+ }
+ } else {
+ qp->s_rdma_sge.sg_list = NULL;
+ qp->s_rdma_sge.num_sge = 0;
+ qp->s_rdma_sge.sge.mr = NULL;
+ qp->s_rdma_sge.sge.vaddr = NULL;
+ qp->s_rdma_sge.sge.length = 0;
+ qp->s_rdma_sge.sge.sge_length = 0;
+ }
+ break;
+
+ case IB_OPCODE_RC_COMPARE_SWAP:
+ case IB_OPCODE_RC_FETCH_ADD:
+ /*
+ * Check for the PSN of the last atomic operations
+ * performed and resend the result if found.
+ */
+ if ((psn & 0xFFFFFF) != qp->r_atomic_psn) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ qp->s_ack_atomic = qp->r_atomic_data;
+ break;
+ }
+ qp->s_ack_state = opcode;
+ qp->s_nak_state = 0;
+ qp->s_ack_psn = psn;
+ goto resched;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case IB_OPCODE_RC_SEND_FIRST:
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ if (opcode == IB_OPCODE_RC_SEND_MIDDLE ||
+ opcode == IB_OPCODE_RC_SEND_LAST ||
+ opcode == IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE)
+ break;
+ nack_inv:
+ /*
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if a RDMA read, atomic, or
+ * NAK is pending though.
+ */
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state >= IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ /* XXX Flush WQEs */
+ qp->state = IB_QPS_ERR;
+ qp->s_ack_state = IB_OPCODE_RC_SEND_ONLY;
+ qp->s_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->s_ack_psn = qp->r_psn;
+ goto resched;
+
+ case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ if (opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_LAST ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE)
+ break;
+ goto nack_inv;
+
+ case IB_OPCODE_RC_RDMA_READ_REQUEST:
+ case IB_OPCODE_RC_COMPARE_SWAP:
+ case IB_OPCODE_RC_FETCH_ADD:
+ /*
+ * Drop all new requests until a response has been sent.
+ * A new request then ACKs the RDMA response we sent.
+ * Relaxed ordering would allow new requests to be
+ * processed but we would need to keep a queue
+ * of rwqe's for all that are in progress.
+ * Note that we can't RNR NAK this request since the RDMA
+ * READ or atomic response is already queued to be sent
+ * (unless we implement a response send queue).
+ */
+ goto done;
+
+ default:
+ if (opcode == IB_OPCODE_RC_SEND_MIDDLE ||
+ opcode == IB_OPCODE_RC_SEND_LAST ||
+ opcode == IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_LAST ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE)
+ goto nack_inv;
+ break;
+ }
+
+ wc.imm_data = 0;
+ wc.wc_flags = 0;
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case IB_OPCODE_RC_SEND_FIRST:
+ if (!get_rwqe(qp, 0)) {
+ rnr_nak:
+ /*
+ * A RNR NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if a RDMA read or atomic
+ * is pending though.
+ */
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state >= IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_SEND_ONLY;
+ qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer;
+ qp->s_ack_psn = qp->r_psn;
+ goto resched;
+ }
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ send_middle:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ goto nack_inv;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ goto nack_inv;
+ }
+ copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ /* consume RWQE */
+ if (!get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case IB_OPCODE_RC_SEND_ONLY:
+ case IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE:
+ if (!get_rwqe(qp, 0))
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == IB_OPCODE_RC_SEND_ONLY)
+ goto send_last;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+ send_last_imm:
+ if (has_grh) {
+ wc.imm_data = *(u32 *) data;
+ data += sizeof(u32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_SEND_LAST:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST:
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (ohdr->bth[0] >> 12) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ goto nack_inv;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ goto nack_inv;
+ }
+ /* XXX Need to free SGEs */
+ copy_sge(&qp->r_sge, data, tlen);
+ atomic_inc(&qp->msn);
+ if (opcode == IB_OPCODE_RC_RDMA_WRITE_LAST ||
+ opcode == IB_OPCODE_RC_RDMA_WRITE_ONLY)
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = qp->remote_qpn;
+ wc.pkey_index = 0;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ ohdr->bth[0] & __constant_cpu_to_be32(1 << 23));
+ break;
+
+ case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_RC_RDMA_WRITE_ONLY:
+ case IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+ /* consume RWQE */
+ /* RETH comes after BTH */
+ if (!has_grh)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+
+ /* Check rkey & NAK */
+ if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, qp->r_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE))) {
+ nack_acc:
+ /*
+ * A NAK will ACK earlier sends and RDMA
+ * writes.
+ * Don't queue the NAK if a RDMA read,
+ * atomic, or NAK is pending though.
+ */
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state >=
+ IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ /* XXX Flush WQEs */
+ qp->state = IB_QPS_ERR;
+ qp->s_ack_state = IB_OPCODE_RC_RDMA_WRITE_ONLY;
+ qp->s_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->s_ack_psn = qp->r_psn;
+ goto resched;
+ }
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+ if (opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST)
+ goto send_middle;
+ else if (opcode == IB_OPCODE_RC_RDMA_WRITE_ONLY)
+ goto send_last;
+ if (!get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case IB_OPCODE_RC_RDMA_READ_REQUEST:
+ /* RETH comes after BTH */
+ if (!has_grh)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE &&
+ qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
+ spin_unlock(&qp->s_lock);
+ goto done;
+ }
+ qp->s_rdma_len = be32_to_cpu(reth->length);
+ if (qp->s_rdma_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+
+ /* Check rkey & NAK */
+ if (unlikely(!ipath_rkey_ok(dev, &qp->s_rdma_sge,
+ qp->s_rdma_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_READ))) {
+ spin_unlock(&qp->s_lock);
+ goto nack_acc;
+ }
+ /*
+ * Update the next expected PSN.
+ * We add 1 later below, so only add the remainder here.
+ */
+ if (qp->s_rdma_len > pmtu)
+ qp->r_psn += (qp->s_rdma_len - 1) / pmtu;
+ } else {
+ qp->s_rdma_sge.sg_list = NULL;
+ qp->s_rdma_sge.num_sge = 0;
+ qp->s_rdma_sge.sge.mr = NULL;
+ qp->s_rdma_sge.sge.vaddr = NULL;
+ qp->s_rdma_sge.sge.length = 0;
+ qp->s_rdma_sge.sge.sge_length = 0;
+ }
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ atomic_inc(&qp->msn);
+ qp->s_ack_state = opcode;
+ qp->s_nak_state = 0;
+ qp->s_ack_psn = psn;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ goto rdmadone;
+
+ case IB_OPCODE_RC_COMPARE_SWAP:
+ case IB_OPCODE_RC_FETCH_ADD:{
+ struct ib_atomic_eth *ateth;
+ u64 vaddr;
+ u64 sdata;
+ u32 rkey;
+
+ if (!has_grh)
+ ateth = &ohdr->u.atomic_eth;
+ else {
+ ateth = (struct ib_atomic_eth *)data;
+ data += sizeof(*ateth);
+ }
+ vaddr = be64_to_cpu(ateth->vaddr);
+ if (unlikely(vaddr & 0x7))
+ goto nack_inv;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge,
+ sizeof(u64), vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC))) {
+ goto nack_acc;
+ }
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc;
+ /* Perform atomic OP and save result. */
+ sdata = be64_to_cpu(ateth->swap_data);
+ spin_lock(&dev->pending_lock);
+ qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
+ if (opcode == IB_OPCODE_RC_FETCH_ADD) {
+ *(u64 *) qp->r_sge.sge.vaddr =
+ qp->r_atomic_data + sdata;
+ } else if (qp->r_atomic_data ==
+ be64_to_cpu(ateth->compare_data)) {
+ *(u64 *) qp->r_sge.sge.vaddr = sdata;
+ }
+ spin_unlock(&dev->pending_lock);
+ atomic_inc(&qp->msn);
+ qp->r_atomic_psn = psn & 0xFFFFFF;
+ psn |= 1 << 31;
+ break;
+ }
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ return;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ /* Send an ACK if requested or required. */
+ if (psn & (1 << 31)) {
+ /*
+ * Coalesce ACKs unless there is a RDMA READ or
+ * ATOMIC pending.
+ */
+ spin_lock(&qp->s_lock);
+ if (qp->s_ack_state == IB_OPCODE_RC_ACKNOWLEDGE ||
+ qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) {
+ qp->s_ack_state = opcode;
+ qp->s_nak_state = 0;
+ qp->s_ack_psn = psn;
+ qp->s_ack_atomic = qp->r_atomic_data;
+ goto resched;
+ }
+ spin_unlock(&qp->s_lock);
+ }
+done:
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ return;
+
+resched:
+ /* Try to send ACK right away but not if do_rc_send() is active. */
+ if (qp->s_hdrwords == 0 &&
+ (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST ||
+ qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP))
+ send_rc_ack(qp);
+
+rdmadone:
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+ /* Call do_rc_send() in another thread. */
+ tasklet_schedule(&qp->s_task);
+}
+
+/*
+ * This is called from ipath_ib_rcv() to process an incomming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_qp_rcv(struct ipath_ibdev *dev,
+ struct ipath_ib_header *hdr, int has_grh,
+ void *data, u32 tlen, struct ipath_qp *qp)
+{
+ /* Check for valid receive state. */
+ if (!(state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ return;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_RC:
+ ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_UC:
+ ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * This is called from ipath_kreceive() to process an incomming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+static void ipath_ib_rcv(const ipath_type t, void *rhdr, void *data, u32 tlen)
+{
+ struct ipath_ibdev *dev = ipath_devices[t];
+ struct ipath_ib_header *hdr = rhdr;
+ struct ipath_other_headers *ohdr;
+ struct ipath_qp *qp;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ if (dev == NULL)
+ return;
+
+ if (tlen < 24) { /* LRH+BTH+CRC */
+ dev->n_pkt_drops++;
+ return;
+ }
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == IPS_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == IPS_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else {
+ dev->n_pkt_drops++;
+ return;
+ }
+
+ opcode = *(u8 *) (&ohdr->bth[0]);
+ dev->opstats[opcode].n_bytes += tlen;
+ dev->opstats[opcode].n_packets++;
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & 0xFFFFFF;
+ if (qp_num == 0xFFFFFF) {
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *p;
+
+ mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+ if (mcast == NULL) {
+ dev->n_pkt_drops++;
+ return;
+ }
+ dev->n_multicast_rcv++;
+ list_for_each_entry_rcu(p, &mcast->qp_list, list)
+ ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, tlen,
+ p->qp);
+ /*
+ * Notify ipath_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else if ((qp = ipath_lookup_qpn(&dev->qp_table, qp_num)) != NULL) {
+ ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, tlen, qp);
+ /*
+ * Notify ipath_destroy_qp() if it is waiting for us to finish.
+ */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ } else
+ dev->n_pkt_drops++;
+}
+
+/*
+ * This is called from ipath_do_rcv_timer() at interrupt level
+ * to check for QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(const ipath_type t)
+{
+ struct ipath_ibdev *dev = ipath_devices[t];
+ struct ipath_qp *resend = NULL;
+ struct ipath_qp *rnr = NULL;
+ struct list_head *last;
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ return;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ /* Start filling the next pending queue. */
+ if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+ dev->pending_index = 0;
+ /* Save any requests still in the new queue, they have timed out. */
+ last = &dev->pending[dev->pending_index];
+ while (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ if (last->next == LIST_POISON1 ||
+ last->next != &qp->timerwait ||
+ qp->timerwait.prev != last) {
+ INIT_LIST_HEAD(last);
+ } else {
+ list_del(&qp->timerwait);
+ qp->timerwait.prev = (struct list_head *) resend;
+ resend = qp;
+ atomic_inc(&qp->refcount);
+ }
+ }
+ last = &dev->rnrwait;
+ if (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ if (--qp->s_rnr_timeout == 0) {
+ do {
+ if (last->next == LIST_POISON1 ||
+ last->next != &qp->timerwait ||
+ qp->timerwait.prev != last) {
+ INIT_LIST_HEAD(last);
+ break;
+ }
+ list_del(&qp->timerwait);
+ qp->timerwait.prev = (struct list_head *) rnr;
+ rnr = qp;
+ if (list_empty(last))
+ break;
+ qp = list_entry(last->next, struct ipath_qp,
+ timerwait);
+ } while (qp->s_rnr_timeout == 0);
+ }
+ }
+ /* We should only be in the started state if pma_sample_start != 0 */
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+ --dev->pma_sample_start == 0) {
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+ ipath_layer_snapshot_counters(dev->ib_unit, &dev->ipath_sword,
+ &dev->ipath_rword,
+ &dev->ipath_spkts,
+ &dev->ipath_rpkts);
+ }
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+ if (dev->pma_sample_interval == 0) {
+ u64 ta, tb, tc, td;
+
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ ipath_layer_snapshot_counters(dev->ib_unit,
+ &ta, &tb, &tc, &td);
+
+ dev->ipath_sword = ta - dev->ipath_sword;
+ dev->ipath_rword = tb - dev->ipath_rword;
+ dev->ipath_spkts = tc - dev->ipath_spkts;
+ dev->ipath_rpkts = td - dev->ipath_rpkts;
+ } else {
+ dev->pma_sample_interval--;
+ }
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ /* XXX What if timer fires again while this is running? */
+ for (qp = resend; qp != NULL;
+ qp = (struct ipath_qp *) qp->timerwait.prev) {
+ struct ib_wc wc;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
+ dev->n_timeouts++;
+ ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ for (qp = rnr; qp != NULL;
+ qp = (struct ipath_qp *) qp->timerwait.prev) {
+ tasklet_schedule(&qp->s_task);
+ }
+}
+
+/*
+ * This is called from ipath_intr() at interrupt level when a PIO buffer
+ * is available after ipath_verbs_send() returned an error that no
+ * buffers were available.
+ * Return 0 if we consumed all the PIO buffers and we still have QPs
+ * waiting for buffers (for now, just do a tasklet_schedule and return one).
+ */
+static int ipath_ib_piobufavail(const ipath_type t)
+{
+ struct ipath_ibdev *dev = ipath_devices[t];
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ return 1;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ while (!list_empty(&dev->piowait)) {
+ qp = list_entry(dev->piowait.next, struct ipath_qp, piowait);
+ list_del(&qp->piowait);
+ tasklet_schedule(&qp->s_task);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ return 1;
+}
+
+static struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_qp *qp;
+ int err;
+ struct ipath_swqe *swq = NULL;
+ struct ipath_ibdev *dev;
+ size_t sz;
+
+ if (init_attr->cap.max_send_sge > 255 ||
+ init_attr->cap.max_recv_sge > 255)
+ return ERR_PTR(-ENOMEM);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ sz = sizeof(struct ipath_sge) * init_attr->cap.max_send_sge +
+ sizeof(struct ipath_swqe);
+ swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ if (swq == NULL)
+ return ERR_PTR(-ENOMEM);
+ /* FALLTHROUGH */
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ qp = kmalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ sz = sizeof(struct ipath_sge) * init_attr->cap.max_recv_sge +
+ sizeof(struct ipath_rwqe);
+ qp->r_rq.wq = vmalloc(qp->r_rq.size * sz);
+ if (!qp->r_rq.wq) {
+ kfree(qp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->s_lock);
+ spin_lock_init(&qp->r_rq.lock);
+ atomic_set(&qp->refcount, 0);
+ init_waitqueue_head(&qp->wait);
+ tasklet_init(&qp->s_task,
+ init_attr->qp_type == IB_QPT_RC ? do_rc_send :
+ do_uc_send, (unsigned long)qp);
+ qp->piowait.next = LIST_POISON1;
+ qp->piowait.prev = LIST_POISON2;
+ qp->timerwait.next = LIST_POISON1;
+ qp->timerwait.prev = LIST_POISON2;
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
+ qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ qp->s_flags = init_attr->sq_sig_type == IB_SIGNAL_REQ_WR ?
+ 1 << IPATH_S_SIGNAL_REQ_WR : 0;
+ dev = to_idev(ibpd->device);
+ err = ipath_alloc_qpn(&dev->qp_table, qp, init_attr->qp_type);
+ if (err) {
+ vfree(swq);
+ vfree(qp->r_rq.wq);
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+ ipath_reset_qp(qp);
+
+ /* Tell the core driver that the kernel SMA is present. */
+ if (qp->ibqp.qp_type == IB_QPT_SMI)
+ ipath_verbs_set_flags(dev->ib_unit,
+ IPATH_VERBS_KERNEL_SMA);
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-ENOSYS);
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ return &qp->ibqp;
+}
+
+/*
+ * Note that this can be called while the QP is actively sending or receiving!
+ */
+static int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ unsigned long flags;
+
+ /* Tell the core driver that the kernel SMA is gone. */
+ if (qp->ibqp.qp_type == IB_QPT_SMI)
+ ipath_verbs_set_flags(dev->ib_unit, 0);
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ spin_lock(&qp->s_lock);
+ qp->state = IB_QPS_ERR;
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+ /* Stop the sending tasklet. */
+ tasklet_kill(&qp->s_task);
+
+ /* Make sure the QP isn't on the timeout list. */
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (qp->timerwait.next != LIST_POISON1)
+ list_del(&qp->timerwait);
+ if (qp->piowait.next != LIST_POISON1)
+ list_del(&qp->piowait);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ /*
+ * Make sure that the QP is not in the QPN table so receive interrupts
+ * will discard packets for this QP.
+ * XXX Also remove QP from multicast table.
+ */
+ if (atomic_read(&qp->refcount) != 0)
+ ipath_free_qp(&dev->qp_table, qp);
+
+ vfree(qp->s_wq);
+ vfree(qp->r_rq.wq);
+ kfree(qp);
+ return 0;
+}
+
+static struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_srq *srq;
+ u32 sz;
+
+ if (srq_init_attr->attr.max_sge < 1)
+ return ERR_PTR(-EINVAL);
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ /* Need to use vmalloc() if we want to support large #s of entries. */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ sz = sizeof(struct ipath_sge) * srq_init_attr->attr.max_sge +
+ sizeof(struct ipath_rwqe);
+ srq->rq.wq = vmalloc(srq->rq.size * sz);
+ if (!srq->rq.wq) {
+ kfree(srq);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->rq.head = 0;
+ srq->rq.tail = 0;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ return &srq->ibsrq;
+}
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ unsigned long flags;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ srq->limit = attr->srq_limit;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ u32 size = attr->max_wr + 1;
+ struct ipath_rwqe *wq, *p;
+ u32 n;
+ u32 sz;
+
+ if (attr->max_sge < srq->rq.max_sge)
+ return -EINVAL;
+
+ sz = sizeof(struct ipath_rwqe) +
+ attr->max_sge * sizeof(struct ipath_sge);
+ wq = vmalloc(size * sz);
+ if (!wq)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ if (srq->rq.head < srq->rq.tail)
+ n = srq->rq.size + srq->rq.head - srq->rq.tail;
+ else
+ n = srq->rq.head - srq->rq.tail;
+ if (size <= n || size <= srq->limit) {
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ vfree(wq);
+ return -EINVAL;
+ }
+ n = 0;
+ p = wq;
+ while (srq->rq.tail != srq->rq.head) {
+ struct ipath_rwqe *wqe;
+ int i;
+
+ wqe = get_rwqe_ptr(&srq->rq, srq->rq.tail);
+ p->wr_id = wqe->wr_id;
+ p->length = wqe->length;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct ipath_rwqe *)((char *) p + sz);
+ if (++srq->rq.tail >= srq->rq.size)
+ srq->rq.tail = 0;
+ }
+ vfree(srq->rq.wq);
+ srq->rq.wq = wq;
+ srq->rq.size = size;
+ srq->rq.head = n;
+ srq->rq.tail = 0;
+ srq->rq.max_sge = attr->max_sge;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ return 0;
+}
+
+static int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+
+ vfree(srq->rq.wq);
+ kfree(srq);
+
+ return 0;
+}
+
+/*
+ * This may be called from interrupt context.
+ */
+static int ipath_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+ int npolled;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (cq->tail == cq->head)
+ break;
+ *entry = cq->queue[cq->tail];
+ if (++cq->tail == cq->ibcq.cqe)
+ cq->tail = 0;
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return npolled;
+}
+
+static struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ipath_cq *cq;
+
+ /* Need to use vmalloc() if we want to support large #s of entries. */
+ cq = vmalloc(sizeof(*cq) + entries * sizeof(*cq->queue));
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested or
+ * return an error.
+ */
+ cq->ibcq.cqe = entries + 1;
+ cq->notify = IB_CQ_NONE;
+ cq->triggered = 0;
+ spin_lock_init(&cq->lock);
+ tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+ cq->head = 0;
+ cq->tail = 0;
+
+ return &cq->ibcq;
+}
+
+static int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+
+ tasklet_kill(&cq->comptask);
+ vfree(cq);
+
+ return 0;
+}
+
+/*
+ * This may be called from interrupt context.
+ */
+static int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions.
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify;
+ spin_unlock_irqrestore(&cq->lock, flags);
+ return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ uint32_t vendor, boardrev, majrev, minrev;
+
+ memset(props, 0, sizeof(*props));
+
+ props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID;
+ ipath_layer_query_device(dev->ib_unit, &vendor, &boardrev,
+ &majrev, &minrev);
+ props->vendor_id = vendor;
+ props->vendor_part_id = boardrev;
+ props->hw_ver = boardrev << 16 | majrev << 8 | minrev;
+
+ props->sys_image_guid = dev->sys_image_guid;
+ props->node_guid = ipath_layer_get_guid(dev->ib_unit);
+
+ props->max_mr_size = ~0ull;
+ props->max_qp = 0xffff;
+ props->max_qp_wr = 0xffff;
+ props->max_sge = 255;
+ props->max_cq = 0xffff;
+ props->max_cqe = 0xffff;
+ props->max_mr = 0xffff;
+ props->max_pd = 0xffff;
+ props->max_qp_rd_atom = 1;
+ props->max_qp_init_rd_atom = 1;
+ /* props->max_res_rd_atom */
+ props->max_srq = 0xffff;
+ props->max_srq_wr = 0xffff;
+ props->max_srq_sge = 255;
+ /* props->local_ca_ack_delay */
+ props->atomic_cap = IB_ATOMIC_HCA;
+ props->max_pkeys = ipath_layer_get_npkeys(dev->ib_unit);
+ props->max_mcast_grp = 0xffff;
+ props->max_mcast_qp_attach = 0xffff;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+
+ return 0;
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ uint32_t flags = ipath_layer_get_flags(dev->ib_unit);
+ enum ib_mtu mtu;
+ uint32_t l;
+ uint16_t lid = ipath_layer_get_lid(dev->ib_unit);
+
+ memset(props, 0, sizeof(*props));
+ props->lid = lid ? lid : IB_LID_PERMISSIVE;
+ props->lmc = dev->mkeyprot_resv_lmc & 7;
+ props->sm_lid = dev->sm_lid;
+ props->sm_sl = dev->sm_sl;
+ if (flags & IPATH_LINKDOWN)
+ props->state = IB_PORT_DOWN;
+ else if (flags & IPATH_LINKARMED)
+ props->state = IB_PORT_ARMED;
+ else if (flags & IPATH_LINKACTIVE)
+ props->state = IB_PORT_ACTIVE;
+ else if (flags & IPATH_LINK_SLEEPING)
+ props->state = IB_PORT_ACTIVE_DEFER;
+ else
+ props->state = IB_PORT_NOP;
+ /* See phys_state_show() */
+ props->phys_state = 5; /* LinkUp */
+ props->port_cap_flags = dev->port_cap_flags;
+ props->gid_tbl_len = 1;
+ props->max_msg_sz = 4096;
+ props->pkey_tbl_len = ipath_layer_get_npkeys(dev->ib_unit);
+ props->bad_pkey_cntr = ipath_layer_get_cr_errpkey(dev->ib_unit);
+ props->qkey_viol_cntr = dev->qkey_violations;
+ props->active_width = IB_WIDTH_4X;
+ /* See rate_show() */
+ props->active_speed = 1; /* Regular 10Mbs speed. */
+ props->max_vl_num = 1; /* VLCap = VL0 */
+ props->init_type_reply = 0;
+
+ props->max_mtu = IB_MTU_4096;
+ l = ipath_layer_get_ibmtu(dev->ib_unit);
+ switch (l) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default:
+ mtu = IB_MTU_2048;
+ }
+ props->active_mtu = mtu;
+ props->subnet_timeout = dev->subnet_timeout;
+
+ return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+ to_idev(device)->sys_image_guid = device_modify->sys_image_guid;
+
+ return 0;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ atomic_set_mask(props->set_port_cap_mask, &dev->port_cap_flags);
+ atomic_clear_mask(props->clr_port_cap_mask, &dev->port_cap_flags);
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ ipath_kset_linkstate(dev->ib_unit << 16 | IPATH_IB_LINKDOWN);
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ dev->qkey_violations = 0;
+ return 0;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 *pkey)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ if (index >= ipath_layer_get_npkeys(dev->ib_unit))
+ return -EINVAL;
+ *pkey = ipath_layer_get_pkey(dev->ib_unit, index);
+ return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ if (index >= 1)
+ return -EINVAL;
+ gid->global.subnet_prefix = dev->gid_prefix;
+ gid->global.interface_id = ipath_layer_get_guid(dev->ib_unit);
+
+ return 0;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ipath_pd *pd;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = udata != NULL;
+
+ return &pd->ibpd;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct ipath_pd *pd = to_ipd(ibpd);
+
+ kfree(pd);
+
+ return 0;
+}
+
+/*
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ struct ipath_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ /* ib_create_ah() will initialize ah->ibah. */
+ ah->attr = *ah_attr;
+
+ return &ah->ibah;
+}
+
+/*
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+ struct ipath_ah *ah = to_iah(ibah);
+
+ kfree(ah);
+
+ return 0;
+}
+
+static struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ipath_mr *mr;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ /* ib_get_dma_mr() will initialize mr->ibmr except for lkey and rkey. */
+ memset(mr, 0, sizeof *mr);
+ mr->mr.access_flags = acc;
+ return &mr->ibmr;
+}
+
+static struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc, u64 *iova_start)
+{
+ struct ipath_mr *mr;
+ int n, m, i;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (num_phys_buf + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ /* Allocate first level page tables. */
+ for (i = 0; i < m; i++) {
+ mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+ if (!mr->mr.map[i]) {
+ while (i)
+ kfree(mr->mr.map[--i]);
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ mr->mr.mapsz = m;
+
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &mr->mr)) {
+ while (i)
+ kfree(mr->mr.map[--i]);
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+ mr->mr.user_base = *iova_start;
+ mr->mr.iova = *iova_start;
+ mr->mr.length = 0;
+ mr->mr.offset = 0;
+ mr->mr.access_flags = acc;
+ mr->mr.max_segs = num_phys_buf;
+ m = 0;
+ n = 0;
+ for (i = 0; i < num_phys_buf; i++) {
+ mr->mr.map[m]->segs[n].vaddr =
+ phys_to_virt(buffer_list[i].addr);
+ mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+ mr->mr.length += buffer_list[i].size;
+ if (++n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ return &mr->ibmr;
+}
+
+static struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd,
+ struct ib_umem *region,
+ int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct ipath_mr *mr;
+ struct ib_umem_chunk *chunk;
+ int n, m, i;
+
+ n = 0;
+ list_for_each_entry(chunk, ®ion->chunk_list, list)
+ n += chunk->nents;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (n + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ /* Allocate first level page tables. */
+ for (i = 0; i < m; i++) {
+ mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+ if (!mr->mr.map[i]) {
+ while (i)
+ kfree(mr->mr.map[--i]);
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ mr->mr.mapsz = m;
+
+ /*
+ * ib_uverbs_reg_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &mr->mr)) {
+ while (i)
+ kfree(mr->mr.map[--i]);
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+ mr->mr.user_base = region->user_base;
+ mr->mr.iova = region->virt_base;
+ mr->mr.length = region->length;
+ mr->mr.offset = region->offset;
+ mr->mr.access_flags = mr_access_flags;
+ mr->mr.max_segs = n;
+ m = 0;
+ n = 0;
+ list_for_each_entry(chunk, ®ion->chunk_list, list) {
+ for (i = 0; i < chunk->nmap; i++) {
+ mr->mr.map[m]->segs[n].vaddr =
+ page_address(chunk->page_list[i].page);
+ mr->mr.map[m]->segs[n].length = region->page_size;
+ if (++n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ return &mr->ibmr;
+}
+
+/*
+ * Note that this is called to free MRs created by
+ * ipath_get_dma_mr() or ipath_reg_user_mr().
+ */
+static int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+ struct ipath_mr *mr = to_imr(ibmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+ i = mr->mr.mapsz;
+ while (i)
+ kfree(mr->mr.map[--i]);
+ kfree(mr);
+ return 0;
+}
+
+static struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ipath_fmr *fmr;
+ int m, i;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ /* Allocate first level page tables. */
+ for (i = 0; i < m; i++) {
+ fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], GFP_KERNEL);
+ if (!fmr->mr.map[i]) {
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ fmr->mr.mapsz = m;
+
+ /* ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & rkey. */
+ if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) {
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ return ERR_PTR(-ENOMEM);
+ }
+ fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+ /* Resources are allocated but no valid mapping (RKEY can't be used). */
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ fmr->mr.offset = 0;
+ fmr->mr.access_flags = mr_access_flags;
+ fmr->mr.max_segs = fmr_attr->max_pages;
+ fmr->page_size = fmr_attr->page_size;
+ return &fmr->ibfmr;
+}
+
+/*
+ * This may be called from interrupt context.
+ * XXX Can we ever be called to map a portion of the RKEY space?
+ */
+static int ipath_map_phys_fmr(struct ib_fmr *ibfmr,
+ u64 * page_list, int list_len, u64 iova)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ struct ipath_lkey_table *rkt;
+ unsigned long flags;
+ int m, n, i;
+ u32 ps;
+
+ if (list_len > fmr->mr.max_segs)
+ return -EINVAL;
+ rkt = &to_idev(ibfmr->device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = iova;
+ fmr->mr.iova = iova;
+ ps = 1 << fmr->page_size;
+ fmr->mr.length = list_len * ps;
+ m = 0;
+ n = 0;
+ ps = 1 << fmr->page_size;
+ for (i = 0; i < list_len; i++) {
+ fmr->mr.map[m]->segs[n].vaddr = phys_to_virt(page_list[i]);
+ fmr->mr.map[m]->segs[n].length = ps;
+ if (++n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ return 0;
+}
+
+static int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ipath_fmr *fmr;
+
+ list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ }
+ return 0;
+}
+
+static int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+ i = fmr->mr.mapsz;
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ return 0;
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ int vendor, boardrev, majrev, minrev;
+
+ ipath_layer_query_device(dev->ib_unit, &vendor, &boardrev,
+ &majrev, &minrev);
+ return sprintf(buf, "%d.%d\n", majrev, minrev);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ int vendor, boardrev, majrev, minrev;
+
+ ipath_layer_query_device(dev->ib_unit, &vendor, &boardrev,
+ &majrev, &minrev);
+ ipath_get_boardname(dev->ib_unit, buf, 128);
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t show_board(struct class_device *cdev, char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ int vendor, boardrev, majrev, minrev;
+
+ ipath_layer_query_device(dev->ib_unit, &vendor, &boardrev,
+ &majrev, &minrev);
+ ipath_get_boardname(dev->ib_unit, buf, 128);
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t show_stats(struct class_device *cdev, char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+ char *p;
+ int i;
+
+ sprintf(buf,
+ "RC resends %d\n"
+ "RC QACKs %d\n"
+ "RC ACKs %d\n"
+ "RC SEQ NAKs %d\n"
+ "RC RDMA seq %d\n"
+ "RC RNR NAKs %d\n"
+ "RC OTH NAKs %d\n"
+ "RC timeouts %d\n"
+ "RC RDMA dup %d\n"
+ "piobuf wait %d\n"
+ "no piobuf %d\n"
+ "PKT drops %d\n"
+ "WQE errs %d\n",
+ dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+ dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+ dev->n_other_naks, dev->n_timeouts, dev->n_rdma_dup_busy,
+ dev->n_piowait, dev->n_no_piobuf, dev->n_pkt_drops,
+ dev->n_wqe_errs);
+ p = buf;
+ for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+ if (!dev->opstats[i].n_packets && !dev->opstats[i].n_bytes)
+ continue;
+ p += strlen(p);
+ sprintf(p, "%02x %llu/%llu\n",
+ i, dev->opstats[i].n_packets, dev->opstats[i].n_bytes);
+ }
+ return strlen(buf);
+}
+
+static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct class_device_attribute *ipath_class_attributes[] = {
+ &class_device_attr_hw_rev,
+ &class_device_attr_hca_type,
+ &class_device_attr_board_id,
+ &class_device_attr_stats
+};
+
+/*
+ * Allocate a ucontext.
+ */
+
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct ipath_ucontext *context;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ return &context->ibucontext;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(to_iucontext(context));
+ return 0;
+}
+
+/*
+ * Register our device with the infiniband core.
+ */
+static int ipath_register_ib_device(const ipath_type t)
+{
+ struct ipath_ibdev *idev;
+ struct ib_device *dev;
+ int i;
+ int ret;
+
+ idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+ if (idev == NULL)
+ return -ENOMEM;
+
+ dev = &idev->ibdev;
+
+ /* Only need to initialize non-zero fields. */
+ spin_lock_init(&idev->qp_table.lock);
+ spin_lock_init(&idev->lk_table.lock);
+ idev->sm_lid = IB_LID_PERMISSIVE;
+ idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000UL);
+ idev->qp_table.last = 1; /* QPN 0 and 1 are special. */
+ idev->qp_table.max = ib_ipath_qp_table_size;
+ idev->qp_table.nmaps = 1;
+ idev->qp_table.table = kmalloc(idev->qp_table.max *
+ sizeof(*idev->qp_table.table),
+ GFP_KERNEL);
+ if (idev->qp_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_qp;
+ }
+ memset(idev->qp_table.table, 0,
+ idev->qp_table.max * sizeof(*idev->qp_table.table));
+ for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+ atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+ idev->qp_table.map[i].page = NULL;
+ }
+ /*
+ * The top ib_ipath_lkey_table_size bits are used to index the table.
+ * The lower 8 bits can be owned by the user (copied from the LKEY).
+ * The remaining bits act as a generation number or tag.
+ */
+ idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+ idev->lk_table.table = kmalloc(idev->lk_table.max *
+ sizeof(*idev->lk_table.table),
+ GFP_KERNEL);
+ if (idev->lk_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_lk;
+ }
+ memset(idev->lk_table.table, 0,
+ idev->lk_table.max * sizeof(*idev->lk_table.table));
+ spin_lock_init(&idev->pending_lock);
+ INIT_LIST_HEAD(&idev->pending[0]);
+ INIT_LIST_HEAD(&idev->pending[1]);
+ INIT_LIST_HEAD(&idev->pending[2]);
+ INIT_LIST_HEAD(&idev->piowait);
+ INIT_LIST_HEAD(&idev->rnrwait);
+ idev->pending_index = 0;
+ idev->port_cap_flags =
+ IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+ idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ idev->pma_counter_select[5] = IB_PMA_PORT_XMIT_WAIT;
+
+ /*
+ * The system image GUI is supposed to be the same for all
+ * IB HCAs in a single system.
+ * Note that this code assumes device zero is found first.
+ */
+ idev->sys_image_guid =
+ t ? ipath_devices[t]->sys_image_guid : ipath_layer_get_guid(t);
+ idev->ib_unit = t;
+
+ strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+ dev->node_guid = ipath_layer_get_guid(t);
+ dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+ dev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ dev->node_type = IB_NODE_CA;
+ dev->phys_port_cnt = 1;
+ dev->dma_device = ipath_layer_get_pcidev(t);
+ dev->class_dev.dev = dev->dma_device;
+ dev->query_device = ipath_query_device;
+ dev->modify_device = ipath_modify_device;
+ dev->query_port = ipath_query_port;
+ dev->modify_port = ipath_modify_port;
+ dev->query_pkey = ipath_query_pkey;
+ dev->query_gid = ipath_query_gid;
+ dev->alloc_ucontext = ipath_alloc_ucontext;
+ dev->dealloc_ucontext = ipath_dealloc_ucontext;
+ dev->alloc_pd = ipath_alloc_pd;
+ dev->dealloc_pd = ipath_dealloc_pd;
+ dev->create_ah = ipath_create_ah;
+ dev->destroy_ah = ipath_destroy_ah;
+ dev->create_srq = ipath_create_srq;
+ dev->modify_srq = ipath_modify_srq;
+ dev->destroy_srq = ipath_destroy_srq;
+ dev->create_qp = ipath_create_qp;
+ dev->modify_qp = ipath_modify_qp;
+ dev->destroy_qp = ipath_destroy_qp;
+ dev->post_send = ipath_post_send;
+ dev->post_recv = ipath_post_receive;
+ dev->post_srq_recv = ipath_post_srq_receive;
+ dev->create_cq = ipath_create_cq;
+ dev->destroy_cq = ipath_destroy_cq;
+ dev->poll_cq = ipath_poll_cq;
+ dev->req_notify_cq = ipath_req_notify_cq;
+ dev->get_dma_mr = ipath_get_dma_mr;
+ dev->reg_phys_mr = ipath_reg_phys_mr;
+ dev->reg_user_mr = ipath_reg_user_mr;
+ dev->dereg_mr = ipath_dereg_mr;
+ dev->alloc_fmr = ipath_alloc_fmr;
+ dev->map_phys_fmr = ipath_map_phys_fmr;
+ dev->unmap_fmr = ipath_unmap_fmr;
+ dev->dealloc_fmr = ipath_dealloc_fmr;
+ dev->attach_mcast = ipath_multicast_attach;
+ dev->detach_mcast = ipath_multicast_detach;
+ dev->process_mad = ipath_process_mad;
+
+ ret = ib_register_device(dev);
+ if (ret)
+ goto err_reg;
+
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+ ret = class_device_create_file(&dev->class_dev,
+ ipath_class_attributes[i]);
+ if (ret)
+ goto err_class;
+ }
+
+ ipath_layer_enable_timer(t);
+
+ ipath_devices[t] = idev;
+ return 0;
+
+err_class:
+ ib_unregister_device(dev);
+err_reg:
+ kfree(idev->lk_table.table);
+err_lk:
+ kfree(idev->qp_table.table);
+err_qp:
+ ib_dealloc_device(dev);
+ return ret;
+}
+
+static void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+ struct ib_device *ibdev = &dev->ibdev;
+
+ ipath_layer_disable_timer(dev->ib_unit);
+
+ ib_unregister_device(ibdev);
+
+ if (!list_empty(&dev->pending[0]) || !list_empty(&dev->pending[1]) ||
+ !list_empty(&dev->pending[2]))
+ _VERBS_ERROR("ipath%d pending list not empty!\n", dev->ib_unit);
+ if (!list_empty(&dev->piowait))
+ _VERBS_ERROR("ipath%d piowait list not empty!\n", dev->ib_unit);
+ if (!list_empty(&dev->rnrwait))
+ _VERBS_ERROR("ipath%d rnrwait list not empty!\n", dev->ib_unit);
+ if (mcast_tree.rb_node != NULL)
+ _VERBS_ERROR("ipath%d multicast table memory leak!\n",
+ dev->ib_unit);
+ /*
+ * Note that ipath_unregister_ib_device() can be called before all
+ * the QPs are destroyed!
+ */
+ ipath_free_all_qps(&dev->qp_table);
+ kfree(dev->qp_table.table);
+ kfree(dev->lk_table.table);
+ ib_dealloc_device(ibdev);
+}
+
+int __init ipath_verbs_init(void)
+{
+ int i;
+
+ number_of_devices = ipath_layer_get_num_of_dev();
+ i = number_of_devices * sizeof(struct ipath_ibdev *);
+ ipath_devices = kmalloc(i, GFP_ATOMIC);
+ if (ipath_devices == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < number_of_devices; i++) {
+ int ret = ipath_verbs_register(i, ipath_ib_piobufavail,
+ ipath_ib_rcv, ipath_ib_timer);
+
+ if (ret == 0)
+ ipath_devices[i] = NULL;
+ else if ((ret = ipath_register_ib_device(i)) != 0) {
+ _VERBS_ERROR("ib_ipath%d cannot register ib device "
+ "(%d)!\n", i, ret);
+ ipath_verbs_unregister(i);
+ ipath_devices[i] = NULL;
+ }
+ }
+
+ return 0;
+}
+
+void __exit ipath_verbs_cleanup(void)
+{
+ int i;
+
+ for (i = 0; i < number_of_devices; i++)
+ if (ipath_devices[i]) {
+ ipath_unregister_ib_device(ipath_devices[i]);
+ ipath_verbs_unregister(i);
+ }
+
+ kfree(ipath_devices);
+}
+
+module_init(ipath_verbs_init);
+module_exit(ipath_verbs_cleanup);
--
0.99.9n
More information about the general
mailing list