[openib-general] [PATCH 15 of 20] ipath - infiniband verbs support, part 1 of 3
Bryan O'Sullivan
bos at pathscale.com
Wed Dec 28 16:31:34 PST 2005
Signed-off-by: Bryan O'Sullivan <bos at pathscale.com>
diff -r 26993cb5faee -r 471b7a7a005c drivers/infiniband/hw/ipath/ipath_verbs.c
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c Wed Dec 28 14:19:43 2005 -0800
@@ -0,0 +1,2307 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Patent licenses, if any, provided herein do not apply to
+ * combinations of this program with other software, or any other
+ * product whatsoever.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/pci.h>
+#include <linux/err.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <asm/uaccess.h>
+#include <asm-generic/bug.h>
+
+#include "ips_common.h"
+#include "ipath_layer.h"
+#include "ipath_verbs.h"
+
+/*
+ * Compare the lower 24 bits of the two values (e.g. PSNs).
+ * The left shift moves bit 23 of the difference into the sign bit, so the
+ * result is less than, equal to, or greater than zero according to the
+ * signed 24-bit difference.
+ */
+static inline int cmp24(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << 8;
+}
+
+#define MODNAME "ib_ipath"
+#define DRIVER_LOAD_MSG "PathScale " MODNAME " loaded: "
+#define PFX MODNAME ": "
+
+
+#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off) (((map) - (qpt)->map)*BITS_PER_PAGE + (off))
+#define find_next_offset(map, off) \
+ find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+
+/* Not static, because we don't want the compiler removing it */
+const char ipath_verbs_version[] = "ipath_verbs " IPATH_IDSTR;
+
+unsigned int ib_ipath_qp_table_size = 251;
+module_param(ib_ipath_qp_table_size, uint, 0444);
+MODULE_PARM_DESC(ib_ipath_qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param(ib_ipath_lkey_table_size, uint, 0444);
+MODULE_PARM_DESC(ib_ipath_lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+unsigned int ib_ipath_debug; /* debug mask */
+module_param(ib_ipath_debug, uint, 0644);
+MODULE_PARM_DESC(ib_ipath_debug, "Verbs debug mask");
+
+
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss,
+ u32 len, struct ib_send_wr *wr, struct ib_wc *wc);
+static void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc);
+static int ipath_destroy_qp(struct ib_qp *ibqp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("PathScale <infinipath-support at pathscale.com>");
+MODULE_DESCRIPTION("PathScale InfiniPath driver");
+
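+/*
+ * Points at which RC packets (sends, RDMA writes, RDMA read responses,
+ * ACKs) can be dropped for fault injection (First/Middle/Last/Only).
+ */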
+enum {
+ IPATH_FAULT_RC_DROP_SEND_F = 1,
+ IPATH_FAULT_RC_DROP_SEND_M,
+ IPATH_FAULT_RC_DROP_SEND_L,
+ IPATH_FAULT_RC_DROP_SEND_O,
+ IPATH_FAULT_RC_DROP_RDMA_WRITE_F,
+ IPATH_FAULT_RC_DROP_RDMA_WRITE_M,
+ IPATH_FAULT_RC_DROP_RDMA_WRITE_L,
+ IPATH_FAULT_RC_DROP_RDMA_WRITE_O,
+ IPATH_FAULT_RC_DROP_RDMA_READ_RESP_F,
+ IPATH_FAULT_RC_DROP_RDMA_READ_RESP_M,
+ IPATH_FAULT_RC_DROP_RDMA_READ_RESP_L,
+ IPATH_FAULT_RC_DROP_RDMA_READ_RESP_O,
+ IPATH_FAULT_RC_DROP_ACK,
+};
+
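+/*
+ * QP state transitions, used as the .trans values in qp_state_table below.
+ */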
+enum {
+ IPATH_TRANS_INVALID = 0,
+ IPATH_TRANS_ANY2RST,
+ IPATH_TRANS_RST2INIT,
+ IPATH_TRANS_INIT2INIT,
+ IPATH_TRANS_INIT2RTR,
+ IPATH_TRANS_RTR2RTS,
+ IPATH_TRANS_RTS2RTS,
+ IPATH_TRANS_SQERR2RTS,
+ IPATH_TRANS_ANY2ERR,
+ IPATH_TRANS_RTS2SQD, /* XXX Wait for expected ACKs & signal event */
+ IPATH_TRANS_SQD2SQD, /* error if not drained & parameter change */
+ IPATH_TRANS_SQD2RTS, /* error if not drained */
+};
+
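+/* Flag bits describing which send/receive operations a QP may perform. */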
+enum {
+ IPATH_POST_SEND_OK = 0x0001,
+ IPATH_POST_RECV_OK = 0x0002,
+ IPATH_PROCESS_RECV_OK = 0x0004,
+ IPATH_PROCESS_SEND_OK = 0x0008,
+};
+
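+/* Permitted operations for each QP state. */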
+static int state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = IPATH_POST_RECV_OK,
+ [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+ [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK,
+ [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+ [IB_QPS_ERR] = 0,
+};
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+static u32 rnr_table[32] = {
+ 656, /* 0 */
+ 1, /* 1 */
+ 1, /* 2 */
+ 1, /* 3 */
+ 1, /* 4 */
+ 1, /* 5 */
+ 1, /* 6 */
+ 1, /* 7 */
+ 1, /* 8 */
+ 1, /* 9 */
+ 1, /* A */
+ 1, /* B */
+ 1, /* C */
+ 1, /* D */
+ 2, /* E */
+ 2, /* F */
+ 3, /* 10 */
+ 4, /* 11 */
+ 6, /* 12 */
+ 8, /* 13 */
+ 11, /* 14 */
+ 16, /* 15 */
+ 21, /* 16 */
+ 31, /* 17 */
+ 41, /* 18 */
+ 62, /* 19 */
+ 82, /* 1A */
+ 123, /* 1B */
+ 164, /* 1C */
+ 246, /* 1D */
+ 328, /* 1E */
+ 492 /* 1F */
+};
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+static enum ib_wc_opcode wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Array of device pointers.
+ */
+static uint32_t number_of_devices;
+static struct ipath_ibdev **ipath_devices;
+
+/*
+ * Global table mapping multicast GIDs to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static spinlock_t mcast_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Allocate a structure to link a QP to the multicast GID structure.
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+ struct ipath_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+ if (!mqp)
+ return NULL;
+
+ mqp->qp = qp;
+ atomic_inc(&qp->refcount);
+
+ return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+ struct ipath_qp *qp = mqp->qp;
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+
+ kfree(mqp);
+}
+
+/*
+ * Allocate a structure for the multicast GID.
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+ struct ipath_mcast *mcast;
+
+ mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
+ if (!mcast)
+ return NULL;
+
+ mcast->mgid = *mgid;
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+
+ return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+ struct ipath_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ ipath_mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/*
+ * Search the global table for the given multicast GID.
+ * Return it or NULL if not found.
+ * The caller is responsible for decrementing the reference count if found.
+ */
+static struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mcast_lock, flags);
+ n = mcast_tree.rb_node;
+ while (n) {
+ struct ipath_mcast *mcast;
+ int ret;
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mgid.raw, sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else {
+ atomic_inc(&mcast->refcount);
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ return mcast;
+ }
+ }
+ spin_unlock_irqrestore(&mcast_lock, flags);
+
+ return NULL;
+}
+
+/*
+ * Insert the multicast GID into the table and
+ * attach the QP structure.
+ * Return zero if both were added.
+ * Return EEXIST if the GID was already in the table but the QP was added.
+ * Return ESRCH if the QP was already attached and neither structure was added.
+ */
+static int ipath_mcast_add(struct ipath_mcast *mcast,
+ struct ipath_mcast_qp *mqp)
+{
+ struct rb_node **n = &mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mcast_lock, flags);
+
+ while (*n) {
+ struct ipath_mcast *tmcast;
+ struct ipath_mcast_qp *p;
+ int ret;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ return ESRCH;
+ }
+ }
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ return EEXIST;
+ }
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+ spin_unlock_irqrestore(&mcast_lock, flags);
+
+ return 0;
+}
+
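+/*
+ * Attach a QP to the multicast group with the given GID; the group entry
+ * is allocated if this is the first QP to attach to it.
+ */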
+static int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid,
+ u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *mqp;
+
+ /*
+ * Allocate the data structures now since it's better to do this
+ * outside of spin locks and they will most likely be needed anyway.
+ */
+ mcast = ipath_mcast_alloc(gid);
+ if (mcast == NULL)
+ return -ENOMEM;
+ mqp = ipath_mcast_qp_alloc(qp);
+ if (mqp == NULL) {
+ ipath_mcast_free(mcast);
+ return -ENOMEM;
+ }
+ switch (ipath_mcast_add(mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: can't attach the same QP twice. */
+ ipath_mcast_qp_free(mqp);
+ ipath_mcast_free(mcast);
+ return -EINVAL;
+ case EEXIST: /* The mcast wasn't used */
+ ipath_mcast_free(mcast);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
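+/*
+ * Detach a QP from the multicast group; the group entry is freed once the
+ * last QP has detached and any concurrent list walkers have finished.
+ */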
+static int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid,
+ u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_mcast *mcast = NULL;
+ struct ipath_mcast_qp *p, *tmp;
+ struct rb_node *n;
+ unsigned long flags;
+ int last = 0;
+
+ spin_lock_irqsave(&mcast_lock, flags);
+
+ /* Find the GID in the mcast table. */
+ n = mcast_tree.rb_node;
+ while (1) {
+ int ret;
+
+ if (n == NULL) {
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ return 0;
+ }
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mgid.raw, sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward link
+ * until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irqrestore(&mcast_lock, flags);
+
+ if (p) {
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ ipath_mcast_qp_free(p);
+ }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ ipath_mcast_free(mcast);
+ }
+
+ return 0;
+}
+
+/*
+ * Copy data to SGE memory.
+ */
+static void copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ BUG_ON(len == 0);
+ if (len > length)
+ len = length;
+ memcpy(sge->vaddr, data, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/*
+ * Skip over length bytes of SGE memory.
+ */
+static void skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length > sge->sge_length) {
+ length -= sge->sge_length;
+ ss->sge = *ss->sg_list++;
+ }
+ while (length) {
+ u32 len = sge->length;
+
+ BUG_ON(len == 0);
+ if (len > length)
+ len = length;
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
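+/*
+ * Allocate the next available QPN from the bitmap, starting the search
+ * after the last QPN allocated (modeled on alloc_pidmap()).
+ * Returns zero if no QPN is available.
+ */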
+static inline u32 alloc_qpn(struct ipath_qp_table *qpt)
+{
+ u32 i, offset, max_scan, qpn;
+ struct qpn_map *map;
+
+ qpn = qpt->last + 1;
+ if (qpn >= QPN_MAX)
+ qpn = 2;
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+ unsigned long flags;
+
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock_irqsave(&qpt->lock, flags);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ if (unlikely(!map->page))
+ break;
+ }
+ if (likely(atomic_read(&map->n_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->n_free);
+ qpt->last = qpn;
+ return qpn;
+ }
+ offset = find_next_offset(map, offset);
+ qpn = mk_qpn(qpt, map, offset);
+ /*
+ * This test differs from alloc_pidmap().
+ * If find_next_offset() does find a zero bit,
+ * we don't need to check for QPN wrapping
+ * around past our starting QPN. We
+ * just need to be sure we don't loop forever.
+ */
+ } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+ }
+ /*
+ * In order to keep the number of pages allocated to a minimum,
+ * we scan all the existing pages before increasing the size
+ * of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ offset = 0;
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &qpt->map[0];
+ offset = 2;
+ }
+ qpn = mk_qpn(qpt, map, offset);
+ }
+ return 0;
+}
+
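+/* Mark the given QPN free in the bitmap. */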
+static inline void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ struct qpn_map *map;
+
+ map = qpt->map + qpn / BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+ atomic_inc(&map->n_free);
+}
+
+/*
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+ enum ib_qp_type type)
+{
+ unsigned long flags;
+ u32 qpn;
+
+ if (type == IB_QPT_SMI)
+ qpn = 0;
+ else if (type == IB_QPT_GSI)
+ qpn = 1;
+ else {
+ /* Allocate the next available QPN */
+ qpn = alloc_qpn(qpt);
+ if (qpn == 0) {
+ return -ENOMEM;
+ }
+ }
+ qp->ibqp.qp_num = qpn;
+
+ /* Add the QP to the hash table. */
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ qpn %= qpt->max;
+ qp->next = qpt->table[qpn];
+ qpt->table[qpn] = qp;
+ atomic_inc(&qp->refcount);
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ return 0;
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+ struct ipath_qp *q, **qpp;
+ unsigned long flags;
+ int fnd = 0;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ /* Remove QP from the hash table. */
+ qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+ for (; (q = *qpp) != NULL; qpp = &q->next) {
+ if (q == qp) {
+ *qpp = qp->next;
+ qp->next = NULL;
+ atomic_dec(&qp->refcount);
+ fnd = 1;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+
+ if (!fnd)
+ return;
+
+ /* If QPN is not reserved, mark QPN free in the bitmap. */
+ if (qp->ibqp.qp_num > 1)
+ free_qpn(qpt, qp->ibqp.qp_num);
+
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Remove all QPs from the table.
+ */
+static void ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+ unsigned long flags;
+ struct ipath_qp *qp, *nqp;
+ u32 n;
+
+ for (n = 0; n < qpt->max; n++) {
+ spin_lock_irqsave(&qpt->lock, flags);
+ qp = qpt->table[n];
+ qpt->table[n] = NULL;
+ spin_unlock_irqrestore(&qpt->lock, flags);
+
+ while (qp) {
+ nqp = qp->next;
+ if (qp->ibqp.qp_num > 1)
+ free_qpn(qpt, qp->ibqp.qp_num);
+ if (!atomic_dec_and_test(&qp->refcount) ||
+ !ipath_destroy_qp(&qp->ibqp))
+ _VERBS_INFO("QP memory leak!\n");
+ qp = nqp;
+ }
+ }
+
+ for (n = 0; n < ARRAY_SIZE(qpt->map); n++) {
+ if (qpt->map[n].page)
+ free_page((unsigned long)qpt->map[n].page);
+ }
+}
+
+/*
+ * Return the QP with the given QPN.
+ * The caller is responsible for decrementing the QP reference count when done.
+ */
+static struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ unsigned long flags;
+ struct ipath_qp *qp;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+ if (qp->ibqp.qp_num == qpn) {
+ atomic_inc(&qp->refcount);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ return qp;
+}
+
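+/*
+ * Allocate an LKEY for the memory region by finding a free slot in the
+ * table; a generation counter is mixed into the key so that stale keys
+ * can be detected. Returns 1 on success, 0 if the table is full.
+ */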
+static int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+ struct ipath_mregion *mr)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* Find the next available LKEY */
+ r = n = rkt->next;
+ for (;;) {
+ if (rkt->table[r] == NULL)
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n) {
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ _VERBS_INFO("LKEY table full\n");
+ return 0;
+ }
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero, which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
+ mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+ ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ rkt->table[r] = mr;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+
+ return 1;
+}
+
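+/* Free the LKEY table entry. LKEY 0 is reserved and never freed. */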
+static void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+ unsigned long flags;
+ u32 r;
+
+ if (lkey == 0)
+ return;
+ r = lkey >> (32 - ib_ipath_lkey_table_size);
+ spin_lock_irqsave(&rkt->lock, flags);
+ rkt->table[r] = NULL;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/*
+ * Check the IB SGE for validity and initialize our internal version of it.
+ * Return 1 if OK, else zero.
+ */
+static int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
+ struct ib_sge *sge, int acc)
+{
+ struct ipath_mregion *mr;
+ size_t off;
+
+ /*
+ * We use LKEY == zero to mean a physical kmalloc() address.
+ * This is a bit of a hack since we rely on dma_map_single()
+ * being reversible by calling bus_to_virt().
+ */
+ if (sge->lkey == 0) {
+ isge->mr = NULL;
+ isge->vaddr = bus_to_virt(sge->addr);
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ return 1;
+ }
+ spin_lock(&rkt->lock);
+ mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+ spin_unlock(&rkt->lock);
+ if (unlikely(mr == NULL || mr->lkey != sge->lkey))
+ return 0;
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc))
+ return 0;
+
+ off += mr->offset;
+ isge->mr = mr;
+ isge->m = 0;
+ isge->n = 0;
+ while (off >= mr->map[isge->m]->segs[isge->n].length) {
+ off -= mr->map[isge->m]->segs[isge->n].length;
+ if (++isge->n >= IPATH_SEGSZ) {
+ isge->m++;
+ isge->n = 0;
+ }
+ }
+ isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off;
+ isge->length = mr->map[isge->m]->segs[isge->n].length - off;
+ isge->sge_length = sge->length;
+ return 1;
+}
+
+/*
+ * Initialize the qp->s_sge after a restart.
+ * The QP s_lock should be held.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+ struct ipath_ibdev *dev;
+ u32 len;
+
+ len = ((qp->s_psn - wqe->psn) & 0xFFFFFF) *
+ ib_mtu_enum_to_int(qp->path_mtu);
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ skip_sge(&qp->s_sge, len);
+ qp->s_len = wqe->length - len;
+ dev = to_idev(qp->ibqp.device);
+ spin_lock(&dev->pending_lock);
+ if (qp->timerwait.next == LIST_POISON1)
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+}
+
+/*
+ * Check the IB virtual address, length, and RKEY.
+ * Return 1 if OK, else zero.
+ * The QP r_rq.lock should be held.
+ */
+static int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct ipath_lkey_table *rkt = &dev->lk_table;
+ struct ipath_sge *sge = &ss->sge;
+ struct ipath_mregion *mr;
+ size_t off;
+
+ spin_lock(&rkt->lock);
+ mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+ spin_unlock(&rkt->lock);
+ if (unlikely(mr == NULL || mr->lkey != rkey))
+ return 0;
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0))
+ return 0;
+
+ off += mr->offset;
+ sge->mr = mr;
+ sge->m = 0;
+ sge->n = 0;
+ while (off >= mr->map[sge->m]->segs[sge->n].length) {
+ off -= mr->map[sge->m]->segs[sge->n].length;
+ if (++sge->n >= IPATH_SEGSZ) {
+ sge->m++;
+ sge->n = 0;
+ }
+ }
+ sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off;
+ sge->length = mr->map[sge->m]->segs[sge->n].length - off;
+ sge->sge_length = len;
+ ss->sg_list = NULL;
+ ss->num_sge = 1;
+ return 1;
+}
+
+/*
+ * Add a new entry to the completion queue.
+ * This may be called with either the qp->s_lock or the qp->r_rq.lock held.
+ */
+static void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig)
+{
+ unsigned long flags;
+ u32 next;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ cq->queue[cq->head] = *entry;
+ next = cq->head + 1;
+ if (next == cq->ibcq.cqe)
+ next = 0;
+ if (likely(next != cq->tail))
+ cq->head = next;
+ else {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return;
+ }
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED && sig)) {
+ cq->notify = IB_CQ_NONE;
+ cq->triggered++;
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ tasklet_schedule(&cq->comptask);
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (entry->status != IB_WC_SUCCESS)
+ to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+static void send_complete(unsigned long data)
+{
+ struct ipath_cq *cq = (struct ipath_cq *)data;
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, tasklet_schedule()
+ * won't call us again until we return so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/*
+ * This is the QP state transition table.
+ * See ipath_modify_qp() for details.
+ */
+static const struct {
+ int trans;
+ u32 req_param[IB_QPT_RAW_IPV6];
+ u32 opt_param[IB_QPT_RAW_IPV6];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_INIT] = {
+ .trans = IPATH_TRANS_RST2INIT,
+ .req_param = {
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ },
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_INIT] = {
+ .trans = IPATH_TRANS_INIT2INIT,
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .trans = IPATH_TRANS_INIT2RTR,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = IPATH_TRANS_RTR2RTS,
+ .req_param = {
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ },
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = IPATH_TRANS_RTS2RTS,
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .trans = IPATH_TRANS_RTS2SQD,
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = IPATH_TRANS_SQD2RTS,
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .trans = IPATH_TRANS_SQD2SQD,
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = IPATH_TRANS_SQERR2RTS,
+ .opt_param = {
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
+ [IB_QPT_UC] = IB_QP_CUR_STATE,
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR }
+ }
+};
+
+/*
+ * Initialize the QP state to the reset state.
+ */
+static void ipath_reset_qp(struct ipath_qp *qp)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ qp->s_hdrwords = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ atomic_set(&qp->msn, 0);
+ if (qp->ibqp.qp_type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->s_nak_state = 0;
+ qp->s_rnr_timeout = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ qp->r_rq.head = 0;
+ qp->r_rq.tail = 0;
+ qp->r_reuse_sge = 0;
+}
+
+/*
+ * Flush send work queue.
+ * The QP s_lock should be held.
+ */
+static void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+
+ _VERBS_INFO("Send queue error on QP%d/%d: err: %d\n",
+ qp->ibqp.qp_num, qp->remote_qpn, wc->status);
+
+ spin_lock(&dev->pending_lock);
+ /* XXX What if it's already removed by the timeout code? */
+ if (qp->timerwait.next != LIST_POISON1)
+ list_del(&qp->timerwait);
+ if (qp->piowait.next != LIST_POISON1)
+ list_del(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+
+ wc->status = IB_WC_WR_FLUSH_ERR;
+
+ while (qp->s_last != qp->s_head) {
+ wc->wr_id = wqe->wr.wr_id;
+ wc->opcode = wc_opcode[wqe->wr.opcode];
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ }
+ qp->s_cur = qp->s_tail = qp->s_head;
+ qp->state = IB_QPS_SQE;
+}
+
+/*
+ * Flush both send and receive work queues.
+ * QP r_rq.lock and s_lock should be held.
+ */
+static void ipath_error_qp(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+
+ _VERBS_INFO("QP%d/%d in error state\n",
+ qp->ibqp.qp_num, qp->remote_qpn);
+
+ spin_lock(&dev->pending_lock);
+ /* XXX What if it's already removed by the timeout code? */
+ if (qp->timerwait.next != LIST_POISON1)
+ list_del(&qp->timerwait);
+ if (qp->piowait.next != LIST_POISON1)
+ list_del(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ wc.status = IB_WC_WR_FLUSH_ERR;
+ wc.vendor_err = 0;
+ wc.byte_len = 0;
+ wc.imm_data = 0;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = 0;
+ wc.wc_flags = 0;
+ wc.pkey_index = 0;
+ wc.slid = 0;
+ wc.sl = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+
+ while (qp->s_last != qp->s_head) {
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+
+ wc.wr_id = wqe->wr.wr_id;
+ wc.opcode = wc_opcode[wqe->wr.opcode];
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
+ }
+ qp->s_cur = qp->s_tail = qp->s_head;
+ qp->s_hdrwords = 0;
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+
+ wc.opcode = IB_WC_RECV;
+ while (qp->r_rq.tail != qp->r_rq.head) {
+ wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id;
+ if (++qp->r_rq.tail >= qp->r_rq.size)
+ qp->r_rq.tail = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+}
+
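+/*
+ * Modify the attributes of a QP. The requested state transition and
+ * attribute mask are validated against qp_state_table before the
+ * attributes are applied.
+ */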
+static int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ u32 req_param, opt_param;
+ unsigned long flags;
+
+ if (attr_mask & IB_QP_CUR_STATE) {
+ cur_state = attr->cur_qp_state;
+ if (cur_state != IB_QPS_RTR &&
+ cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return -EINVAL;
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ spin_lock(&qp->s_lock);
+ } else {
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ spin_lock(&qp->s_lock);
+ cur_state = qp->state;
+ }
+
+ if (attr_mask & IB_QP_STATE) {
+ new_state = attr->qp_state;
+ if (new_state < 0 || new_state > IB_QPS_ERR)
+ goto inval;
+ } else
+ new_state = cur_state;
+
+ switch (qp_state_table[cur_state][new_state].trans) {
+ case IPATH_TRANS_INVALID:
+ goto inval;
+
+ case IPATH_TRANS_ANY2RST:
+ ipath_reset_qp(qp);
+ break;
+
+ case IPATH_TRANS_ANY2ERR:
+ ipath_error_qp(qp);
+ break;
+
+ }
+
+ req_param =
+ qp_state_table[cur_state][new_state].req_param[qp->ibqp.qp_type];
+ opt_param =
+ qp_state_table[cur_state][new_state].opt_param[qp->ibqp.qp_type];
+
+ if ((req_param & attr_mask) != req_param)
+ goto inval;
+
+ if (attr_mask & ~(req_param | opt_param | IB_QP_STATE))
+ goto inval;
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+ if (attr->pkey_index >= ipath_layer_get_npkeys(dev->ib_unit))
+ goto inval;
+ qp->s_pkey_index = attr->pkey_index;
+ }
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_next_psn = attr->sq_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV)
+ qp->remote_ah_attr = attr->ah_attr;
+
+ if (attr_mask & IB_QP_PATH_MTU)
+ qp->path_mtu = attr->path_mtu;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry = attr->rnr_retry;
+ if (qp->s_rnr_retry > 7)
+ qp->s_rnr_retry = 7;
+ qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->s_min_rnr_timer = attr->min_rnr_timer & 0x1F;
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ qp->state = new_state;
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+ /*
+ * Try to move to ARMED if QP1 changed to the RTS state.
+ */
+ if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) {
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+ /*
+ * Bounce the link even if it was active so the SM will
+ * reinitialize the SMA's state.
+ */
+ ipath_kset_linkstate((dev->ib_unit << 16) | IPATH_IB_LINKDOWN);
+ ipath_kset_linkstate((dev->ib_unit << 16) | IPATH_IB_LINKARM);
+ }
+ return 0;
+
+inval:
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ return -EINVAL;
+}
+
+/*
+ * Compute the AETH (syndrome + MSN).
+ * The QP s_lock should be held.
+ */
+static u32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+ u32 aeth = atomic_read(&qp->msn) & 0xFFFFFF;
+
+ if (qp->s_nak_state) {
+ aeth |= qp->s_nak_state << 24;
+ } else if (qp->ibqp.srq) {
+ /* Shared receive queues don't generate credits. */
+ aeth |= 0x1F << 24;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+
+ /*
+ * Compute the number of credits available (RWQEs).
+ * XXX Not holding the r_rq.lock here, so there is a small
+ * chance that the pair of reads is not atomic.
+ */
+ credits = qp->r_rq.head - qp->r_rq.tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /* Binary search the credit table to find the code to use. */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits)
+ max = x;
+ else if (min == x)
+ break;
+ else
+ min = x;
+ }
+ aeth |= x << 24;
+ }
+ return cpu_to_be32(aeth);
+}
+
+
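+/*
+ * No PIO buffers are available; put the QP on the list of QPs waiting
+ * for a buffer and ask the layer driver to notify us when one frees up.
+ */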
+static void no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (qp->piowait.next == LIST_POISON1)
+ list_add_tail(&qp->piowait, &dev->piowait);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ /*
+ * Note that as soon as ipath_layer_want_buffer() is called and
+ * possibly before it returns, ipath_ib_piobufavail()
+ * could be called. If we are still in the tasklet function,
+ * tasklet_schedule() will not call us until the next time
+ * tasklet_schedule() is called.
+ * We clear the tasklet flag now since we are committing to return
+ * from the tasklet function.
+ */
+ tasklet_unlock(&qp->s_task);
+ ipath_layer_want_buffer(dev->ib_unit);
+ dev->n_piowait++;
+}
+
+/*
+ * Process entries in the send work queue until the queue is exhausted.
+ * Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, after we drop the QP lock, two threads could send
+ * packets out of order.
+ * This is similar to do_rc_send() below except we don't have timeouts or
+ * resends.
+ */
+static void do_uc_send(unsigned long data)
+{
+ struct ipath_qp *qp = (struct ipath_qp *)data;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_swqe *wqe;
+ unsigned long flags;
+ u16 lrh0;
+ u32 hwords;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u32 bth2;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ u32 len;
+ struct ipath_other_headers *ohdr;
+ struct ib_wc wc;
+
+ if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
+ return;
+
+ if (unlikely(qp->remote_ah_attr.dlid ==
+ ipath_layer_get_lid(dev->ib_unit))) {
+ /* Pass in an uninitialized ib_wc to save stack space. */
+ ipath_ruc_loopback(qp, &wc);
+ clear_bit(IPATH_S_BUSY, &qp->s_flags);
+ return;
+ }
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+again:
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If no PIO bufs are available, return.
+ * An interrupt will call ipath_ib_piobufavail()
+ * when one is available.
+ */
+ if (ipath_verbs_send(dev->ib_unit, qp->s_hdrwords,
+ (uint32_t *) &qp->s_hdr,
+ qp->s_cur_size, qp->s_cur_sge)) {
+ no_bufs_available(qp, dev);
+ return;
+ }
+ dev->n_unicast_xmit++;
+ /* Record that we sent the packet and s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+
+ lrh0 = IPS_LRH_BTH;
+ /* Header size in 32-bit words: LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ /*
+ * The lock is needed to synchronize between
+ * setting qp->s_ack_state and post_send().
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
+ goto done;
+
+ bth0 = ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ switch (qp->s_state) {
+ default:
+ /* Signal the completion of the last send (if there is one). */
+ if (qp->s_last != qp->s_tail) {
+ if (++qp->s_last == qp->s_size)
+ qp->s_last = 0;
+ if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = wc_opcode[wqe->wr.opcode];
+ wc.vendor_err = 0;
+ wc.byte_len = wqe->length;
+ wc.qp_num = qp->ibqp.qp_num;
+ wc.src_qp = qp->remote_qpn;
+ wc.pkey_index = 0;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+ 0);
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ }
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head)
+ goto done;
+ /*
+ * Start a new request.
+ */
+ qp->s_psn = wqe->psn = qp->s_next_psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = IB_OPCODE_UC_SEND_FIRST;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = IB_OPCODE_UC_SEND_ONLY;
+ } else {
+ qp->s_state =
+ IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = IB_OPCODE_UC_RDMA_WRITE_FIRST;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state = IB_OPCODE_UC_RDMA_WRITE_ONLY;
+ } else {
+ qp->s_state =
+ IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ break;
+
+ default:
+ goto done;
+ }
+ if (++qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ break;
+
+ case IB_OPCODE_UC_SEND_FIRST:
+ qp->s_state = IB_OPCODE_UC_SEND_MIDDLE;
+ /* FALLTHROUGH */
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ break;
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ qp->s_state = IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ /* FALLTHROUGH */
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = IB_OPCODE_UC_RDMA_WRITE_LAST;
+ else {
+ qp->s_state =
+ IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ break;
+ }
+ bth2 = qp->s_next_psn++ & 0xFFFFFF;
+ qp->s_len -= len;
+ bth0 |= qp->s_state << 24;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Construct the header. */
+ extra_bytes = (4 - len) & 3;
+ nwords = (len + extra_bytes) >> 2;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ /* Header size in 32-bit words. */
+ hwords += 10;
+ lrh0 = IPS_LRH_GRH;
+ qp->s_hdr.u.l.grh.version_tclass_flow =
+ cpu_to_be32((6 << 28) |
+ (qp->remote_ah_attr.grh.traffic_class << 20) |
+ qp->remote_ah_attr.grh.flow_label);
+ qp->s_hdr.u.l.grh.paylen =
+ cpu_to_be16(((hwords - 12) + nwords + SIZE_OF_CRC) << 2);
+ qp->s_hdr.u.l.grh.next_hdr = 0x1B;
+ qp->s_hdr.u.l.grh.hop_limit = qp->remote_ah_attr.grh.hop_limit;
+ /* The SGID is 32-bit aligned. */
+ qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
+ qp->s_hdr.u.l.grh.sgid.global.interface_id =
+ ipath_layer_get_guid(dev->ib_unit);
+ qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
+ }
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ /* DEST LID */
+ qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->ib_unit));
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+
+ /* Check for more work to do. */
+ goto again;
+
+done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ clear_bit(IPATH_S_BUSY, &qp->s_flags);
+}
+
+/*
+ * Process entries in the send work queue until credit or queue is exhausted.
+ * Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, after we drop the QP s_lock, two threads could send
+ * packets out of order.
+ */
+static void do_rc_send(unsigned long data)
+{
+ struct ipath_qp *qp = (struct ipath_qp *)data;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_swqe *wqe;
+ struct ipath_sge_state *ss;
+ unsigned long flags;
+ u16 lrh0;
+ u32 hwords;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u32 bth2;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ u32 len;
+ struct ipath_other_headers *ohdr;
+ char newreq;
+
+ if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
+ return;
+
+ if (unlikely(qp->remote_ah_attr.dlid ==
+ ipath_layer_get_lid(dev->ib_unit))) {
+ struct ib_wc wc;
+
+ /*
+ * Pass in an uninitialized ib_wc to be consistent with
+ * other places where ipath_ruc_loopback() is called.
+ */
+ ipath_ruc_loopback(qp, &wc);
+ clear_bit(IPATH_S_BUSY, &qp->s_flags);
+ return;
+ }
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+again:
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If no PIO bufs are available, return.
+ * An interrupt will call ipath_ib_piobufavail()
+ * when one is available.
+ */
+ if (ipath_verbs_send(dev->ib_unit, qp->s_hdrwords,
+ (uint32_t *) &qp->s_hdr,
+ qp->s_cur_size, qp->s_cur_sge)) {
+ no_bufs_available(qp, dev);
+ return;
+ }
+ dev->n_unicast_xmit++;
+ /* Record that we sent the packet and s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+
+ lrh0 = IPS_LRH_BTH;
+ /* Header size in 32-bit words: LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ /*
+ * The lock is needed to synchronize between
+ * setting qp->s_ack_state, resend timer, and post_send().
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ bth0 = ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
+
+ /* Sending responses takes priority over sending requests. */
+ if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) {
+ /*
+ * Send a response.
+ * Note that we are in the responder's side of the QP context.
+ */
+ switch (qp->s_ack_state) {
+ case IB_OPCODE_RC_RDMA_READ_REQUEST:
+ ss = &qp->s_rdma_sge;
+ len = qp->s_rdma_len;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+ } else {
+ qp->s_ack_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+ }
+ qp->s_rdma_len -= len;
+ bth0 |= qp->s_ack_state << 24;
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ qp->s_ack_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ ss = &qp->s_rdma_sge;
+ len = qp->s_rdma_len;
+ if (len > pmtu) {
+ len = pmtu;
+ } else {
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state =
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+ }
+ qp->s_rdma_len -= len;
+ bth0 |= qp->s_ack_state << 24;
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+ /*
+ * We have to prevent new requests from changing
+ * the r_sge state while a call to ipath_verbs_send()
+ * is in progress.
+ * Changing r_state allows the receiver
+ * to continue processing new packets.
+ * We do it here now instead of above so
+ * that we are sure the packet was sent before
+ * changing the state.
+ */
+ qp->r_state = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ goto send_req;
+
+ case IB_OPCODE_RC_COMPARE_SWAP:
+ case IB_OPCODE_RC_FETCH_ADD:
+ ss = NULL;
+ len = 0;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
+ ohdr->u.at.aeth = ipath_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth =
+ cpu_to_be64(qp->s_ack_atomic);
+ hwords += sizeof(ohdr->u.at) / 4;
+ break;
+
+ default:
+ /* Send a regular ACK. */
+ ss = NULL;
+ len = 0;
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ bth0 |= qp->s_ack_state << 24;
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ }
+ bth2 = qp->s_ack_psn++ & 0xFFFFFF;
+ } else {
+ send_req:
+ if (!(state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
+ qp->s_rnr_timeout)
+ goto done;
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head)
+ goto done;
+ qp->s_psn = wqe->psn = qp->s_next_psn;
+ newreq = 1;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = 0;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ goto done;
+ }
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = IB_OPCODE_RC_SEND_FIRST;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = IB_OPCODE_RC_SEND_ONLY;
+ } else {
+ qp->s_state =
+ IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq)
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ goto done;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_WRITE_ONLY;
+ } else {
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = IB_OPCODE_RC_RDMA_READ_REQUEST;
+ hwords += sizeof(ohdr->u.rc.reth) / 4;
+ if (newreq) {
+ qp->s_lsn++;
+ /*
+ * Adjust s_next_psn to count the
+ * expected number of responses.
+ */
+ if (len > pmtu)
+ qp->s_next_psn +=
+ (len - 1) / pmtu;
+ wqe->lpsn = qp->s_next_psn++;
+ }
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ qp->s_state =
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ?
+ IB_OPCODE_RC_COMPARE_SWAP :
+ IB_OPCODE_RC_FETCH_ADD;
+ ohdr->u.atomic_eth.vaddr =
+ cpu_to_be64(wqe->wr.wr.atomic.remote_addr);
+ ohdr->u.atomic_eth.rkey =
+ cpu_to_be32(wqe->wr.wr.atomic.rkey);
+ ohdr->u.atomic_eth.swap_data =
+ cpu_to_be64(wqe->wr.wr.atomic.swap);
+ ohdr->u.atomic_eth.compare_data =
+ cpu_to_be64(wqe->wr.wr.atomic.compare_add);
+ hwords += sizeof(struct ib_atomic_eth) / 4;
+ if (newreq) {
+ qp->s_lsn++;
+ wqe->lpsn = wqe->psn;
+ }
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ ss = NULL;
+ len = 0;
+ break;
+
+ default:
+ goto done;
+ }
+ if (newreq) {
+ if (++qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ bth2 |= qp->s_psn++ & 0xFFFFFF;
+ if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ spin_lock(&dev->pending_lock);
+ if (qp->timerwait.next == LIST_POISON1) {
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ }
+ spin_unlock(&dev->pending_lock);
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ /*
+ * This case can only happen if a send is
+ * restarted. See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_SEND_FIRST:
+ qp->s_state = IB_OPCODE_RC_SEND_MIDDLE;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ bth2 = qp->s_psn++ & 0xFFFFFF;
+ if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ /*
+ * Request an ACK every 1/2 MB to avoid
+ * retransmit timeouts.
+ */
+ if (((wqe->length - len) % (512 * 1024)) == 0)
+ bth2 |= 1 << 31;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ else {
+ qp->s_state =
+ IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 |= 1 << 31; /* Request ACK. */
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+ /*
+ * This case can only happen if an RDMA write is
+ * restarted. See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+ qp->s_state = IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ /* FALLTHROUGH */
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ bth2 = qp->s_psn++ & 0xFFFFFF;
+ if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ /*
+ * Request an ACK every 1/2 MB to avoid
+ * retransmit timeouts.
+ */
+ if (((wqe->length - len) % (512 * 1024)) == 0)
+ bth2 |= 1 << 31;
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = IB_OPCODE_RC_RDMA_WRITE_LAST;
+ else {
+ qp->s_state =
+ IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE;
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 |= 1 << 31; /* Request ACK. */
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ /*
+ * This case can only happen if an RDMA read is
+ * restarted. See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ len = ((qp->s_psn - wqe->psn) & 0xFFFFFF) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+ qp->s_state = IB_OPCODE_RC_RDMA_READ_REQUEST;
+ hwords += sizeof(ohdr->u.rc.reth) / 4;
+ bth2 = qp->s_psn++ & 0xFFFFFF;
+ if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_REQUEST:
+ case IB_OPCODE_RC_COMPARE_SWAP:
+ case IB_OPCODE_RC_FETCH_ADD:
+ /*
+ * We shouldn't start anything new until this request
+ * is finished. The ACK will handle rescheduling us.
+ * XXX The number of outstanding ones is negotiated
+ * at connection setup time (see pg. 258,289)?
+ * XXX Also, if we support multiple outstanding
+ * requests, we need to check the WQE IB_SEND_FENCE
+ * flag and not send a new request if an RDMA read or
+ * atomic is pending.
+ */
+ goto done;
+ }
+ qp->s_len -= len;
+ bth0 |= qp->s_state << 24;
+ /* XXX queue resend timeout. */
+ }
+ /* Make sure it is non-zero before dropping the lock. */
+ qp->s_hdrwords = hwords;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Construct the header. */
+ extra_bytes = (4 - len) & 3;
+ nwords = (len + extra_bytes) >> 2;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ /* Header size in 32-bit words. */
+ hwords += 10;
+ lrh0 = IPS_LRH_GRH;
+ qp->s_hdr.u.l.grh.version_tclass_flow =
+ cpu_to_be32((6 << 28) |
+ (qp->remote_ah_attr.grh.traffic_class << 20) |
+ qp->remote_ah_attr.grh.flow_label);
+ qp->s_hdr.u.l.grh.paylen =
+ cpu_to_be16(((hwords - 12) + nwords + SIZE_OF_CRC) << 2);
+ qp->s_hdr.u.l.grh.next_hdr = 0x1B;
+ qp->s_hdr.u.l.grh.hop_limit = qp->remote_ah_attr.grh.hop_limit;
+ /* The SGID is 32-bit aligned. */
+ qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
+ qp->s_hdr.u.l.grh.sgid.global.interface_id =
+ ipath_layer_get_guid(dev->ib_unit);
+ qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
+ qp->s_hdrwords = hwords;
+ }
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ /* DEST LID */
+ qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->ib_unit));
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+
+ /* Check for more work to do. */
+ goto again;
+
+done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ clear_bit(IPATH_S_BUSY, &qp->s_flags);
+}