[openib-general] Re: [PATCH 10/13] [RFC] ipath verbs, part 1
Paul E. McKenney
paulmck at us.ibm.com
Sun Dec 18 11:59:22 PST 2005
On Fri, Dec 16, 2005 at 03:48:55PM -0800, Roland Dreier wrote:
> First half of ipath verbs driver
Some RCU-related questions interspersed. Basic question is "where is
the lock-free read-side traversal?"
Thanx, Paul
> ---
>
> drivers/infiniband/hw/ipath/ipath_verbs.c | 3244 +++++++++++++++++++++++++++++
> 1 files changed, 3244 insertions(+), 0 deletions(-)
> create mode 100644 drivers/infiniband/hw/ipath/ipath_verbs.c
>
> 72075ecec75f8c42e444a7d7d8ffcf340a845b96
> diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
> new file mode 100644
> index 0000000..808326e
> --- /dev/null
> +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
> @@ -0,0 +1,3244 @@
> +/*
> + * Copyright (c) 2005. PathScale, Inc. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses. You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + * - Redistributions of source code must retain the above
> + * copyright notice, this list of conditions and the following
> + * disclaimer.
> + *
> + * - Redistributions in binary form must reproduce the above
> + * copyright notice, this list of conditions and the following
> + * disclaimer in the documentation and/or other materials
> + * provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + * Patent licenses, if any, provided herein do not apply to
> + * combinations of this program with other software, or any other
> + * product whatsoever.
> + *
> + * $Id: ipath_verbs.c 4491 2005-12-15 22:20:31Z rjwalsh $
> + */
> +
> +#include <linux/config.h>
> +#include <linux/version.h>
> +#include <linux/pci.h>
> +#include <linux/err.h>
> +#include <rdma/ib_pack.h>
> +#include <rdma/ib_smi.h>
> +#include <rdma/ib_mad.h>
> +#include <rdma/ib_user_verbs.h>
> +
> +#include <asm/uaccess.h>
> +#include <asm-generic/bug.h>
> +
> +#include "ipath_common.h"
> +#include "ips_common.h"
> +#include "ipath_layer.h"
> +#include "ipath_verbs.h"
> +
> +/*
> + * Compare the lower 24 bits of the two values.
> + * Returns an integer <, ==, or > than zero.
> + */
> +static inline int cmp24(u32 a, u32 b)
> +{
> + return (((int) a) - ((int) b)) << 8;
> +}
> +
> +#define MODNAME "ib_ipath"
> +#define DRIVER_LOAD_MSG "PathScale " MODNAME " loaded: "
> +#define PFX MODNAME ": "
> +
> +
> +/* Not static, because we don't want the compiler removing it */
> +const char ipath_verbs_version[] = "ipath_verbs " _IPATH_IDSTR;
> +
> +unsigned int ib_ipath_qp_table_size = 251;
> +module_param(ib_ipath_qp_table_size, uint, 0444);
> +MODULE_PARM_DESC(ib_ipath_qp_table_size, "QP table size");
> +
> +unsigned int ib_ipath_lkey_table_size = 12;
> +module_param(ib_ipath_lkey_table_size, uint, 0444);
> +MODULE_PARM_DESC(ib_ipath_lkey_table_size,
> + "LKEY table size in bits (2^n, 1 <= n <= 23)");
> +
> +unsigned int ib_ipath_debug; /* debug mask */
> +module_param(ib_ipath_debug, uint, 0644);
> +MODULE_PARM_DESC(ib_ipath_debug, "Verbs debug mask");
> +
> +
> +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss,
> + u32 len, struct ib_send_wr *wr, struct ib_wc *wc);
> +static void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc);
> +static int ipath_destroy_qp(struct ib_qp *ibqp);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("PathScale <infinipath-support at pathscale.com>");
> +MODULE_DESCRIPTION("Pathscale InfiniPath driver");
> +
> +enum {
> + IPATH_FAULT_RC_DROP_SEND_F = 1,
> + IPATH_FAULT_RC_DROP_SEND_M,
> + IPATH_FAULT_RC_DROP_SEND_L,
> + IPATH_FAULT_RC_DROP_SEND_O,
> + IPATH_FAULT_RC_DROP_RDMA_WRITE_F,
> + IPATH_FAULT_RC_DROP_RDMA_WRITE_M,
> + IPATH_FAULT_RC_DROP_RDMA_WRITE_L,
> + IPATH_FAULT_RC_DROP_RDMA_WRITE_O,
> + IPATH_FAULT_RC_DROP_RDMA_READ_RESP_F,
> + IPATH_FAULT_RC_DROP_RDMA_READ_RESP_M,
> + IPATH_FAULT_RC_DROP_RDMA_READ_RESP_L,
> + IPATH_FAULT_RC_DROP_RDMA_READ_RESP_O,
> + IPATH_FAULT_RC_DROP_ACK,
> +};
> +
> +enum {
> + IPATH_TRANS_INVALID = 0,
> + IPATH_TRANS_ANY2RST,
> + IPATH_TRANS_RST2INIT,
> + IPATH_TRANS_INIT2INIT,
> + IPATH_TRANS_INIT2RTR,
> + IPATH_TRANS_RTR2RTS,
> + IPATH_TRANS_RTS2RTS,
> + IPATH_TRANS_SQERR2RTS,
> + IPATH_TRANS_ANY2ERR,
> + IPATH_TRANS_RTS2SQD, /* XXX Wait for expected ACKs & signal event */
> + IPATH_TRANS_SQD2SQD, /* error if not drained & parameter change */
> + IPATH_TRANS_SQD2RTS, /* error if not drained */
> +};
> +
> +enum {
> + IPATH_POST_SEND_OK = 0x0001,
> + IPATH_POST_RECV_OK = 0x0002,
> + IPATH_PROCESS_RECV_OK = 0x0004,
> + IPATH_PROCESS_SEND_OK = 0x0008,
> +};
> +
> +static int state_ops[IB_QPS_ERR + 1] = {
> + [IB_QPS_RESET] = 0,
> + [IB_QPS_INIT] = IPATH_POST_RECV_OK,
> + [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
> + [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
> + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
> + [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
> + IPATH_POST_SEND_OK,
> + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
> + [IB_QPS_ERR] = 0,
> +};
> +
> +/*
> + * Convert the AETH credit code into the number of credits.
> + */
> +static u32 credit_table[31] = {
> + 0, /* 0 */
> + 1, /* 1 */
> + 2, /* 2 */
> + 3, /* 3 */
> + 4, /* 4 */
> + 6, /* 5 */
> + 8, /* 6 */
> + 12, /* 7 */
> + 16, /* 8 */
> + 24, /* 9 */
> + 32, /* A */
> + 48, /* B */
> + 64, /* C */
> + 96, /* D */
> + 128, /* E */
> + 192, /* F */
> + 256, /* 10 */
> + 384, /* 11 */
> + 512, /* 12 */
> + 768, /* 13 */
> + 1024, /* 14 */
> + 1536, /* 15 */
> + 2048, /* 16 */
> + 3072, /* 17 */
> + 4096, /* 18 */
> + 6144, /* 19 */
> + 8192, /* 1A */
> + 12288, /* 1B */
> + 16384, /* 1C */
> + 24576, /* 1D */
> + 32768 /* 1E */
> +};
> +
> +/*
> + * Convert the AETH RNR timeout code into the number of milliseconds.
> + */
> +static u32 rnr_table[32] = {
> + 656, /* 0 */
> + 1, /* 1 */
> + 1, /* 2 */
> + 1, /* 3 */
> + 1, /* 4 */
> + 1, /* 5 */
> + 1, /* 6 */
> + 1, /* 7 */
> + 1, /* 8 */
> + 1, /* 9 */
> + 1, /* A */
> + 1, /* B */
> + 1, /* C */
> + 1, /* D */
> + 2, /* E */
> + 2, /* F */
> + 3, /* 10 */
> + 4, /* 11 */
> + 6, /* 12 */
> + 8, /* 13 */
> + 11, /* 14 */
> + 16, /* 15 */
> + 21, /* 16 */
> + 31, /* 17 */
> + 41, /* 18 */
> + 62, /* 19 */
> + 82, /* 1A */
> + 123, /* 1B */
> + 164, /* 1C */
> + 246, /* 1D */
> + 328, /* 1E */
> + 492 /* 1F */
> +};
> +
> +/*
> + * Translate ib_wr_opcode into ib_wc_opcode.
> + */
> +static enum ib_wc_opcode wc_opcode[] = {
> + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
> + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
> + [IB_WR_SEND] = IB_WC_SEND,
> + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
> + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
> + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
> + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
> +};
> +
> +/*
> + * Array of device pointers.
> + */
> +static uint32_t number_of_devices;
> +static struct ipath_ibdev **ipath_devices;
> +
> +/*
> + * Global table of GID to attached QPs.
> + * The table is global to all ipath devices since a send from one QP/device
> + * needs to be locally routed to any locally attached QPs on the same
> + * or different device.
> + */
> +static struct rb_root mcast_tree;
> +static spinlock_t mcast_lock = SPIN_LOCK_UNLOCKED;
> +
> +/*
> + * Allocate a structure to link a QP to the multicast GID structure.
> + */
> +static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
> +{
> + struct ipath_mcast_qp *mqp;
> +
> + mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
> + if (!mqp)
> + return NULL;
> +
> + mqp->qp = qp;
> + atomic_inc(&qp->refcount);
> +
> + return mqp;
> +}
> +
> +static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
> +{
> + struct ipath_qp *qp = mqp->qp;
> +
> + /* Notify ipath_destroy_qp() if it is waiting. */
> + if (atomic_dec_and_test(&qp->refcount))
> + wake_up(&qp->wait);
> +
> + kfree(mqp);
> +}
> +
> +/*
> + * Allocate a structure for the multicast GID.
> + * A list of QPs will be attached to this structure.
> + */
> +static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
> +{
> + struct ipath_mcast *mcast;
> +
> + mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
> + if (!mcast)
> + return NULL;
> +
> + mcast->mgid = *mgid;
> + INIT_LIST_HEAD(&mcast->qp_list);
> + init_waitqueue_head(&mcast->wait);
> + atomic_set(&mcast->refcount, 0);
> +
> + return mcast;
> +}
> +
> +static void ipath_mcast_free(struct ipath_mcast *mcast)
> +{
> + struct ipath_mcast_qp *p, *tmp;
> +
> + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
> + ipath_mcast_qp_free(p);
> +
> + kfree(mcast);
> +}
> +
> +/*
> + * Search the global table for the given multicast GID.
> + * Return it or NULL if not found.
> + * The caller is responsible for decrementing the reference count if found.
> + */
> +static struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
> +{
> + struct rb_node *n;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&mcast_lock, flags);
> + n = mcast_tree.rb_node;
> + while (n) {
> + struct ipath_mcast *mcast;
> + int ret;
> +
> + mcast = rb_entry(n, struct ipath_mcast, rb_node);
> +
> + ret = memcmp(mgid->raw, mcast->mgid.raw, sizeof(union ib_gid));
> + if (ret < 0)
> + n = n->rb_left;
> + else if (ret > 0)
> + n = n->rb_right;
> + else {
> + atomic_inc(&mcast->refcount);
> + spin_unlock_irqrestore(&mcast_lock, flags);
> + return mcast;
> + }
> + }
> + spin_unlock_irqrestore(&mcast_lock, flags);
> +
> + return NULL;
> +}
> +
> +/*
> + * Insert the multicast GID into the table and
> + * attach the QP structure.
> + * Return zero if both were added.
> + * Return EEXIST if the GID was already in the table but the QP was added.
> + * Return ESRCH if the QP was already attached and neither structure was added.
> + */
> +static int ipath_mcast_add(struct ipath_mcast *mcast,
> + struct ipath_mcast_qp *mqp)
> +{
> + struct rb_node **n = &mcast_tree.rb_node;
> + struct rb_node *pn = NULL;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&mcast_lock, flags);
> +
> + while (*n) {
> + struct ipath_mcast *tmcast;
> + struct ipath_mcast_qp *p;
> + int ret;
> +
> + pn = *n;
> + tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
> +
> + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
> + sizeof(union ib_gid));
> + if (ret < 0) {
> + n = &pn->rb_left;
> + continue;
> + }
> + if (ret > 0) {
> + n = &pn->rb_right;
> + continue;
> + }
> +
> + /* Search the QP list to see if this is already there. */
> + list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
Given that we hold the global mcast_lock, how is RCU helping here?
Is there a lock-free read-side traversal path somewhere that I am
missing?
> + if (p->qp == mqp->qp) {
> + spin_unlock_irqrestore(&mcast_lock, flags);
> + return ESRCH;
> + }
> + }
> + list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
Ditto...
> + spin_unlock_irqrestore(&mcast_lock, flags);
> + return EEXIST;
> + }
> +
> + list_add_tail_rcu(&mqp->list, &mcast->qp_list);
Ditto...
> + spin_unlock_irqrestore(&mcast_lock, flags);
> +
> + atomic_inc(&mcast->refcount);
> + rb_link_node(&mcast->rb_node, pn, n);
> + rb_insert_color(&mcast->rb_node, &mcast_tree);
> +
> + spin_unlock_irqrestore(&mcast_lock, flags);
> +
> + return 0;
> +}
> +
> +static int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid,
> + u16 lid)
> +{
> + struct ipath_qp *qp = to_iqp(ibqp);
> + struct ipath_mcast *mcast;
> + struct ipath_mcast_qp *mqp;
> +
> + /*
> + * Allocate data structures since its better to do this outside of
> + * spin locks and it will most likely be needed.
> + */
> + mcast = ipath_mcast_alloc(gid);
> + if (mcast == NULL)
> + return -ENOMEM;
> + mqp = ipath_mcast_qp_alloc(qp);
> + if (mqp == NULL) {
> + ipath_mcast_free(mcast);
> + return -ENOMEM;
> + }
> + switch (ipath_mcast_add(mcast, mqp)) {
> + case ESRCH:
> + /* Neither was used: can't attach the same QP twice. */
> + ipath_mcast_qp_free(mqp);
> + ipath_mcast_free(mcast);
> + return -EINVAL;
> + case EEXIST: /* The mcast wasn't used */
> + ipath_mcast_free(mcast);
> + break;
> + default:
> + break;
> + }
> + return 0;
> +}
> +
> +static int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid,
> + u16 lid)
> +{
> + struct ipath_qp *qp = to_iqp(ibqp);
> + struct ipath_mcast *mcast = NULL;
> + struct ipath_mcast_qp *p, *tmp;
> + struct rb_node *n;
> + unsigned long flags;
> + int last = 0;
> +
> + spin_lock_irqsave(&mcast_lock, flags);
> +
> + /* Find the GID in the mcast table. */
> + n = mcast_tree.rb_node;
> + while (1) {
> + int ret;
> +
> + if (n == NULL) {
> + spin_unlock_irqrestore(&mcast_lock, flags);
> + return 0;
> + }
> +
> + mcast = rb_entry(n, struct ipath_mcast, rb_node);
> + ret = memcmp(gid->raw, mcast->mgid.raw, sizeof(union ib_gid));
> + if (ret < 0)
> + n = n->rb_left;
> + else if (ret > 0)
> + n = n->rb_right;
> + else
> + break;
> + }
> +
> + /* Search the QP list. */
> + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
> + if (p->qp != qp)
> + continue;
> + /*
> + * We found it, so remove it, but don't poison the forward link
> + * until we are sure there are no list walkers.
> + */
> + list_del_rcu(&p->list);
Ditto...
> + spin_unlock_irqrestore(&mcast_lock, flags);
> +
> + /* If this was the last attached QP, remove the GID too. */
> + if (list_empty(&mcast->qp_list)) {
> + rb_erase(&mcast->rb_node, &mcast_tree);
> + last = 1;
> + }
> + break;
> + }
> +
> + spin_unlock_irqrestore(&mcast_lock, flags);
> +
> + if (p) {
> + /*
> + * Wait for any list walkers to finish before freeing the
> + * list element.
> + */
> + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
> + ipath_mcast_qp_free(p);
> + }
> + if (last) {
> + atomic_dec(&mcast->refcount);
> + wait_event(mcast->wait, !atomic_read(&mcast->refcount));
> + ipath_mcast_free(mcast);
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * Copy data to SGE memory.
> + */
> +static void copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
> +{
> + struct ipath_sge *sge = &ss->sge;
> +
> + while (length) {
> + u32 len = sge->length;
> +
> + BUG_ON(len == 0);
> + if (len > length)
> + len = length;
> + memcpy(sge->vaddr, data, len);
> + sge->vaddr += len;
> + sge->length -= len;
> + sge->sge_length -= len;
> + if (sge->sge_length == 0) {
> + if (--ss->num_sge)
> + *sge = *ss->sg_list++;
> + } else if (sge->length == 0 && sge->mr != NULL) {
> + if (++sge->n >= IPATH_SEGSZ) {
> + if (++sge->m >= sge->mr->mapsz)
> + break;
> + sge->n = 0;
> + }
> + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
> + sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
> + }
> + data += len;
> + length -= len;
> + }
> +}
> +
> +/*
> + * Skip over length bytes of SGE memory.
> + */
> +static void skip_sge(struct ipath_sge_state *ss, u32 length)
> +{
> + struct ipath_sge *sge = &ss->sge;
> +
> + while (length > sge->sge_length) {
> + length -= sge->sge_length;
> + ss->sge = *ss->sg_list++;
> + }
> + while (length) {
> + u32 len = sge->length;
> +
> + BUG_ON(len == 0);
> + if (len > length)
> + len = length;
> + sge->vaddr += len;
> + sge->length -= len;
> + sge->sge_length -= len;
> + if (sge->sge_length == 0) {
> + if (--ss->num_sge)
> + *sge = *ss->sg_list++;
> + } else if (sge->length == 0 && sge->mr != NULL) {
> + if (++sge->n >= IPATH_SEGSZ) {
> + if (++sge->m >= sge->mr->mapsz)
> + break;
> + sge->n = 0;
> + }
> + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
> + sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
> + }
> + length -= len;
> + }
> +}
> +
> +static inline u32 alloc_qpn(struct ipath_qp_table *qpt)
> +{
> + u32 i, offset, max_scan, qpn;
> + struct qpn_map *map;
> +
> + qpn = qpt->last + 1;
> + if (qpn >= QPN_MAX)
> + qpn = 2;
> + offset = qpn & BITS_PER_PAGE_MASK;
> + map = &qpt->map[qpn / BITS_PER_PAGE];
> + max_scan = qpt->nmaps - !offset;
> + for (i = 0;;) {
> + if (unlikely(!map->page)) {
> + unsigned long page = get_zeroed_page(GFP_KERNEL);
> + unsigned long flags;
> +
> + /*
> + * Free the page if someone raced with us
> + * installing it:
> + */
> + spin_lock_irqsave(&qpt->lock, flags);
> + if (map->page)
> + free_page(page);
> + else
> + map->page = (void *)page;
> + spin_unlock_irqrestore(&qpt->lock, flags);
> + if (unlikely(!map->page))
> + break;
> + }
> + if (likely(atomic_read(&map->n_free))) {
> + do {
> + if (!test_and_set_bit(offset, map->page)) {
> + atomic_dec(&map->n_free);
> + qpt->last = qpn;
> + return qpn;
> + }
> + offset = find_next_offset(map, offset);
> + qpn = mk_qpn(qpt, map, offset);
> + /*
> + * This test differs from alloc_pidmap().
> + * If find_next_offset() does find a zero bit,
> + * we don't need to check for QPN wrapping
> + * around past our starting QPN. We
> + * just need to be sure we don't loop forever.
> + */
> + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
> + }
> + /*
> + * In order to keep the number of pages allocated to a minimum,
> + * we scan the all existing pages before increasing the size
> + * of the bitmap table.
> + */
> + if (++i > max_scan) {
> + if (qpt->nmaps == QPNMAP_ENTRIES)
> + break;
> + map = &qpt->map[qpt->nmaps++];
> + offset = 0;
> + } else if (map < &qpt->map[qpt->nmaps]) {
> + ++map;
> + offset = 0;
> + } else {
> + map = &qpt->map[0];
> + offset = 2;
> + }
> + qpn = mk_qpn(qpt, map, offset);
> + }
> + return 0;
> +}
> +
> +static inline void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
> +{
> + struct qpn_map *map;
> +
> + map = qpt->map + qpn / BITS_PER_PAGE;
> + if (map->page)
> + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
> + atomic_inc(&map->n_free);
> +}
> +
> +/*
> + * Allocate the next available QPN and put the QP into the hash table.
> + * The hash table holds a reference to the QP.
> + */
> +static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
> + enum ib_qp_type type)
> +{
> + unsigned long flags;
> + u32 qpn;
> +
> + if (type == IB_QPT_SMI)
> + qpn = 0;
> + else if (type == IB_QPT_GSI)
> + qpn = 1;
> + else {
> + /* Allocate the next available QPN */
> + qpn = alloc_qpn(qpt);
> + if (qpn == 0) {
> + return -ENOMEM;
> + }
> + }
> + qp->ibqp.qp_num = qpn;
> +
> + /* Add the QP to the hash table. */
> + spin_lock_irqsave(&qpt->lock, flags);
> +
> + qpn %= qpt->max;
> + qp->next = qpt->table[qpn];
> + qpt->table[qpn] = qp;
> + atomic_inc(&qp->refcount);
> +
> + spin_unlock_irqrestore(&qpt->lock, flags);
> + return 0;
> +}
> +
> +/*
> + * Remove the QP from the table so it can't be found asynchronously by
> + * the receive interrupt routine.
> + */
> +static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
> +{
> + struct ipath_qp *q, **qpp;
> + unsigned long flags;
> + int fnd = 0;
> +
> + spin_lock_irqsave(&qpt->lock, flags);
> +
> + /* Remove QP from the hash table. */
> + qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
> + for (; (q = *qpp) != NULL; qpp = &q->next) {
> + if (q == qp) {
> + *qpp = qp->next;
> + qp->next = NULL;
> + atomic_dec(&qp->refcount);
> + fnd = 1;
> + break;
> + }
> + }
> +
> + spin_unlock_irqrestore(&qpt->lock, flags);
> +
> + if (!fnd)
> + return;
> +
> + /* If QPN is not reserved, mark QPN free in the bitmap. */
> + if (qp->ibqp.qp_num > 1)
> + free_qpn(qpt, qp->ibqp.qp_num);
> +
> + wait_event(qp->wait, !atomic_read(&qp->refcount));
> +}
> +
> +/*
> + * Remove all QPs from the table.
> + */
> +static void ipath_free_all_qps(struct ipath_qp_table *qpt)
> +{
> + unsigned long flags;
> + struct ipath_qp *qp, *nqp;
> + u32 n;
> +
> + for (n = 0; n < qpt->max; n++) {
> + spin_lock_irqsave(&qpt->lock, flags);
> + qp = qpt->table[n];
> + qpt->table[n] = NULL;
> + spin_unlock_irqrestore(&qpt->lock, flags);
> +
> + while (qp) {
> + nqp = qp->next;
> + if (qp->ibqp.qp_num > 1)
> + free_qpn(qpt, qp->ibqp.qp_num);
> + if (!atomic_dec_and_test(&qp->refcount) ||
> + !ipath_destroy_qp(&qp->ibqp))
> + _VERBS_INFO("QP memory leak!\n");
> + qp = nqp;
> + }
> + }
> +
> + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) {
> + if (qpt->map[n].page)
> + free_page((unsigned long)qpt->map[n].page);
> + }
> +}
> +
> +/*
> + * Return the QP with the given QPN.
> + * The caller is responsible for decrementing the QP reference count when done.
> + */
> +static struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
> +{
> + unsigned long flags;
> + struct ipath_qp *qp;
> +
> + spin_lock_irqsave(&qpt->lock, flags);
> +
> + for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
> + if (qp->ibqp.qp_num == qpn) {
> + atomic_inc(&qp->refcount);
> + break;
> + }
> + }
> +
> + spin_unlock_irqrestore(&qpt->lock, flags);
> + return qp;
> +}
> +
> +static int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
> + struct ipath_mregion *mr)
> +{
> + unsigned long flags;
> + u32 r;
> + u32 n;
> +
> + spin_lock_irqsave(&rkt->lock, flags);
> +
> + /* Find the next available LKEY */
> + r = n = rkt->next;
> + for (;;) {
> + if (rkt->table[r] == NULL)
> + break;
> + r = (r + 1) & (rkt->max - 1);
> + if (r == n) {
> + spin_unlock_irqrestore(&rkt->lock, flags);
> + _VERBS_INFO("LKEY table full\n");
> + return 0;
> + }
> + }
> + rkt->next = (r + 1) & (rkt->max - 1);
> + /*
> + * Make sure lkey is never zero which is reserved to indicate an
> + * unrestricted LKEY.
> + */
> + rkt->gen++;
> + mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
> + ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) << 8);
> + if (mr->lkey == 0) {
> + mr->lkey |= 1 << 8;
> + rkt->gen++;
> + }
> + rkt->table[r] = mr;
> + spin_unlock_irqrestore(&rkt->lock, flags);
> +
> + return 1;
> +}
> +
> +static void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + if (lkey == 0)
> + return;
> + r = lkey >> (32 - ib_ipath_lkey_table_size);
> + spin_lock_irqsave(&rkt->lock, flags);
> + rkt->table[r] = NULL;
> + spin_unlock_irqrestore(&rkt->lock, flags);
> +}
> +
> +/*
> + * Check the IB SGE for validity and initialize our internal version of it.
> + * Return 1 if OK, else zero.
> + */
> +static int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
> + struct ib_sge *sge, int acc)
> +{
> + struct ipath_mregion *mr;
> + size_t off;
> +
> + /*
> + * We use LKEY == zero to mean a physical kmalloc() address.
> + * This is a bit of a hack since we rely on dma_map_single()
> + * being reversible by calling bus_to_virt().
> + */
> + if (sge->lkey == 0) {
> + isge->mr = NULL;
> + isge->vaddr = bus_to_virt(sge->addr);
> + isge->length = sge->length;
> + isge->sge_length = sge->length;
> + return 1;
> + }
> + spin_lock(&rkt->lock);
> + mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
> + spin_unlock(&rkt->lock);
> + if (unlikely(mr == NULL || mr->lkey != sge->lkey))
> + return 0;
> +
> + off = sge->addr - mr->user_base;
> + if (unlikely(sge->addr < mr->user_base ||
> + off + sge->length > mr->length ||
> + (mr->access_flags & acc) != acc))
> + return 0;
> +
> + off += mr->offset;
> + isge->mr = mr;
> + isge->m = 0;
> + isge->n = 0;
> + while (off >= mr->map[isge->m]->segs[isge->n].length) {
> + off -= mr->map[isge->m]->segs[isge->n].length;
> + if (++isge->n >= IPATH_SEGSZ) {
> + isge->m++;
> + isge->n = 0;
> + }
> + }
> + isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off;
> + isge->length = mr->map[isge->m]->segs[isge->n].length - off;
> + isge->sge_length = sge->length;
> + return 1;
> +}
> +
> +/*
> + * Initialize the qp->s_sge after a restart.
> + * The QP s_lock should be held.
> + */
> +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
> +{
> + struct ipath_ibdev *dev;
> + u32 len;
> +
> + len = ((qp->s_psn - wqe->psn) & 0xFFFFFF) *
> + ib_mtu_enum_to_int(qp->path_mtu);
> + qp->s_sge.sge = wqe->sg_list[0];
> + qp->s_sge.sg_list = wqe->sg_list + 1;
> + qp->s_sge.num_sge = wqe->wr.num_sge;
> + skip_sge(&qp->s_sge, len);
> + qp->s_len = wqe->length - len;
> + dev = to_idev(qp->ibqp.device);
> + spin_lock(&dev->pending_lock);
> + if (qp->timerwait.next == LIST_POISON1)
> + list_add_tail(&qp->timerwait,
> + &dev->pending[dev->pending_index]);
> + spin_unlock(&dev->pending_lock);
> +}
> +
> +/*
> + * Check the IB virtual address, length, and RKEY.
> + * Return 1 if OK, else zero.
> + * The QP r_rq.lock should be held.
> + */
> +static int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
> + u32 len, u64 vaddr, u32 rkey, int acc)
> +{
> + struct ipath_lkey_table *rkt = &dev->lk_table;
> + struct ipath_sge *sge = &ss->sge;
> + struct ipath_mregion *mr;
> + size_t off;
> +
> + spin_lock(&rkt->lock);
> + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
> + spin_unlock(&rkt->lock);
> + if (unlikely(mr == NULL || mr->lkey != rkey))
> + return 0;
> +
> + off = vaddr - mr->iova;
> + if (unlikely(vaddr < mr->iova || off + len > mr->length ||
> + (mr->access_flags & acc) == 0))
> + return 0;
> +
> + off += mr->offset;
> + sge->mr = mr;
> + sge->m = 0;
> + sge->n = 0;
> + while (off >= mr->map[sge->m]->segs[sge->n].length) {
> + off -= mr->map[sge->m]->segs[sge->n].length;
> + if (++sge->n >= IPATH_SEGSZ) {
> + sge->m++;
> + sge->n = 0;
> + }
> + }
> + sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off;
> + sge->length = mr->map[sge->m]->segs[sge->n].length - off;
> + sge->sge_length = len;
> + ss->sg_list = NULL;
> + ss->num_sge = 1;
> + return 1;
> +}
> +
> +/*
> + * Add a new entry to the completion queue.
> + * This may be called with one of the qp->s_lock or qp->r_rq.lock held.
> + */
> +static void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig)
> +{
> + unsigned long flags;
> + u32 next;
> +
> + spin_lock_irqsave(&cq->lock, flags);
> +
> + cq->queue[cq->head] = *entry;
> + next = cq->head + 1;
> + if (next == cq->ibcq.cqe)
> + next = 0;
> + if (next != cq->tail)
> + cq->head = next;
> + else {
> + /* XXX - need to mark current wr as having an error... */
> + }
> +
> + if (cq->notify == IB_CQ_NEXT_COMP ||
> + (cq->notify == IB_CQ_SOLICITED && sig)) {
> + cq->notify = IB_CQ_NONE;
> + cq->triggered++;
> + /*
> + * This will cause send_complete() to be called in
> + * another thread.
> + */
> + tasklet_schedule(&cq->comptask);
> + }
> +
> + spin_unlock_irqrestore(&cq->lock, flags);
> +
> + if (entry->status != IB_WC_SUCCESS)
> + to_idev(cq->ibcq.device)->n_wqe_errs++;
> +}
> +
> +static void send_complete(unsigned long data)
> +{
> + struct ipath_cq *cq = (struct ipath_cq *)data;
> +
> + /*
> + * The completion handler will most likely rearm the notification
> + * and poll for all pending entries. If a new completion entry
> + * is added while we are in this routine, tasklet_schedule()
> + * won't call us again until we return so we check triggered to
> + * see if we need to call the handler again.
> + */
> + for (;;) {
> + u8 triggered = cq->triggered;
> +
> + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +
> + if (cq->triggered == triggered)
> + return;
> + }
> +}
> +
> +/*
> + * This is the QP state transition table.
> + * See ipath_modify_qp() for details.
> + */
> +static const struct {
> + int trans;
> + u32 req_param[IB_QPT_RAW_IPV6];
> + u32 opt_param[IB_QPT_RAW_IPV6];
> +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
> + [IB_QPS_RESET] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_INIT] = {
> + .trans = IPATH_TRANS_RST2INIT,
> + .req_param = {
> + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_ACCESS_FLAGS),
> + [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_ACCESS_FLAGS),
> + },
> + },
> + },
> + [IB_QPS_INIT] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_INIT] = {
> + .trans = IPATH_TRANS_INIT2INIT,
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_ACCESS_FLAGS),
> + [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
> + IB_QP_PORT |
> + IB_QP_ACCESS_FLAGS),
> + }
> + },
> + [IB_QPS_RTR] = {
> + .trans = IPATH_TRANS_INIT2RTR,
> + .req_param = {
> + [IB_QPT_UC] = (IB_QP_AV |
> + IB_QP_PATH_MTU |
> + IB_QP_DEST_QPN |
> + IB_QP_RQ_PSN),
> + [IB_QPT_RC] = (IB_QP_AV |
> + IB_QP_PATH_MTU |
> + IB_QP_DEST_QPN |
> + IB_QP_RQ_PSN |
> + IB_QP_MAX_DEST_RD_ATOMIC |
> + IB_QP_MIN_RNR_TIMER),
> + },
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
> + IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX),
> + [IB_QPT_RC] = (IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX),
> + }
> + }
> + },
> + [IB_QPS_RTR] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_RTS] = {
> + .trans = IPATH_TRANS_RTR2RTS,
> + .req_param = {
> + [IB_QPT_SMI] = IB_QP_SQ_PSN,
> + [IB_QPT_GSI] = IB_QP_SQ_PSN,
> + [IB_QPT_UD] = IB_QP_SQ_PSN,
> + [IB_QPT_UC] = IB_QP_SQ_PSN,
> + [IB_QPT_RC] = (IB_QP_TIMEOUT |
> + IB_QP_RETRY_CNT |
> + IB_QP_RNR_RETRY |
> + IB_QP_SQ_PSN |
> + IB_QP_MAX_QP_RD_ATOMIC),
> + },
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX |
> + IB_QP_PATH_MIG_STATE),
> + [IB_QPT_RC] = (IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX |
> + IB_QP_MIN_RNR_TIMER |
> + IB_QP_PATH_MIG_STATE),
> + }
> + }
> + },
> + [IB_QPS_RTS] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_RTS] = {
> + .trans = IPATH_TRANS_RTS2RTS,
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_ACCESS_FLAGS |
> + IB_QP_ALT_PATH |
> + IB_QP_PATH_MIG_STATE),
> + [IB_QPT_RC] = (IB_QP_ACCESS_FLAGS |
> + IB_QP_ALT_PATH |
> + IB_QP_PATH_MIG_STATE |
> + IB_QP_MIN_RNR_TIMER),
> + }
> + },
> + [IB_QPS_SQD] = {
> + .trans = IPATH_TRANS_RTS2SQD,
> + },
> + },
> + [IB_QPS_SQD] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_RTS] = {
> + .trans = IPATH_TRANS_SQD2RTS,
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PATH_MIG_STATE),
> + [IB_QPT_RC] = (IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_MIN_RNR_TIMER |
> + IB_QP_PATH_MIG_STATE),
> + }
> + },
> + [IB_QPS_SQD] = {
> + .trans = IPATH_TRANS_SQD2SQD,
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
> + [IB_QPT_UC] = (IB_QP_AV |
> + IB_QP_TIMEOUT |
> + IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX |
> + IB_QP_PATH_MIG_STATE),
> + [IB_QPT_RC] = (IB_QP_AV |
> + IB_QP_TIMEOUT |
> + IB_QP_RETRY_CNT |
> + IB_QP_RNR_RETRY |
> + IB_QP_MAX_QP_RD_ATOMIC |
> + IB_QP_MAX_DEST_RD_ATOMIC |
> + IB_QP_CUR_STATE |
> + IB_QP_ALT_PATH |
> + IB_QP_ACCESS_FLAGS |
> + IB_QP_PKEY_INDEX |
> + IB_QP_MIN_RNR_TIMER |
> + IB_QP_PATH_MIG_STATE),
> + }
> + }
> + },
> + [IB_QPS_SQE] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR },
> + [IB_QPS_RTS] = {
> + .trans = IPATH_TRANS_SQERR2RTS,
> + .opt_param = {
> + [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
> + [IB_QPT_UC] = IB_QP_CUR_STATE,
> + [IB_QPT_RC] = (IB_QP_CUR_STATE |
> + IB_QP_MIN_RNR_TIMER),
> + }
> + }
> + },
> + [IB_QPS_ERR] = {
> + [IB_QPS_RESET] = { .trans = IPATH_TRANS_ANY2RST },
> + [IB_QPS_ERR] = { .trans = IPATH_TRANS_ANY2ERR }
> + }
> +};
> +
> +/*
> + * Initialize the QP state to the reset state.
> + */
> +static void ipath_reset_qp(struct ipath_qp *qp)
> +{
> + qp->remote_qpn = 0;
> + qp->qkey = 0;
> + qp->qp_access_flags = 0;
> + qp->s_hdrwords = 0;
> + qp->s_psn = 0;
> + qp->r_psn = 0;
> + atomic_set(&qp->msn, 0);
> + if (qp->ibqp.qp_type == IB_QPT_RC) {
> + qp->s_state = IB_OPCODE_RC_SEND_LAST;
> + qp->r_state = IB_OPCODE_RC_SEND_LAST;
> + } else {
> + qp->s_state = IB_OPCODE_UC_SEND_LAST;
> + qp->r_state = IB_OPCODE_UC_SEND_LAST;
> + }
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> + qp->s_nak_state = 0;
> + qp->s_rnr_timeout = 0;
> + qp->s_head = 0;
> + qp->s_tail = 0;
> + qp->s_cur = 0;
> + qp->s_last = 0;
> + qp->s_ssn = 1;
> + qp->s_lsn = 0;
> + qp->r_rq.head = 0;
> + qp->r_rq.tail = 0;
> + qp->r_reuse_sge = 0;
> +}
> +
> +/*
> + * Flush send work queue.
> + * The QP s_lock should be held.
> + */
> +static void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc)
> +{
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
> +
> + _VERBS_INFO("Send queue error on QP%d/%d: err: %d\n",
> + qp->ibqp.qp_num, qp->remote_qpn, wc->status);
> +
> + spin_lock(&dev->pending_lock);
> + /* XXX What if its already removed by the timeout code? */
> + if (qp->timerwait.next != LIST_POISON1)
> + list_del(&qp->timerwait);
> + if (qp->piowait.next != LIST_POISON1)
> + list_del(&qp->piowait);
> + spin_unlock(&dev->pending_lock);
> +
> + ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
> + if (++qp->s_last >= qp->s_size)
> + qp->s_last = 0;
> +
> + wc->status = IB_WC_WR_FLUSH_ERR;
> +
> + while (qp->s_last != qp->s_head) {
> + wc->wr_id = wqe->wr.wr_id;
> + wc->opcode = wc_opcode[wqe->wr.opcode];
> + ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
> + if (++qp->s_last >= qp->s_size)
> + qp->s_last = 0;
> + wqe = get_swqe_ptr(qp, qp->s_last);
> + }
> + qp->s_cur = qp->s_tail = qp->s_head;
> + qp->state = IB_QPS_SQE;
> +}
> +
> +/*
> + * Flush both send and receive work queues.
> + * QP r_rq.lock and s_lock should be held.
> + */
> +static void ipath_error_qp(struct ipath_qp *qp)
> +{
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + struct ib_wc wc;
> +
> + _VERBS_INFO("QP%d/%d in error state\n",
> + qp->ibqp.qp_num, qp->remote_qpn);
> +
> + spin_lock(&dev->pending_lock);
> + /* XXX What if its already removed by the timeout code? */
> + if (qp->timerwait.next != LIST_POISON1)
> + list_del(&qp->timerwait);
> + if (qp->piowait.next != LIST_POISON1)
> + list_del(&qp->piowait);
> + spin_unlock(&dev->pending_lock);
> +
> + wc.status = IB_WC_WR_FLUSH_ERR;
> + wc.vendor_err = 0;
> + wc.byte_len = 0;
> + wc.imm_data = 0;
> + wc.qp_num = qp->ibqp.qp_num;
> + wc.src_qp = 0;
> + wc.wc_flags = 0;
> + wc.pkey_index = 0;
> + wc.slid = 0;
> + wc.sl = 0;
> + wc.dlid_path_bits = 0;
> + wc.port_num = 0;
> +
> + while (qp->s_last != qp->s_head) {
> + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
> +
> + wc.wr_id = wqe->wr.wr_id;
> + wc.opcode = wc_opcode[wqe->wr.opcode];
> + if (++qp->s_last >= qp->s_size)
> + qp->s_last = 0;
> + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
> + }
> + qp->s_cur = qp->s_tail = qp->s_head;
> + qp->s_hdrwords = 0;
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> +
> + wc.opcode = IB_WC_RECV;
> + while (qp->r_rq.tail != qp->r_rq.head) {
> + wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id;
> + if (++qp->r_rq.tail >= qp->r_rq.size)
> + qp->r_rq.tail = 0;
> + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
> + }
> +}
> +
> +static int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
> + int attr_mask)
> +{
> + struct ipath_qp *qp = to_iqp(ibqp);
> + enum ib_qp_state cur_state, new_state;
> + u32 req_param, opt_param;
> + unsigned long flags;
> +
> + if (attr_mask & IB_QP_CUR_STATE) {
> + cur_state = attr->cur_qp_state;
> + if (cur_state != IB_QPS_RTR &&
> + cur_state != IB_QPS_RTS &&
> + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
> + return -EINVAL;
> + spin_lock_irqsave(&qp->r_rq.lock, flags);
> + spin_lock(&qp->s_lock);
> + } else {
> + spin_lock_irqsave(&qp->r_rq.lock, flags);
> + spin_lock(&qp->s_lock);
> + cur_state = qp->state;
> + }
> +
> + if (attr_mask & IB_QP_STATE) {
> + new_state = attr->qp_state;
> + if (new_state < 0 || new_state > IB_QPS_ERR)
> + goto inval;
> + } else
> + new_state = cur_state;
> +
> + switch (qp_state_table[cur_state][new_state].trans) {
> + case IPATH_TRANS_INVALID:
> + goto inval;
> +
> + case IPATH_TRANS_ANY2RST:
> + ipath_reset_qp(qp);
> + break;
> +
> + case IPATH_TRANS_ANY2ERR:
> + ipath_error_qp(qp);
> + break;
> +
> + }
> +
> + req_param =
> + qp_state_table[cur_state][new_state].req_param[qp->ibqp.qp_type];
> + opt_param =
> + qp_state_table[cur_state][new_state].opt_param[qp->ibqp.qp_type];
> +
> + if ((req_param & attr_mask) != req_param)
> + goto inval;
> +
> + if (attr_mask & ~(req_param | opt_param | IB_QP_STATE))
> + goto inval;
> +
> + if (attr_mask & IB_QP_PKEY_INDEX) {
> + struct ipath_ibdev *dev = to_idev(ibqp->device);
> +
> + if (attr->pkey_index >= ipath_layer_get_npkeys(dev->ib_unit))
> + goto inval;
> + qp->s_pkey_index = attr->pkey_index;
> + }
> +
> + if (attr_mask & IB_QP_DEST_QPN)
> + qp->remote_qpn = attr->dest_qp_num;
> +
> + if (attr_mask & IB_QP_SQ_PSN) {
> + qp->s_next_psn = attr->sq_psn;
> + qp->s_last_psn = qp->s_next_psn - 1;
> + }
> +
> + if (attr_mask & IB_QP_RQ_PSN)
> + qp->r_psn = attr->rq_psn;
> +
> + if (attr_mask & IB_QP_ACCESS_FLAGS)
> + qp->qp_access_flags = attr->qp_access_flags;
> +
> + if (attr_mask & IB_QP_AV)
> + qp->remote_ah_attr = attr->ah_attr;
> +
> + if (attr_mask & IB_QP_PATH_MTU)
> + qp->path_mtu = attr->path_mtu;
> +
> + if (attr_mask & IB_QP_RETRY_CNT)
> + qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
> +
> + if (attr_mask & IB_QP_RNR_RETRY) {
> + qp->s_rnr_retry = attr->rnr_retry;
> + if (qp->s_rnr_retry > 7)
> + qp->s_rnr_retry = 7;
> + qp->s_rnr_retry_cnt = qp->s_rnr_retry;
> + }
> +
> + if (attr_mask & IB_QP_MIN_RNR_TIMER)
> + qp->s_min_rnr_timer = attr->min_rnr_timer & 0x1F;
> +
> + if (attr_mask & IB_QP_QKEY)
> + qp->qkey = attr->qkey;
> +
> + if (attr_mask & IB_QP_PKEY_INDEX)
> + qp->s_pkey_index = attr->pkey_index;
> +
> + qp->state = new_state;
> + spin_unlock(&qp->s_lock);
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> +
> + /*
> + * Try to move to ARMED if QP1 changed to the RTS state.
> + */
> + if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) {
> + struct ipath_ibdev *dev = to_idev(ibqp->device);
> +
> + /*
> + * Bounce the link even if it was active so the SM will
> + * reinitialize the SMA's state.
> + */
> + ipath_kset_linkstate((dev->ib_unit << 16) | IPATH_IB_LINKDOWN);
> + ipath_kset_linkstate((dev->ib_unit << 16) | IPATH_IB_LINKARM);
> + }
> + return 0;
> +
> +inval:
> + spin_unlock(&qp->s_lock);
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> + return -EINVAL;
> +}
> +
> +/*
> + * Compute the AETH (syndrome + MSN).
> + * The QP s_lock should be held.
> + */
> +static u32 ipath_compute_aeth(struct ipath_qp *qp)
> +{
> + u32 aeth = atomic_read(&qp->msn) & 0xFFFFFF;
> +
> + if (qp->s_nak_state) {
> + aeth |= qp->s_nak_state << 24;
> + } else if (qp->ibqp.srq) {
> + /* Shared receive queues don't generate credits. */
> + aeth |= 0x1F << 24;
> + } else {
> + u32 min, max, x;
> + u32 credits;
> +
> + /*
> + * Compute the number of credits available (RWQEs).
> + * XXX Not holding the r_rq.lock here so there is a small
> + * chance that the pair of reads are not atomic.
> + */
> + credits = qp->r_rq.head - qp->r_rq.tail;
> + if ((int)credits < 0)
> + credits += qp->r_rq.size;
> + /* Binary search the credit table to find the code to use. */
> + min = 0;
> + max = 31;
> + for (;;) {
> + x = (min + max) / 2;
> + if (credit_table[x] == credits)
> + break;
> + if (credit_table[x] > credits)
> + max = x;
> + else if (min == x)
> + break;
> + else
> + min = x;
> + }
> + aeth |= x << 24;
> + }
> + return cpu_to_be32(aeth);
> +}
> +
> +
> +static void no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&dev->pending_lock, flags);
> + if (qp->piowait.next == LIST_POISON1)
> + list_add_tail(&qp->piowait, &dev->piowait);
> + spin_unlock_irqrestore(&dev->pending_lock, flags);
> + /*
> + * Note that as soon as ipath_layer_want_buffer() is called and
> + * possibly before it returns, ipath_ib_piobufavail()
> + * could be called. If we are still in the tasklet function,
> + * tasklet_schedule() will not call us until the next time
> + * tasklet_schedule() is called.
> + * We clear the tasklet flag now since we are committing to return
> + * from the tasklet function.
> + */
> + tasklet_unlock(&qp->s_task);
> + ipath_layer_want_buffer(dev->ib_unit);
> + dev->n_piowait++;
> +}
> +
> +/*
> + * Process entries in the send work queue until the queue is exhausted.
> + * Only allow one CPU to send a packet per QP (tasklet).
> + * Otherwise, after we drop the QP lock, two threads could send
> + * packets out of order.
> + * This is similar to do_rc_send() below except we don't have timeouts or
> + * resends.
> + */
> +static void do_uc_send(unsigned long data)
> +{
> + struct ipath_qp *qp = (struct ipath_qp *)data;
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + struct ipath_swqe *wqe;
> + unsigned long flags;
> + u16 lrh0;
> + u32 hwords;
> + u32 nwords;
> + u32 extra_bytes;
> + u32 bth0;
> + u32 bth2;
> + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
> + u32 len;
> + struct ipath_other_headers *ohdr;
> + struct ib_wc wc;
> +
> + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
> + return;
> +
> + if (unlikely(qp->remote_ah_attr.dlid ==
> + ipath_layer_get_lid(dev->ib_unit))) {
> + /* Pass in an uninitialized ib_wc to save stack space. */
> + ipath_ruc_loopback(qp, &wc);
> + clear_bit(IPATH_S_BUSY, &qp->s_flags);
> + return;
> + }
> +
> + ohdr = &qp->s_hdr.u.oth;
> + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
> + ohdr = &qp->s_hdr.u.l.oth;
> +
> +again:
> + /* Check for a constructed packet to be sent. */
> + if (qp->s_hdrwords != 0) {
> + /*
> + * If no PIO bufs are available, return.
> + * An interrupt will call ipath_ib_piobufavail()
> + * when one is available.
> + */
> + if (ipath_verbs_send(dev->ib_unit, qp->s_hdrwords,
> + (uint32_t *) &qp->s_hdr,
> + qp->s_cur_size, qp->s_cur_sge)) {
> + no_bufs_available(qp, dev);
> + return;
> + }
> + /* Record that we sent the packet and s_hdr is empty. */
> + qp->s_hdrwords = 0;
> + }
> +
> + lrh0 = IPS_LRH_BTH;
> + /* header size in 32-bit words LRH+BTH = (8+12)/4. */
> + hwords = 5;
> +
> + /*
> + * The lock is needed to synchronize between
> + * setting qp->s_ack_state and post_send().
> + */
> + spin_lock_irqsave(&qp->s_lock, flags);
> +
> + if (!(state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
> + goto done;
> +
> + bth0 = ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
> +
> + /* Send a request. */
> + wqe = get_swqe_ptr(qp, qp->s_last);
> + switch (qp->s_state) {
> + default:
> + /* Signal the completion of the last send (if there is one). */
> + if (qp->s_last != qp->s_tail) {
> + if (++qp->s_last == qp->s_size)
> + qp->s_last = 0;
> + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
> + (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
> + wc.wr_id = wqe->wr.wr_id;
> + wc.status = IB_WC_SUCCESS;
> + wc.opcode = wc_opcode[wqe->wr.opcode];
> + wc.vendor_err = 0;
> + wc.byte_len = wqe->length;
> + wc.qp_num = qp->ibqp.qp_num;
> + wc.src_qp = qp->remote_qpn;
> + wc.pkey_index = 0;
> + wc.slid = qp->remote_ah_attr.dlid;
> + wc.sl = qp->remote_ah_attr.sl;
> + wc.dlid_path_bits = 0;
> + wc.port_num = 0;
> + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
> + 0);
> + }
> + wqe = get_swqe_ptr(qp, qp->s_last);
> + }
> + /* Check if send work queue is empty. */
> + if (qp->s_tail == qp->s_head)
> + goto done;
> + /*
> + * Start a new request.
> + */
> + qp->s_psn = wqe->psn = qp->s_next_psn;
> + qp->s_sge.sge = wqe->sg_list[0];
> + qp->s_sge.sg_list = wqe->sg_list + 1;
> + qp->s_sge.num_sge = wqe->wr.num_sge;
> + qp->s_len = len = wqe->length;
> + switch (wqe->wr.opcode) {
> + case IB_WR_SEND:
> + case IB_WR_SEND_WITH_IMM:
> + if (len > pmtu) {
> + qp->s_state = IB_OPCODE_UC_SEND_FIRST;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_SEND) {
> + qp->s_state = IB_OPCODE_UC_SEND_ONLY;
> + } else {
> + qp->s_state =
> + IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + }
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + break;
> +
> + case IB_WR_RDMA_WRITE:
> + case IB_WR_RDMA_WRITE_WITH_IMM:
> + ohdr->u.rc.reth.vaddr =
> + cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
> + ohdr->u.rc.reth.rkey =
> + cpu_to_be32(wqe->wr.wr.rdma.rkey);
> + ohdr->u.rc.reth.length = cpu_to_be32(len);
> + hwords += sizeof(struct ib_reth) / 4;
> + if (len > pmtu) {
> + qp->s_state = IB_OPCODE_UC_RDMA_WRITE_FIRST;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
> + qp->s_state = IB_OPCODE_UC_RDMA_WRITE_ONLY;
> + } else {
> + qp->s_state =
> + IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
> + /* Immediate data comes after the RETH */
> + ohdr->u.rc.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + }
> + break;
> +
> + default:
> + goto done;
> + }
> + if (++qp->s_tail >= qp->s_size)
> + qp->s_tail = 0;
> + break;
> +
> + case IB_OPCODE_UC_SEND_FIRST:
> + qp->s_state = IB_OPCODE_UC_SEND_MIDDLE;
> + /* FALLTHROUGH */
> + case IB_OPCODE_UC_SEND_MIDDLE:
> + len = qp->s_len;
> + if (len > pmtu) {
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_SEND)
> + qp->s_state = IB_OPCODE_UC_SEND_LAST;
> + else {
> + qp->s_state = IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + }
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + break;
> +
> + case IB_OPCODE_UC_RDMA_WRITE_FIRST:
> + qp->s_state = IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
> + /* FALLTHROUGH */
> + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
> + len = qp->s_len;
> + if (len > pmtu) {
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
> + qp->s_state = IB_OPCODE_UC_RDMA_WRITE_LAST;
> + else {
> + qp->s_state =
> + IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + }
> + break;
> + }
> + bth2 = qp->s_next_psn++ & 0xFFFFFF;
> + qp->s_len -= len;
> + bth0 |= qp->s_state << 24;
> +
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> +
> + /* Construct the header. */
> + extra_bytes = (4 - len) & 3;
> + nwords = (len + extra_bytes) >> 2;
> + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
> + /* Header size in 32-bit words. */
> + hwords += 10;
> + lrh0 = IPS_LRH_GRH;
> + qp->s_hdr.u.l.grh.version_tclass_flow =
> + cpu_to_be32((6 << 28) |
> + (qp->remote_ah_attr.grh.traffic_class << 20) |
> + qp->remote_ah_attr.grh.flow_label);
> + qp->s_hdr.u.l.grh.paylen =
> + cpu_to_be16(((hwords - 12) + nwords + SIZE_OF_CRC) << 2);
> + qp->s_hdr.u.l.grh.next_hdr = 0x1B;
> + qp->s_hdr.u.l.grh.hop_limit = qp->remote_ah_attr.grh.hop_limit;
> + /* The SGID is 32-bit aligned. */
> + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
> + qp->s_hdr.u.l.grh.sgid.global.interface_id =
> + ipath_layer_get_guid(dev->ib_unit);
> + qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
> + }
> + qp->s_hdrwords = hwords;
> + qp->s_cur_sge = &qp->s_sge;
> + qp->s_cur_size = len;
> + lrh0 |= qp->remote_ah_attr.sl << 4;
> + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
> + /* DEST LID */
> + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
> + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
> + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->ib_unit));
> + bth0 |= extra_bytes << 20;
> + ohdr->bth[0] = cpu_to_be32(bth0);
> + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
> + ohdr->bth[2] = cpu_to_be32(bth2);
> +
> + /* Check for more work to do. */
> + goto again;
> +
> +done:
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> + clear_bit(IPATH_S_BUSY, &qp->s_flags);
> +}
> +
> +/*
> + * Process entries in the send work queue until credit or queue is exhausted.
> + * Only allow one CPU to send a packet per QP (tasklet).
> + * Otherwise, after we drop the QP s_lock, two threads could send
> + * packets out of order.
> + */
> +static void do_rc_send(unsigned long data)
> +{
> + struct ipath_qp *qp = (struct ipath_qp *)data;
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + struct ipath_swqe *wqe;
> + struct ipath_sge_state *ss;
> + unsigned long flags;
> + u16 lrh0;
> + u32 hwords;
> + u32 nwords;
> + u32 extra_bytes;
> + u32 bth0;
> + u32 bth2;
> + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
> + u32 len;
> + struct ipath_other_headers *ohdr;
> + char newreq;
> +
> + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
> + return;
> +
> + if (unlikely(qp->remote_ah_attr.dlid ==
> + ipath_layer_get_lid(dev->ib_unit))) {
> + struct ib_wc wc;
> +
> + /*
> + * Pass in an uninitialized ib_wc to be consistent with
> + * other places where ipath_ruc_loopback() is called.
> + */
> + ipath_ruc_loopback(qp, &wc);
> + clear_bit(IPATH_S_BUSY, &qp->s_flags);
> + return;
> + }
> +
> + ohdr = &qp->s_hdr.u.oth;
> + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
> + ohdr = &qp->s_hdr.u.l.oth;
> +
> +again:
> + /* Check for a constructed packet to be sent. */
> + if (qp->s_hdrwords != 0) {
> + /*
> + * If no PIO bufs are available, return.
> + * An interrupt will call ipath_ib_piobufavail()
> + * when one is available.
> + */
> + if (ipath_verbs_send(dev->ib_unit, qp->s_hdrwords,
> + (uint32_t *) &qp->s_hdr,
> + qp->s_cur_size, qp->s_cur_sge)) {
> + no_bufs_available(qp, dev);
> + return;
> + }
> + /* Record that we sent the packet and s_hdr is empty. */
> + qp->s_hdrwords = 0;
> + }
> +
> + lrh0 = IPS_LRH_BTH;
> + /* header size in 32-bit words LRH+BTH = (8+12)/4. */
> + hwords = 5;
> +
> + /*
> + * The lock is needed to synchronize between
> + * setting qp->s_ack_state, resend timer, and post_send().
> + */
> + spin_lock_irqsave(&qp->s_lock, flags);
> +
> + bth0 = ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
> +
> + /* Sending responses has higher priority over sending requests. */
> + if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) {
> + /*
> + * Send a response.
> + * Note that we are in the responder's side of the QP context.
> + */
> + switch (qp->s_ack_state) {
> + case IB_OPCODE_RC_RDMA_READ_REQUEST:
> + ss = &qp->s_rdma_sge;
> + len = qp->s_rdma_len;
> + if (len > pmtu) {
> + len = pmtu;
> + qp->s_ack_state =
> + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
> + } else {
> + qp->s_ack_state =
> + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
> + }
> + qp->s_rdma_len -= len;
> + bth0 |= qp->s_ack_state << 24;
> + ohdr->u.aeth = ipath_compute_aeth(qp);
> + hwords++;
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
> + qp->s_ack_state =
> + IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
> + /* FALLTHROUGH */
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
> + ss = &qp->s_rdma_sge;
> + len = qp->s_rdma_len;
> + if (len > pmtu) {
> + len = pmtu;
> + } else {
> + ohdr->u.aeth = ipath_compute_aeth(qp);
> + hwords++;
> + qp->s_ack_state =
> + IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
> + }
> + qp->s_rdma_len -= len;
> + bth0 |= qp->s_ack_state << 24;
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
> + /*
> + * We have to prevent new requests from changing
> + * the r_sge state while a ipath_verbs_send()
> + * is in progress.
> + * Changing r_state allows the receiver
> + * to continue processing new packets.
> + * We do it here now instead of above so
> + * that we are sure the packet was sent before
> + * changing the state.
> + */
> + qp->r_state = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> + goto send_req;
> +
> + case IB_OPCODE_RC_COMPARE_SWAP:
> + case IB_OPCODE_RC_FETCH_ADD:
> + ss = NULL;
> + len = 0;
> + qp->r_state = IB_OPCODE_RC_SEND_LAST;
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> + bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
> + ohdr->u.at.aeth = ipath_compute_aeth(qp);
> + ohdr->u.at.atomic_ack_eth =
> + cpu_to_be64(qp->s_ack_atomic);
> + hwords += sizeof(ohdr->u.at) / 4;
> + break;
> +
> + default:
> + /* Send a regular ACK. */
> + ss = NULL;
> + len = 0;
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> + bth0 |= qp->s_ack_state << 24;
> + ohdr->u.aeth = ipath_compute_aeth(qp);
> + hwords++;
> + }
> + bth2 = qp->s_ack_psn++ & 0xFFFFFF;
> + } else {
> + send_req:
> + if (!(state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
> + qp->s_rnr_timeout)
> + goto done;
> +
> + /* Send a request. */
> + wqe = get_swqe_ptr(qp, qp->s_cur);
> + switch (qp->s_state) {
> + default:
> + /*
> + * Resend an old request or start a new one.
> + *
> + * We keep track of the current SWQE so that
> + * we don't reset the "furthest progress" state
> + * if we need to back up.
> + */
> + newreq = 0;
> + if (qp->s_cur == qp->s_tail) {
> + /* Check if send work queue is empty. */
> + if (qp->s_tail == qp->s_head)
> + goto done;
> + qp->s_psn = wqe->psn = qp->s_next_psn;
> + newreq = 1;
> + }
> + /*
> + * Note that we have to be careful not to modify the
> + * original work request since we may need to resend
> + * it.
> + */
> + qp->s_sge.sge = wqe->sg_list[0];
> + qp->s_sge.sg_list = wqe->sg_list + 1;
> + qp->s_sge.num_sge = wqe->wr.num_sge;
> + qp->s_len = len = wqe->length;
> + ss = &qp->s_sge;
> + bth2 = 0;
> + switch (wqe->wr.opcode) {
> + case IB_WR_SEND:
> + case IB_WR_SEND_WITH_IMM:
> + /* If no credit, return. */
> + if (qp->s_lsn != (u32) -1 &&
> + cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
> + goto done;
> + }
> + wqe->lpsn = wqe->psn;
> + if (len > pmtu) {
> + wqe->lpsn += (len - 1) / pmtu;
> + qp->s_state = IB_OPCODE_RC_SEND_FIRST;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_SEND) {
> + qp->s_state = IB_OPCODE_RC_SEND_ONLY;
> + } else {
> + qp->s_state =
> + IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + }
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + bth2 = 1 << 31; /* Request ACK. */
> + if (++qp->s_cur == qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_WR_RDMA_WRITE:
> + if (newreq)
> + qp->s_lsn++;
> + /* FALLTHROUGH */
> + case IB_WR_RDMA_WRITE_WITH_IMM:
> + /* If no credit, return. */
> + if (qp->s_lsn != (u32) -1 &&
> + cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
> + goto done;
> + }
> + ohdr->u.rc.reth.vaddr =
> + cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
> + ohdr->u.rc.reth.rkey =
> + cpu_to_be32(wqe->wr.wr.rdma.rkey);
> + ohdr->u.rc.reth.length = cpu_to_be32(len);
> + hwords += sizeof(struct ib_reth) / 4;
> + wqe->lpsn = wqe->psn;
> + if (len > pmtu) {
> + wqe->lpsn += (len - 1) / pmtu;
> + qp->s_state =
> + IB_OPCODE_RC_RDMA_WRITE_FIRST;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
> + qp->s_state =
> + IB_OPCODE_RC_RDMA_WRITE_ONLY;
> + } else {
> + qp->s_state =
> + IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE;
> + /* Immediate data comes after RETH */
> + ohdr->u.rc.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + if (wqe->wr.
> + send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + }
> + bth2 = 1 << 31; /* Request ACK. */
> + if (++qp->s_cur == qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_WR_RDMA_READ:
> + ohdr->u.rc.reth.vaddr =
> + cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
> + ohdr->u.rc.reth.rkey =
> + cpu_to_be32(wqe->wr.wr.rdma.rkey);
> + ohdr->u.rc.reth.length = cpu_to_be32(len);
> + qp->s_state = IB_OPCODE_RC_RDMA_READ_REQUEST;
> + hwords += sizeof(ohdr->u.rc.reth) / 4;
> + if (newreq) {
> + qp->s_lsn++;
> + /*
> + * Adjust s_next_psn to count the
> + * expected number of responses.
> + */
> + if (len > pmtu)
> + qp->s_next_psn +=
> + (len - 1) / pmtu;
> + wqe->lpsn = qp->s_next_psn++;
> + }
> + ss = NULL;
> + len = 0;
> + if (++qp->s_cur == qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_WR_ATOMIC_CMP_AND_SWP:
> + case IB_WR_ATOMIC_FETCH_AND_ADD:
> + qp->s_state =
> + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ?
> + IB_OPCODE_RC_COMPARE_SWAP :
> + IB_OPCODE_RC_FETCH_ADD;
> + ohdr->u.atomic_eth.vaddr =
> + cpu_to_be64(wqe->wr.wr.atomic.remote_addr);
> + ohdr->u.atomic_eth.rkey =
> + cpu_to_be32(wqe->wr.wr.atomic.rkey);
> + ohdr->u.atomic_eth.swap_data =
> + cpu_to_be64(wqe->wr.wr.atomic.swap);
> + ohdr->u.atomic_eth.compare_data =
> + cpu_to_be64(wqe->wr.wr.atomic.compare_add);
> + hwords += sizeof(struct ib_atomic_eth) / 4;
> + if (newreq) {
> + qp->s_lsn++;
> + wqe->lpsn = wqe->psn;
> + }
> + if (++qp->s_cur == qp->s_size)
> + qp->s_cur = 0;
> + ss = NULL;
> + len = 0;
> + break;
> +
> + default:
> + goto done;
> + }
> + if (newreq) {
> + if (++qp->s_tail >= qp->s_size)
> + qp->s_tail = 0;
> + }
> + bth2 |= qp->s_psn++ & 0xFFFFFF;
> + if ((int)(qp->s_psn - qp->s_next_psn) > 0)
> + qp->s_next_psn = qp->s_psn;
> + spin_lock(&dev->pending_lock);
> + if (qp->timerwait.next == LIST_POISON1) {
> + list_add_tail(&qp->timerwait,
> + &dev->pending[dev->
> + pending_index]);
> + }
> + spin_unlock(&dev->pending_lock);
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
> + /*
> + * This case can only happen if a send is
> + * restarted. See ipath_restart_rc().
> + */
> + ipath_init_restart(qp, wqe);
> + /* FALLTHROUGH */
> + case IB_OPCODE_RC_SEND_FIRST:
> + qp->s_state = IB_OPCODE_RC_SEND_MIDDLE;
> + /* FALLTHROUGH */
> + case IB_OPCODE_RC_SEND_MIDDLE:
> + bth2 = qp->s_psn++ & 0xFFFFFF;
> + if ((int)(qp->s_psn - qp->s_next_psn) > 0)
> + qp->s_next_psn = qp->s_psn;
> + ss = &qp->s_sge;
> + len = qp->s_len;
> + if (len > pmtu) {
> + /*
> + * Request an ACK every 1/2 MB to avoid
> + * retransmit timeouts.
> + */
> + if (((wqe->length - len) % (512 * 1024)) == 0)
> + bth2 |= 1 << 31;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_SEND)
> + qp->s_state = IB_OPCODE_RC_SEND_LAST;
> + else {
> + qp->s_state =
> + IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + }
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + bth2 |= 1 << 31; /* Request ACK. */
> + if (++qp->s_cur >= qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
> + /*
> + * This case can only happen if a RDMA write is
> + * restarted. See ipath_restart_rc().
> + */
> + ipath_init_restart(qp, wqe);
> + /* FALLTHROUGH */
> + case IB_OPCODE_RC_RDMA_WRITE_FIRST:
> + qp->s_state = IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
> + /* FALLTHROUGH */
> + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
> + bth2 = qp->s_psn++ & 0xFFFFFF;
> + if ((int)(qp->s_psn - qp->s_next_psn) > 0)
> + qp->s_next_psn = qp->s_psn;
> + ss = &qp->s_sge;
> + len = qp->s_len;
> + if (len > pmtu) {
> + /*
> + * Request an ACK every 1/2 MB to avoid
> + * retransmit timeouts.
> + */
> + if (((wqe->length - len) % (512 * 1024)) == 0)
> + bth2 |= 1 << 31;
> + len = pmtu;
> + break;
> + }
> + if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
> + qp->s_state = IB_OPCODE_RC_RDMA_WRITE_LAST;
> + else {
> + qp->s_state =
> + IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE;
> + /* Immediate data comes after the BTH */
> + ohdr->u.imm_data = wqe->wr.imm_data;
> + hwords += 1;
> + if (wqe->wr.send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + }
> + bth2 |= 1 << 31; /* Request ACK. */
> + if (++qp->s_cur >= qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
> + /*
> + * This case can only happen if a RDMA read is
> + * restarted. See ipath_restart_rc().
> + */
> + ipath_init_restart(qp, wqe);
> + len = ((qp->s_psn - wqe->psn) & 0xFFFFFF) * pmtu;
> + ohdr->u.rc.reth.vaddr =
> + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
> + ohdr->u.rc.reth.rkey =
> + cpu_to_be32(wqe->wr.wr.rdma.rkey);
> + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
> + qp->s_state = IB_OPCODE_RC_RDMA_READ_REQUEST;
> + hwords += sizeof(ohdr->u.rc.reth) / 4;
> + bth2 = qp->s_psn++ & 0xFFFFFF;
> + if ((int)(qp->s_psn - qp->s_next_psn) > 0)
> + qp->s_next_psn = qp->s_psn;
> + ss = NULL;
> + len = 0;
> + if (++qp->s_cur == qp->s_size)
> + qp->s_cur = 0;
> + break;
> +
> + case IB_OPCODE_RC_RDMA_READ_REQUEST:
> + case IB_OPCODE_RC_COMPARE_SWAP:
> + case IB_OPCODE_RC_FETCH_ADD:
> + /*
> + * We shouldn't start anything new until this request
> + * is finished. The ACK will handle rescheduling us.
> + * XXX The number of outstanding ones is negotiated
> + * at connection setup time (see pg. 258,289)?
> + * XXX Also, if we support multiple outstanding
> + * requests, we need to check the WQE IB_SEND_FENCE
> + * flag and not send a new request if a RDMA read or
> + * atomic is pending.
> + */
> + goto done;
> + }
> + qp->s_len -= len;
> + bth0 |= qp->s_state << 24;
> + /* XXX queue resend timeout. */
> + }
> + /* Make sure it is non-zero before dropping the lock. */
> + qp->s_hdrwords = hwords;
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> +
> + /* Construct the header. */
> + extra_bytes = (4 - len) & 3;
> + nwords = (len + extra_bytes) >> 2;
> + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
> + /* Header size in 32-bit words. */
> + hwords += 10;
> + lrh0 = IPS_LRH_GRH;
> + qp->s_hdr.u.l.grh.version_tclass_flow =
> + cpu_to_be32((6 << 28) |
> + (qp->remote_ah_attr.grh.traffic_class << 20) |
> + qp->remote_ah_attr.grh.flow_label);
> + qp->s_hdr.u.l.grh.paylen =
> + cpu_to_be16(((hwords - 12) + nwords + SIZE_OF_CRC) << 2);
> + qp->s_hdr.u.l.grh.next_hdr = 0x1B;
> + qp->s_hdr.u.l.grh.hop_limit = qp->remote_ah_attr.grh.hop_limit;
> + /* The SGID is 32-bit aligned. */
> + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
> + qp->s_hdr.u.l.grh.sgid.global.interface_id =
> + ipath_layer_get_guid(dev->ib_unit);
> + qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
> + qp->s_hdrwords = hwords;
> + }
> + qp->s_cur_sge = ss;
> + qp->s_cur_size = len;
> + lrh0 |= qp->remote_ah_attr.sl << 4;
> + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
> + /* DEST LID */
> + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
> + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
> + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->ib_unit));
> + bth0 |= extra_bytes << 20;
> + ohdr->bth[0] = cpu_to_be32(bth0);
> + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
> + ohdr->bth[2] = cpu_to_be32(bth2);
> +
> + /* Check for more work to do. */
> + goto again;
> +
> +done:
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> + clear_bit(IPATH_S_BUSY, &qp->s_flags);
> +}
> +
> +static void send_rc_ack(struct ipath_qp *qp)
> +{
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + u16 lrh0;
> + u32 bth0;
> + u32 hwords;
> + struct ipath_other_headers *ohdr;
> +
> + /* Construct the header. */
> + ohdr = &qp->s_hdr.u.oth;
> + lrh0 = IPS_LRH_BTH;
> + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
> + hwords = 6;
> + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
> + ohdr = &qp->s_hdr.u.l.oth;
> + /* Header size in 32-bit words. */
> + hwords += 10;
> + lrh0 = IPS_LRH_GRH;
> + qp->s_hdr.u.l.grh.version_tclass_flow =
> + cpu_to_be32((6 << 28) |
> + (qp->remote_ah_attr.grh.traffic_class << 20) |
> + qp->remote_ah_attr.grh.flow_label);
> + qp->s_hdr.u.l.grh.paylen =
> + cpu_to_be16(((hwords - 12) + SIZE_OF_CRC) << 2);
> + qp->s_hdr.u.l.grh.next_hdr = 0x1B;
> + qp->s_hdr.u.l.grh.hop_limit = qp->remote_ah_attr.grh.hop_limit;
> + /* The SGID is 32-bit aligned. */
> + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
> + qp->s_hdr.u.l.grh.sgid.global.interface_id =
> + ipath_layer_get_guid(dev->ib_unit);
> + qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
> + }
> + bth0 = ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
> + ohdr->u.aeth = ipath_compute_aeth(qp);
> + if (qp->s_ack_state >= IB_OPCODE_RC_COMPARE_SWAP) {
> + bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
> + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
> + hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
> + } else {
> + bth0 |= IB_OPCODE_RC_ACKNOWLEDGE << 24;
> + }
> + lrh0 |= qp->remote_ah_attr.sl << 4;
> + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
> + /* DEST LID */
> + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
> + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
> + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->ib_unit));
> + ohdr->bth[0] = cpu_to_be32(bth0);
> + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
> + ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & 0xFFFFFF);
> +
> + /*
> + * If we can send the ACK, clear the ACK state.
> + */
> + if (ipath_verbs_send(dev->ib_unit, hwords, (uint32_t *) &qp->s_hdr,
> + 0, NULL) == 0) {
> + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
> + dev->n_rc_qacks++;
> + }
> +}
> +
> +/*
> + * Back up the requester to resend the last un-ACKed request.
> + * The QP s_lock should be held.
> + */
> +static void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
> +{
> + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
> + struct ipath_ibdev *dev;
> + u32 n;
> +
> + /*
> + * If there are no requests pending, we are done.
> + */
> + if (cmp24(psn, qp->s_next_psn) >= 0 || qp->s_last == qp->s_tail)
> + goto done;
> +
> + if (qp->s_retry == 0) {
> + wc->wr_id = wqe->wr.wr_id;
> + wc->status = IB_WC_RETRY_EXC_ERR;
> + wc->opcode = wc_opcode[wqe->wr.opcode];
> + wc->vendor_err = 0;
> + wc->byte_len = 0;
> + wc->qp_num = qp->ibqp.qp_num;
> + wc->src_qp = qp->remote_qpn;
> + wc->pkey_index = 0;
> + wc->slid = qp->remote_ah_attr.dlid;
> + wc->sl = qp->remote_ah_attr.sl;
> + wc->dlid_path_bits = 0;
> + wc->port_num = 0;
> + ipath_sqerror_qp(qp, wc);
> + return;
> + }
> + qp->s_retry--;
> +
> + /*
> + * Remove the QP from the timeout queue.
> + * Note: it may already have been removed by ipath_ib_timer().
> + */
> + dev = to_idev(qp->ibqp.device);
> + spin_lock(&dev->pending_lock);
> + if (qp->timerwait.next != LIST_POISON1)
> + list_del(&qp->timerwait);
> + spin_unlock(&dev->pending_lock);
> +
> + if (wqe->wr.opcode == IB_WR_RDMA_READ)
> + dev->n_rc_resends++;
> + else
> + dev->n_rc_resends += (int)qp->s_psn - (int)psn;
> +
> + /*
> + * If we are starting the request from the beginning, let the
> + * normal send code handle initialization.
> + */
> + qp->s_cur = qp->s_last;
> + if (cmp24(psn, wqe->psn) <= 0) {
> + qp->s_state = IB_OPCODE_RC_SEND_LAST;
> + qp->s_psn = wqe->psn;
> + } else {
> + n = qp->s_cur;
> + for (;;) {
> + if (++n == qp->s_size)
> + n = 0;
> + if (n == qp->s_tail) {
> + if (cmp24(psn, qp->s_next_psn) >= 0) {
> + qp->s_cur = n;
> + wqe = get_swqe_ptr(qp, n);
> + }
> + break;
> + }
> + wqe = get_swqe_ptr(qp, n);
> + if (cmp24(psn, wqe->psn) < 0)
> + break;
> + qp->s_cur = n;
> + }
> + qp->s_psn = psn;
> +
> + /*
> + * Reset the state to restart in the middle of a request.
> + * Don't change the s_sge, s_cur_sge, or s_cur_size.
> + * See do_rc_send().
> + */
> + switch (wqe->wr.opcode) {
> + case IB_WR_SEND:
> + case IB_WR_SEND_WITH_IMM:
> + qp->s_state = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
> + break;
> +
> + case IB_WR_RDMA_WRITE:
> + case IB_WR_RDMA_WRITE_WITH_IMM:
> + qp->s_state = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
> + break;
> +
> + case IB_WR_RDMA_READ:
> + qp->s_state = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
> + break;
> +
> + default:
> + /*
> + * This case shouldn't happen since its only
> + * one PSN per req.
> + */
> + qp->s_state = IB_OPCODE_RC_SEND_LAST;
> + }
> + }
> +
> +done:
> + tasklet_schedule(&qp->s_task);
> +}
> +
> +/*
> + * Handle RC and UC post sends.
> + */
> +static int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr)
> +{
> + struct ipath_swqe *wqe;
> + unsigned long flags;
> + u32 next;
> + int i, j;
> + int acc;
> +
> + /*
> + * Don't allow RDMA reads or atomic operations on UC or
> + * undefined operations.
> + * Make sure buffer is large enough to hold the result for atomics.
> + */
> + if (qp->ibqp.qp_type == IB_QPT_UC) {
> + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
> + return -EINVAL;
> + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
> + return -EINVAL;
> + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
> + (wr->num_sge == 0 || wr->sg_list[0].length < sizeof(u64) ||
> + wr->sg_list[0].addr & 0x7))
> + return -EINVAL;
> +
> + /* IB spec says that num_sge == 0 is OK. */
> + if (wr->num_sge > qp->s_max_sge)
> + return -ENOMEM;
> +
> + spin_lock_irqsave(&qp->s_lock, flags);
> + next = qp->s_head + 1;
> + if (next >= qp->s_size)
> + next = 0;
> + if (next == qp->s_last) {
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> + return -EINVAL;
> + }
> +
> + wqe = get_swqe_ptr(qp, qp->s_head);
> + wqe->wr = *wr;
> + wqe->ssn = qp->s_ssn++;
> + wqe->sg_list[0].mr = NULL;
> + wqe->sg_list[0].vaddr = NULL;
> + wqe->sg_list[0].length = 0;
> + wqe->sg_list[0].sge_length = 0;
> + wqe->length = 0;
> + acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0;
> + for (i = 0, j = 0; i < wr->num_sge; i++) {
> + if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> + return -EINVAL;
> + }
> + if (wr->sg_list[i].length == 0)
> + continue;
> + if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table,
> + &wqe->sg_list[j], &wr->sg_list[i], acc)) {
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> + return -EINVAL;
> + }
> + wqe->length += wr->sg_list[i].length;
> + j++;
> + }
> + wqe->wr.num_sge = j;
> + qp->s_head = next;
> + /*
> + * Wake up the send tasklet if the QP is not waiting
> + * for an RNR timeout.
> + */
> + next = qp->s_rnr_timeout;
> + spin_unlock_irqrestore(&qp->s_lock, flags);
> +
> + if (next == 0) {
> + if (qp->ibqp.qp_type == IB_QPT_UC)
> + do_uc_send((unsigned long) qp);
> + else
> + do_rc_send((unsigned long) qp);
> + }
> + return 0;
> +}
> +
> +/*
> + * Note that we actually send the data as it is posted instead of putting
> + * the request into a ring buffer. If we wanted to use a ring buffer,
> + * we would need to save a reference to the destination address in the SWQE.
> + */
> +static int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
> +{
> + struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
> + struct ipath_other_headers *ohdr;
> + struct ib_ah_attr *ah_attr;
> + struct ipath_sge_state ss;
> + struct ipath_sge *sg_list;
> + struct ib_wc wc;
> + u32 hwords;
> + u32 nwords;
> + u32 len;
> + u32 extra_bytes;
> + u32 bth0;
> + u16 lrh0;
> + u16 lid;
> + int i;
> +
> + if (!(state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
> + return 0;
> +
> + /* IB spec says that num_sge == 0 is OK. */
> + if (wr->num_sge > qp->s_max_sge)
> + return -EINVAL;
> +
> + if (wr->num_sge > 1) {
> + sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list),
> + GFP_ATOMIC);
> + if (!sg_list)
> + return -ENOMEM;
> + } else
> + sg_list = NULL;
> +
> + /* Check the buffer to send. */
> + ss.sg_list = sg_list;
> + ss.sge.mr = NULL;
> + ss.sge.vaddr = NULL;
> + ss.sge.length = 0;
> + ss.sge.sge_length = 0;
> + ss.num_sge = 0;
> + len = 0;
> + for (i = 0; i < wr->num_sge; i++) {
> + /* Check LKEY */
> + if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0)
> + return -EINVAL;
> +
> + if (wr->sg_list[i].length == 0)
> + continue;
> + if (!ipath_lkey_ok(&dev->lk_table, ss.num_sge ?
> + sg_list + ss.num_sge : &ss.sge,
> + &wr->sg_list[i], 0)) {
> + return -EINVAL;
> + }
> + len += wr->sg_list[i].length;
> + ss.num_sge++;
> + }
> + extra_bytes = (4 - len) & 3;
> + nwords = (len + extra_bytes) >> 2;
> +
> + /* Construct the header. */
> + ah_attr = &to_iah(wr->wr.ud.ah)->attr;
> + if (ah_attr->dlid >= 0xC000 && ah_attr->dlid < 0xFFFF)
> + dev->n_multicast_xmit++;
> + if (unlikely(ah_attr->dlid == ipath_layer_get_lid(dev->ib_unit))) {
> + /* Pass in an uninitialized ib_wc to save stack space. */
> + ipath_ud_loopback(qp, &ss, len, wr, &wc);
> + goto done;
> + }
> + if (ah_attr->ah_flags & IB_AH_GRH) {
> + /* Header size in 32-bit words. */
> + hwords = 17;
> + lrh0 = IPS_LRH_GRH;
> + ohdr = &qp->s_hdr.u.l.oth;
> + qp->s_hdr.u.l.grh.version_tclass_flow =
> + cpu_to_be32((6 << 28) |
> + (ah_attr->grh.traffic_class << 20) |
> + ah_attr->grh.flow_label);
> + qp->s_hdr.u.l.grh.paylen =
> + cpu_to_be16(((wr->opcode ==
> + IB_WR_SEND_WITH_IMM ? 6 : 5) + nwords +
> + SIZE_OF_CRC) << 2);
> + qp->s_hdr.u.l.grh.next_hdr = 0x1B;
> + qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit;
> + /* The SGID is 32-bit aligned. */
> + qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
> + qp->s_hdr.u.l.grh.sgid.global.interface_id =
> + ipath_layer_get_guid(dev->ib_unit);
> + qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid;
> + /*
> + * Don't worry about sending to locally attached
> + * multicast QPs. It is unspecified by the spec. what happens.
> + */
> + } else {
> + /* Header size in 32-bit words. */
> + hwords = 7;
> + lrh0 = IPS_LRH_BTH;
> + ohdr = &qp->s_hdr.u.oth;
> + }
> + if (wr->opcode == IB_WR_SEND_WITH_IMM) {
> + ohdr->u.ud.imm_data = wr->imm_data;
> + wc.imm_data = wr->imm_data;
> + hwords += 1;
> + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
> + } else if (wr->opcode == IB_WR_SEND) {
> + wc.imm_data = 0;
> + bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
> + } else
> + return -EINVAL;
> + lrh0 |= ah_attr->sl << 4;
> + if (qp->ibqp.qp_type == IB_QPT_SMI)
> + lrh0 |= 0xF000; /* Set VL */
> + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
> + qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
> + qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
> + lid = ipath_layer_get_lid(dev->ib_unit);
> + qp->s_hdr.lrh[3] = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
> + if (wr->send_flags & IB_SEND_SOLICITED)
> + bth0 |= 1 << 23;
> + bth0 |= extra_bytes << 20;
> + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPS_DEFAULT_P_KEY :
> + ipath_layer_get_pkey(dev->ib_unit, qp->s_pkey_index);
> + ohdr->bth[0] = cpu_to_be32(bth0);
> + ohdr->bth[1] = cpu_to_be32(wr->wr.ud.remote_qpn);
> + /* XXX Could lose a PSN count but not worth locking */
> + ohdr->bth[2] = cpu_to_be32(qp->s_psn++ & 0xFFFFFF);
> + /*
> + * Qkeys with the high order bit set mean use the
> + * qkey from the QP context instead of the WR.
> + */
> + ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ?
> + qp->qkey : wr->wr.ud.remote_qkey);
> + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
> + if (ipath_verbs_send(dev->ib_unit, hwords, (uint32_t *) &qp->s_hdr,
> + len, &ss))
> + dev->n_no_piobuf++;
> +
> +done:
> + /* Queue the completion status entry. */
> + if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
> + (wr->send_flags & IB_SEND_SIGNALED)) {
> + wc.wr_id = wr->wr_id;
> + wc.status = IB_WC_SUCCESS;
> + wc.vendor_err = 0;
> + wc.opcode = IB_WC_SEND;
> + wc.byte_len = len;
> + wc.qp_num = qp->ibqp.qp_num;
> + wc.src_qp = 0;
> + wc.wc_flags = 0;
> + /* XXX initialize other fields? */
> + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
> + }
> + kfree(sg_list);
> +
> + return 0;
> +}
> +
> +/*
> + * This may be called from interrupt context.
> + */
> +static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
> + struct ib_send_wr **bad_wr)
> +{
> + struct ipath_qp *qp = to_iqp(ibqp);
> + int err = 0;
> +
> + /* Check that state is OK to post send. */
> + if (!(state_ops[qp->state] & IPATH_POST_SEND_OK)) {
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> +
> + for (; wr; wr = wr->next) {
> + switch (qp->ibqp.qp_type) {
> + case IB_QPT_UC:
> + case IB_QPT_RC:
> + err = ipath_post_rc_send(qp, wr);
> + break;
> +
> + case IB_QPT_SMI:
> + case IB_QPT_GSI:
> + case IB_QPT_UD:
> + err = ipath_post_ud_send(qp, wr);
> + break;
> +
> + default:
> + err = -EINVAL;
> + }
> + if (err) {
> + *bad_wr = wr;
> + break;
> + }
> + }
> + return err;
> +}
> +
> +/*
> + * This may be called from interrupt context.
> + */
> +static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
> + struct ib_recv_wr **bad_wr)
> +{
> + struct ipath_qp *qp = to_iqp(ibqp);
> + unsigned long flags;
> +
> + /* Check that state is OK to post receive. */
> + if (!(state_ops[qp->state] & IPATH_POST_RECV_OK)) {
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> +
> + for (; wr; wr = wr->next) {
> + struct ipath_rwqe *wqe;
> + u32 next;
> + int i, j;
> +
> + if (wr->num_sge > qp->r_rq.max_sge) {
> + *bad_wr = wr;
> + return -ENOMEM;
> + }
> +
> + spin_lock_irqsave(&qp->r_rq.lock, flags);
> + next = qp->r_rq.head + 1;
> + if (next >= qp->r_rq.size)
> + next = 0;
> + if (next == qp->r_rq.tail) {
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> + *bad_wr = wr;
> + return -ENOMEM;
> + }
> +
> + wqe = get_rwqe_ptr(&qp->r_rq, qp->r_rq.head);
> + wqe->wr_id = wr->wr_id;
> + wqe->sg_list[0].mr = NULL;
> + wqe->sg_list[0].vaddr = NULL;
> + wqe->sg_list[0].length = 0;
> + wqe->sg_list[0].sge_length = 0;
> + wqe->length = 0;
> + for (i = 0, j = 0; i < wr->num_sge; i++) {
> + /* Check LKEY */
> + if (to_ipd(qp->ibqp.pd)->user &&
> + wr->sg_list[i].lkey == 0) {
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> + if (wr->sg_list[i].length == 0)
> + continue;
> + if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table,
> + &wqe->sg_list[j], &wr->sg_list[i],
> + IB_ACCESS_LOCAL_WRITE)) {
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> + wqe->length += wr->sg_list[i].length;
> + j++;
> + }
> + wqe->num_sge = j;
> + qp->r_rq.head = next;
> + spin_unlock_irqrestore(&qp->r_rq.lock, flags);
> + }
> + return 0;
> +}
> +
> +/*
> + * This may be called from interrupt context.
> + */
> +static int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
> + struct ib_recv_wr **bad_wr)
> +{
> + struct ipath_srq *srq = to_isrq(ibsrq);
> + struct ipath_ibdev *dev = to_idev(ibsrq->device);
> + unsigned long flags;
> +
> + for (; wr; wr = wr->next) {
> + struct ipath_rwqe *wqe;
> + u32 next;
> + int i, j;
> +
> + if (wr->num_sge > srq->rq.max_sge) {
> + *bad_wr = wr;
> + return -ENOMEM;
> + }
> +
> + spin_lock_irqsave(&srq->rq.lock, flags);
> + next = srq->rq.head + 1;
> + if (next >= srq->rq.size)
> + next = 0;
> + if (next == srq->rq.tail) {
> + spin_unlock_irqrestore(&srq->rq.lock, flags);
> + *bad_wr = wr;
> + return -ENOMEM;
> + }
> +
> + wqe = get_rwqe_ptr(&srq->rq, srq->rq.head);
> + wqe->wr_id = wr->wr_id;
> + wqe->sg_list[0].mr = NULL;
> + wqe->sg_list[0].vaddr = NULL;
> + wqe->sg_list[0].length = 0;
> + wqe->sg_list[0].sge_length = 0;
> + wqe->length = 0;
> + for (i = 0, j = 0; i < wr->num_sge; i++) {
> + /* Check LKEY */
> + if (to_ipd(srq->ibsrq.pd)->user &&
> + wr->sg_list[i].lkey == 0) {
> + spin_unlock_irqrestore(&srq->rq.lock, flags);
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> + if (wr->sg_list[i].length == 0)
> + continue;
> + if (!ipath_lkey_ok(&dev->lk_table,
> + &wqe->sg_list[j], &wr->sg_list[i],
> + IB_ACCESS_LOCAL_WRITE)) {
> + spin_unlock_irqrestore(&srq->rq.lock, flags);
> + *bad_wr = wr;
> + return -EINVAL;
> + }
> + wqe->length += wr->sg_list[i].length;
> + j++;
> + }
> + wqe->num_sge = j;
> + srq->rq.head = next;
> + spin_unlock_irqrestore(&srq->rq.lock, flags);
> + }
> + return 0;
> +}
> +
> +/*
> + * This is called from ipath_qp_rcv() to process an incomming UD packet
> + * for the given QP.
> + * Called at interrupt level.
> + */
> +static void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
> + int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
> +{
> + struct ipath_other_headers *ohdr;
> + int opcode;
> + u32 hdrsize;
> + u32 pad;
> + unsigned long flags;
> + struct ib_wc wc;
> + u32 qkey;
> + u32 src_qp;
> + struct ipath_rq *rq;
> + struct ipath_srq *srq;
> + struct ipath_rwqe *wqe;
> +
> + /* Check for GRH */
> + if (!has_grh) {
> + ohdr = &hdr->u.oth;
> + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */
> + qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
> + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
> + } else {
> + ohdr = &hdr->u.l.oth;
> + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
> + /*
> + * The header with GRH is 68 bytes and the
> + * core driver sets the eager header buffer
> + * size to 56 bytes so the last 12 bytes of
> + * the IB header is in the data buffer.
> + */
> + qkey = be32_to_cpu(((u32 *) data)[1]);
> + src_qp = be32_to_cpu(((u32 *) data)[2]);
> + data += 12;
> + }
> + src_qp &= 0xFFFFFF;
> +
> + /* Check that the qkey matches. */
> + if (unlikely(qkey != qp->qkey)) {
> + /* XXX OK to lose a count once in a while. */
> + dev->qkey_violations++;
> + dev->n_pkt_drops++;
> + return;
> + }
> +
> + /* Get the number of bytes the message was padded by. */
> + pad = (ohdr->bth[0] >> 12) & 3;
> + if (unlikely(tlen < (hdrsize + pad + 4))) {
> + /* Drop incomplete packets. */
> + dev->n_pkt_drops++;
> + return;
> + }
> +
> + /*
> + * A GRH is expected to preceed the data even if not
> + * present on the wire.
> + */
> + wc.byte_len = tlen - (hdrsize + pad + 4) + sizeof(struct ib_grh);
> +
> + /*
> + * The opcode is in the low byte when its in network order
> + * (top byte when in host order).
> + */
> + opcode = *(u8 *) (&ohdr->bth[0]);
> + if (opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
> + if (has_grh) {
> + wc.imm_data = *(u32 *) data;
> + data += sizeof(u32);
> + } else
> + wc.imm_data = ohdr->u.ud.imm_data;
> + wc.wc_flags = IB_WC_WITH_IMM;
> + hdrsize += sizeof(u32);
> + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
> + wc.imm_data = 0;
> + wc.wc_flags = 0;
> + } else {
> + dev->n_pkt_drops++;
> + return;
> + }
> +
> + /*
> + * Get the next work request entry to find where to put the data.
> + * Note that it is safe to drop the lock after changing rq->tail
> + * since ipath_post_receive() won't fill the empty slot.
> + */
> + if (qp->ibqp.srq) {
> + srq = to_isrq(qp->ibqp.srq);
> + rq = &srq->rq;
> + } else {
> + srq = NULL;
> + rq = &qp->r_rq;
> + }
> + spin_lock_irqsave(&rq->lock, flags);
> + if (rq->tail == rq->head) {
> + spin_unlock_irqrestore(&rq->lock, flags);
> + dev->n_pkt_drops++;
> + return;
> + }
> + /* Silently drop packets which are too big. */
> + wqe = get_rwqe_ptr(rq, rq->tail);
> + if (wc.byte_len > wqe->length) {
> + spin_unlock_irqrestore(&rq->lock, flags);
> + dev->n_pkt_drops++;
> + return;
> + }
> + wc.wr_id = wqe->wr_id;
> + qp->r_sge.sge = wqe->sg_list[0];
> + qp->r_sge.sg_list = wqe->sg_list + 1;
> + qp->r_sge.num_sge = wqe->num_sge;
> + if (++rq->tail >= rq->size)
> + rq->tail = 0;
> + if (srq && srq->ibsrq.event_handler) {
> + u32 n;
> +
> + if (rq->head < rq->tail)
> + n = rq->size + rq->head - rq->tail;
> + else
> + n = rq->head - rq->tail;
> + if (n < srq->limit) {
> + struct ib_event ev;
> +
> + srq->limit = 0;
> + spin_unlock_irqrestore(&rq->lock, flags);
> + ev.device = qp->ibqp.device;
> + ev.element.srq = qp->ibqp.srq;
> + ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
> + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
> + } else
> + spin_unlock_irqrestore(&rq->lock, flags);
> + } else
> + spin_unlock_irqrestore(&rq->lock, flags);
> + if (has_grh) {
> + copy_sge(&qp->r_sge, &hdr->u.l.grh, sizeof(struct ib_grh));
> + wc.wc_flags |= IB_WC_GRH;
> + } else
> + skip_sge(&qp->r_sge, sizeof(struct ib_grh));
> + copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh));
> + wc.status = IB_WC_SUCCESS;
> + wc.opcode = IB_WC_RECV;
> + wc.vendor_err = 0;
> + wc.qp_num = qp->ibqp.qp_num;
> + wc.src_qp = src_qp;
> + /* XXX do we know which pkey matched? Only needed for GSI. */
> + wc.pkey_index = 0;
> + wc.slid = be16_to_cpu(hdr->lrh[3]);
> + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
> + wc.dlid_path_bits = 0;
> + /* Signal completion event if the solicited bit is set. */
> + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
> + ohdr->bth[0] & __constant_cpu_to_be32(1 << 23));
> +}
> +
> +/*
> + * This is called from ipath_post_ud_send() to forward a WQE addressed
> + * to the same HCA.
> + */
> +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss,
> + u32 length, struct ib_send_wr *wr,
> + struct ib_wc *wc)
> +{
> + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
> + struct ipath_qp *qp;
> + struct ib_ah_attr *ah_attr;
> + unsigned long flags;
> + struct ipath_rq *rq;
> + struct ipath_srq *srq;
> + struct ipath_sge_state rsge;
> + struct ipath_sge *sge;
> + struct ipath_rwqe *wqe;
> +
> + qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn);
> + if (!qp)
> + return;
> +
> + /* Check that the qkey matches. */
> + if (unlikely(wr->wr.ud.remote_qkey != qp->qkey)) {
> + /* XXX OK to lose a count once in a while. */
> + dev->qkey_violations++;
> + dev->n_pkt_drops++;
> + goto done;
> + }
> +
> + /*
> + * A GRH is expected to preceed the data even if not
> + * present on the wire.
> + */
> + wc->byte_len = length + sizeof(struct ib_grh);
> +
> + if (wr->opcode == IB_WR_SEND_WITH_IMM) {
> + wc->wc_flags = IB_WC_WITH_IMM;
> + wc->imm_data = wr->imm_data;
> + } else {
> + wc->wc_flags = 0;
> + wc->imm_data = 0;
> + }
> +
> + /*
> + * Get the next work request entry to find where to put the data.
> + * Note that it is safe to drop the lock after changing rq->tail
> + * since ipath_post_receive() won't fill the empty slot.
> + */
> + if (qp->ibqp.srq) {
> + srq = to_isrq(qp->ibqp.srq);
> + rq = &srq->rq;
> + } else {
> + srq = NULL;
> + rq = &qp->r_rq;
> + }
> + spin_lock_irqsave(&rq->lock, flags);
> + if (rq->tail == rq->head) {
> + spin_unlock_irqrestore(&rq->lock, flags);
> + dev->n_pkt_drops++;
> + goto done;
> + }
> + /* Silently drop packets which are too big. */
> + wqe = get_rwqe_ptr(rq, rq->tail);
> + if (wc->byte_len > wqe->length) {
> + spin_unlock_irqrestore(&rq->lock, flags);
> + dev->n_pkt_drops++;
> + goto done;
> + }
> + wc->wr_id = wqe->wr_id;
> + rsge.sge = wqe->sg_list[0];
> + rsge.sg_list = wqe->sg_list + 1;
> + rsge.num_sge = wqe->num_sge;
> + if (++rq->tail >= rq->size)
> + rq->tail = 0;
> + if (srq && srq->ibsrq.event_handler) {
> + u32 n;
> +
> + if (rq->head < rq->tail)
> + n = rq->size + rq->head - rq->tail;
> + else
> + n = rq->head - rq->tail;
> + if (n < srq->limit) {
> + struct ib_event ev;
> +
> + srq->limit = 0;
> + spin_unlock_irqrestore(&rq->lock, flags);
> + ev.device = qp->ibqp.device;
> + ev.element.srq = qp->ibqp.srq;
> + ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
> + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
> + } else
> + spin_unlock_irqrestore(&rq->lock, flags);
> + } else
> + spin_unlock_irqrestore(&rq->lock, flags);
> + ah_attr = &to_iah(wr->wr.ud.ah)->attr;
> + if (ah_attr->ah_flags & IB_AH_GRH) {
> + copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
> + wc->wc_flags |= IB_WC_GRH;
> + } else
> + skip_sge(&rsge, sizeof(struct ib_grh));
> + sge = &ss->sge;
> + while (length) {
> + u32 len = sge->length;
> +
> + if (len > length)
> + len = length;
> + BUG_ON(len == 0);
> + copy_sge(&rsge, sge->vaddr, len);
> + sge->vaddr += len;
> + sge->length -= len;
> + sge->sge_length -= len;
> + if (sge->sge_length == 0) {
> + if (--ss->num_sge)
> + *sge = *ss->sg_list++;
> + } else if (sge->length == 0 && sge->mr != NULL) {
> + if (++sge->n >= IPATH_SEGSZ) {
> + if (++sge->m >= sge->mr->mapsz)
> + break;
> + sge->n = 0;
> + }
> + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
> + sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
> + }
> + length -= len;
> + }
> + wc->status = IB_WC_SUCCESS;
> + wc->opcode = IB_WC_RECV;
> + wc->vendor_err = 0;
> + wc->qp_num = qp->ibqp.qp_num;
> + wc->src_qp = sqp->ibqp.qp_num;
> + /* XXX do we know which pkey matched? Only needed for GSI. */
> + wc->pkey_index = 0;
> + wc->slid = ipath_layer_get_lid(dev->ib_unit);
> + wc->sl = ah_attr->sl;
> + wc->dlid_path_bits = 0;
> + /* Signal completion event if the solicited bit is set. */
> + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
> + wr->send_flags & IB_SEND_SOLICITED);
> +
> +done:
> + if (atomic_dec_and_test(&qp->refcount))
> + wake_up(&qp->wait);
> +}
> +
> +/*
> + * Copy the next RWQE into the QP's RWQE.
> + * Return zero if no RWQE is available.
> + * Called at interrupt level with the QP r_rq.lock held.
> + */
> +static int get_rwqe(struct ipath_qp *qp, int wr_id_only)
> +{
> + struct ipath_rq *rq;
> + struct ipath_srq *srq;
> + struct ipath_rwqe *wqe;
> +
> + if (!qp->ibqp.srq) {
> + rq = &qp->r_rq;
> + if (unlikely(rq->tail == rq->head))
> + return 0;
> + wqe = get_rwqe_ptr(rq, rq->tail);
> + qp->r_wr_id = wqe->wr_id;
> + if (!wr_id_only) {
> + qp->r_sge.sge = wqe->sg_list[0];
> + qp->r_sge.sg_list = wqe->sg_list + 1;
> + qp->r_sge.num_sge = wqe->num_sge;
> + qp->r_len = wqe->length;
> + }
> + if (++rq->tail >= rq->size)
> + rq->tail = 0;
> + return 1;
> + }
> +
> + srq = to_isrq(qp->ibqp.srq);
> + rq = &srq->rq;
> + spin_lock(&rq->lock);
> + if (unlikely(rq->tail == rq->head)) {
> + spin_unlock(&rq->lock);
> + return 0;
> + }
> + wqe = get_rwqe_ptr(rq, rq->tail);
> + qp->r_wr_id = wqe->wr_id;
> + if (!wr_id_only) {
> + qp->r_sge.sge = wqe->sg_list[0];
> + qp->r_sge.sg_list = wqe->sg_list + 1;
> + qp->r_sge.num_sge = wqe->num_sge;
> + qp->r_len = wqe->length;
> + }
> + if (++rq->tail >= rq->size)
> + rq->tail = 0;
> + if (srq->ibsrq.event_handler) {
> + struct ib_event ev;
> + u32 n;
> +
> + if (rq->head < rq->tail)
> + n = rq->size + rq->head - rq->tail;
> + else
> + n = rq->head - rq->tail;
> + if (n < srq->limit) {
> + srq->limit = 0;
> + spin_unlock(&rq->lock);
> + ev.device = qp->ibqp.device;
> + ev.element.srq = qp->ibqp.srq;
> + ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
> + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
> + } else
> + spin_unlock(&rq->lock);
> + } else
> + spin_unlock(&rq->lock);
> + return 1;
> +}
> --
> 0.99.9n
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>
>
More information about the general
mailing list