[ewg] [PATCH 1/1 OFED-1.4] IB/ehca: Add flush error CQE generation

Hoang-Nam Nguyen hnguyen at linux.vnet.ibm.com
Thu Sep 25 23:34:29 PDT 2008


IB/ehca: Add flush error cqe generation

Signed-off-by: Alexander Schmidt <alexs at linux.vnet.ibm.com>
---
 .../fixes/ehca_0100_flush_error_cqe.patch          |  748 ++++++++++++++++++++
 1 files changed, 748 insertions(+), 0 deletions(-)
 create mode 100644 kernel_patches/fixes/ehca_0100_flush_error_cqe.patch

diff --git a/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch b/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch
new file mode 100644
index 0000000..fff5418
--- /dev/null
+++ b/kernel_patches/fixes/ehca_0100_flush_error_cqe.patch
@@ -0,0 +1,748 @@
+When a QP goes into the error state, flush CQEs must be delivered to the
+application for any outstanding work requests. eHCA does not do this in
+hardware, so this patch adds software flush CQE generation to the ehca
+driver.
+
+Whenever a QP enters the error state, it is added to the QP error list of
+its respective CQ. If the error QP list of a CQ is not empty, poll_cq()
+generates flush CQEs before polling the actual CQ.
+
+Signed-off-by: Alexander Schmidt <alexs at linux.vnet.ibm.com>
+---
+Applies on top of 2.6.27-rc3, please consider this for 2.6.28.
+
+ drivers/infiniband/hw/ehca/ehca_classes.h |   14 +
+ drivers/infiniband/hw/ehca/ehca_cq.c      |    3 
+ drivers/infiniband/hw/ehca/ehca_iverbs.h  |    2 
+ drivers/infiniband/hw/ehca/ehca_qp.c      |  225 ++++++++++++++++++++++++++++--
+ drivers/infiniband/hw/ehca/ehca_reqs.c    |  211 ++++++++++++++++++++++++----
+ 5 files changed, 412 insertions(+), 43 deletions(-)
+
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_classes.h
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_classes.h
+@@ -164,6 +164,13 @@ struct ehca_qmap_entry {
+ 	u16 reported;
+ };
+ 
++struct ehca_queue_map {
++	struct ehca_qmap_entry *map;
++	unsigned int entries;
++	unsigned int tail;
++	unsigned int left_to_poll;
++};
++
+ struct ehca_qp {
+ 	union {
+ 		struct ib_qp ib_qp;
+@@ -173,8 +180,9 @@ struct ehca_qp {
+ 	enum ehca_ext_qp_type ext_type;
+ 	enum ib_qp_state state;
+ 	struct ipz_queue ipz_squeue;
+-	struct ehca_qmap_entry *sq_map;
++	struct ehca_queue_map sq_map;
+ 	struct ipz_queue ipz_rqueue;
++	struct ehca_queue_map rq_map;
+ 	struct h_galpas galpas;
+ 	u32 qkey;
+ 	u32 real_qp_num;
+@@ -204,6 +212,8 @@ struct ehca_qp {
+ 	atomic_t nr_events; /* events seen */
+ 	wait_queue_head_t wait_completion;
+ 	int mig_armed;
++	struct list_head sq_err_node;
++	struct list_head rq_err_node;
+ };
+ 
+ #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+@@ -233,6 +243,8 @@ struct ehca_cq {
+ 	/* mmap counter for resources mapped into user space */
+ 	u32 mm_count_queue;
+ 	u32 mm_count_galpa;
++	struct list_head sqp_err_list;
++	struct list_head rqp_err_list;
+ };
+ 
+ enum ehca_mr_flag {
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_reqs.c
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_reqs.c
+@@ -53,9 +53,25 @@
+ /* in RC traffic, insert an empty RDMA READ every this many packets */
+ #define ACK_CIRC_THRESHOLD 2000000
+ 
++static u64 replace_wr_id(u64 wr_id, u16 idx)
++{
++	u64 ret;
++
++	ret = wr_id & ~QMAP_IDX_MASK;
++	ret |= idx & QMAP_IDX_MASK;
++
++	return ret;
++}
++
++static u16 get_app_wr_id(u64 wr_id)
++{
++	return wr_id & QMAP_IDX_MASK;
++}
++
+ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+ 				  struct ehca_wqe *wqe_p,
+-				  struct ib_recv_wr *recv_wr)
++				  struct ib_recv_wr *recv_wr,
++				  u32 rq_map_idx)
+ {
+ 	u8 cnt_ds;
+ 	if (unlikely((recv_wr->num_sge < 0) ||
+@@ -69,7 +85,7 @@ static inline int ehca_write_rwqe(struct
+ 	/* clear wqe header until sglist */
+ 	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+ 
+-	wqe_p->work_request_id = recv_wr->wr_id;
++	wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+ 	wqe_p->nr_of_data_seg = recv_wr->num_sge;
+ 
+ 	for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+@@ -146,6 +162,7 @@ static inline int ehca_write_swqe(struct
+ 	u64 dma_length;
+ 	struct ehca_av *my_av;
+ 	u32 remote_qkey = send_wr->wr.ud.remote_qkey;
++	struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+ 
+ 	if (unlikely((send_wr->num_sge < 0) ||
+ 		     (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+@@ -158,11 +175,10 @@ static inline int ehca_write_swqe(struct
+ 	/* clear wqe header until sglist */
+ 	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+ 
+-	wqe_p->work_request_id = send_wr->wr_id & ~QMAP_IDX_MASK;
+-	wqe_p->work_request_id |= sq_map_idx & QMAP_IDX_MASK;
++	wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+ 
+-	qp->sq_map[sq_map_idx].app_wr_id = send_wr->wr_id & QMAP_IDX_MASK;
+-	qp->sq_map[sq_map_idx].reported = 0;
++	qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
++	qmap_entry->reported = 0;
+ 
+ 	switch (send_wr->opcode) {
+ 	case IB_WR_SEND:
+@@ -496,7 +512,9 @@ static int internal_post_recv(struct ehc
+ 	struct ehca_wqe *wqe_p;
+ 	int wqe_cnt = 0;
+ 	int ret = 0;
++	u32 rq_map_idx;
+ 	unsigned long flags;
++	struct ehca_qmap_entry *qmap_entry;
+ 
+ 	if (unlikely(!HAS_RQ(my_qp))) {
+ 		ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
+@@ -524,8 +542,15 @@ static int internal_post_recv(struct ehc
+ 			}
+ 			goto post_recv_exit0;
+ 		}
++		/*
++		 * Get the index of the WQE in the recv queue. The same index
++		 * is used for writing into the rq_map.
++		 */
++		rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
++
+ 		/* write a RECV WQE into the QUEUE */
+-		ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, cur_recv_wr);
++		ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, cur_recv_wr,
++				rq_map_idx);
+ 		/*
+ 		 * if something failed,
+ 		 * reset the free entry pointer to the start value
+@@ -540,6 +565,11 @@ static int internal_post_recv(struct ehc
+ 			}
+ 			goto post_recv_exit0;
+ 		}
++
++		qmap_entry = &my_qp->rq_map.map[rq_map_idx];
++		qmap_entry->app_wr_id = get_app_wr_id(cur_recv_wr->wr_id);
++		qmap_entry->reported = 0;
++
+ 		wqe_cnt++;
+ 	} /* eof for cur_recv_wr */
+ 
+@@ -596,10 +626,12 @@ static const u8 ib_wc_opcode[255] = {
+ /* internal function to poll one entry of cq */
+ static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+ {
+-	int ret = 0;
++	int ret = 0, qmap_tail_idx;
+ 	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ 	struct ehca_cqe *cqe;
+ 	struct ehca_qp *my_qp;
++	struct ehca_qmap_entry *qmap_entry;
++	struct ehca_queue_map *qmap;
+ 	int cqe_count = 0, is_error;
+ 
+ repoll:
+@@ -674,27 +706,52 @@ repoll:
+ 		goto repoll;
+ 	wc->qp = &my_qp->ib_qp;
+ 
+-	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT)) {
+-		struct ehca_qmap_entry *qmap_entry;
++	if (is_error) {
+ 		/*
+-		 * We got a send completion and need to restore the original
+-		 * wr_id.
++		 * set left_to_poll to 0 because in error state, we will not
++		 * get any additional CQEs
+ 		 */
+-		qmap_entry = &my_qp->sq_map[cqe->work_request_id &
+-					    QMAP_IDX_MASK];
++		ehca_add_to_err_list(my_qp, 1);
++		my_qp->sq_map.left_to_poll = 0;
+ 
+-		if (qmap_entry->reported) {
+-			ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+-				  my_qp->real_qp_num);
+-			/* found a double cqe, discard it and read next one */
+-			goto repoll;
+-		}
+-		wc->wr_id = cqe->work_request_id & ~QMAP_IDX_MASK;
+-		wc->wr_id |= qmap_entry->app_wr_id;
+-		qmap_entry->reported = 1;
+-	} else
++		if (HAS_RQ(my_qp))
++			ehca_add_to_err_list(my_qp, 0);
++		my_qp->rq_map.left_to_poll = 0;
++	}
++
++	qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
++	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
++		/* We got a send completion. */
++		qmap = &my_qp->sq_map;
++	else
+ 		/* We got a receive completion. */
+-		wc->wr_id = cqe->work_request_id;
++		qmap = &my_qp->rq_map;
++
++	qmap_entry = &qmap->map[qmap_tail_idx];
++	if (qmap_entry->reported) {
++		ehca_warn(cq->device, "Double cqe on qp_num=%#x",
++				my_qp->real_qp_num);
++		/* found a double cqe, discard it and read next one */
++		goto repoll;
++	}
++
++	wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
++	qmap_entry->reported = 1;
++
++	/* this is a proper completion, we need to advance the tail pointer */
++	if (++qmap->tail == qmap->entries)
++		qmap->tail = 0;
++
++	/* if left_to_poll is decremented to 0, add the QP to the error list */
++	if (qmap->left_to_poll > 0) {
++		qmap->left_to_poll--;
++		if ((my_qp->sq_map.left_to_poll == 0) &&
++				(my_qp->rq_map.left_to_poll == 0)) {
++			ehca_add_to_err_list(my_qp, 1);
++			if (HAS_RQ(my_qp))
++				ehca_add_to_err_list(my_qp, 0);
++		}
++	}
+ 
+ 	/* eval ib_wc_opcode */
+ 	wc->opcode = ib_wc_opcode[cqe->optype]-1;
+@@ -733,13 +790,88 @@ poll_cq_one_exit0:
+ 	return ret;
+ }
+ 
++static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
++			       struct ib_wc *wc, int num_entries,
++			       struct ipz_queue *ipz_queue, int on_sq)
++{
++	int nr = 0;
++	struct ehca_wqe *wqe;
++	u64 offset;
++	struct ehca_queue_map *qmap;
++	struct ehca_qmap_entry *qmap_entry;
++
++	if (on_sq)
++		qmap = &my_qp->sq_map;
++	else
++		qmap = &my_qp->rq_map;
++
++	qmap_entry = &qmap->map[qmap->tail];
++
++	while ((nr < num_entries) && (qmap_entry->reported == 0)) {
++		/* generate flush CQE */
++		memset(wc, 0, sizeof(*wc));
++
++		offset = qmap->tail * ipz_queue->qe_size;
++		wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
++		if (!wqe) {
++			ehca_err(cq->device, "Invalid wqe offset=%#lx on "
++				 "qp_num=%#x", offset, my_qp->real_qp_num);
++			return nr;
++		}
++
++		wc->wr_id = replace_wr_id(wqe->work_request_id,
++					  qmap_entry->app_wr_id);
++
++		if (on_sq) {
++			switch (wqe->optype) {
++			case WQE_OPTYPE_SEND:
++				wc->opcode = IB_WC_SEND;
++				break;
++			case WQE_OPTYPE_RDMAWRITE:
++				wc->opcode = IB_WC_RDMA_WRITE;
++				break;
++			case WQE_OPTYPE_RDMAREAD:
++				wc->opcode = IB_WC_RDMA_READ;
++				break;
++			default:
++				ehca_err(cq->device, "Invalid optype=%x",
++						wqe->optype);
++				return nr;
++			}
++		} else
++			wc->opcode = IB_WC_RECV;
++
++		if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
++			wc->ex.imm_data = wqe->immediate_data;
++			wc->wc_flags |= IB_WC_WITH_IMM;
++		}
++
++		wc->status = IB_WC_WR_FLUSH_ERR;
++
++		wc->qp = &my_qp->ib_qp;
++
++		/* mark as reported and advance tail pointer */
++		qmap_entry->reported = 1;
++		if (++qmap->tail == qmap->entries)
++			qmap->tail = 0;
++		qmap_entry = &qmap->map[qmap->tail];
++
++		wc++; nr++;
++	}
++
++	return nr;
++
++}
++
+ int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+ {
+ 	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ 	int nr;
++	struct ehca_qp *err_qp;
+ 	struct ib_wc *current_wc = wc;
+ 	int ret = 0;
+ 	unsigned long flags;
++	int entries_left = num_entries;
+ 
+ 	if (num_entries < 1) {
+ 		ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+@@ -749,15 +881,40 @@ int ehca_poll_cq(struct ib_cq *cq, int n
+ 	}
+ 
+ 	spin_lock_irqsave(&my_cq->spinlock, flags);
+-	for (nr = 0; nr < num_entries; nr++) {
++
++	/* generate flush cqes for send queues */
++	list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
++		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
++				&err_qp->ipz_squeue, 1);
++		entries_left -= nr;
++		current_wc += nr;
++
++		if (entries_left == 0)
++			break;
++	}
++
++	/* generate flush cqes for receive queues */
++	list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
++		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
++				&err_qp->ipz_rqueue, 0);
++		entries_left -= nr;
++		current_wc += nr;
++
++		if (entries_left == 0)
++			break;
++	}
++
++	for (nr = 0; nr < entries_left; nr++) {
+ 		ret = ehca_poll_cq_one(cq, current_wc);
+ 		if (ret)
+ 			break;
+ 		current_wc++;
+ 	} /* eof for nr */
++	entries_left -= nr;
++
+ 	spin_unlock_irqrestore(&my_cq->spinlock, flags);
+ 	if (ret == -EAGAIN  || !ret)
+-		ret = nr;
++		ret = num_entries - entries_left;
+ 
+ poll_cq_exit0:
+ 	return ret;
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_cq.c
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_cq.c
+@@ -276,6 +276,9 @@ struct ib_cq *ehca_create_cq(struct ib_d
+ 	for (i = 0; i < QP_HASHTAB_LEN; i++)
+ 		INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+ 
++	INIT_LIST_HEAD(&my_cq->sqp_err_list);
++	INIT_LIST_HEAD(&my_cq->rqp_err_list);
++
+ 	if (context) {
+ 		struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+ 		struct ehca_create_cq_resp resp;
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_qp.c
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_qp.c
+@@ -396,6 +396,50 @@ static void ehca_determine_small_queue(s
+ 	queue->is_small = (queue->page_size != 0);
+ }
+ 
++/* needs to be called with cq->spinlock held */
++void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
++{
++	struct list_head *list, *node;
++
++	/* TODO: support low latency QPs */
++	if (qp->ext_type == EQPT_LLQP)
++		return;
++
++	if (on_sq) {
++		list = &qp->send_cq->sqp_err_list;
++		node = &qp->sq_err_node;
++	} else {
++		list = &qp->recv_cq->rqp_err_list;
++		node = &qp->rq_err_node;
++	}
++
++	if (list_empty(node))
++		list_add_tail(node, list);
++
++	return;
++}
++
++static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&cq->spinlock, flags);
++
++	if (!list_empty(node))
++		list_del_init(node);
++
++	spin_unlock_irqrestore(&cq->spinlock, flags);
++}
++
++static void reset_queue_map(struct ehca_queue_map *qmap)
++{
++	int i;
++
++	qmap->tail = 0;
++	for (i = 0; i < qmap->entries; i++)
++		qmap->map[i].reported = 1;
++}
++
+ /*
+  * Create an ib_qp struct that is either a QP or an SRQ, depending on
+  * the value of the is_srq parameter. If init_attr and srq_init_attr share
+@@ -407,12 +451,11 @@ static struct ehca_qp *internal_create_q
+ 	struct ib_srq_init_attr *srq_init_attr,
+ 	struct ib_udata *udata, int is_srq)
+ {
+-	struct ehca_qp *my_qp;
++	struct ehca_qp *my_qp, *my_srq = NULL;
+ 	struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+ 	struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ 					      ib_device);
+ 	struct ib_ucontext *context = NULL;
+-	u32 nr_qes;
+ 	u64 h_ret;
+ 	int is_llqp = 0, has_srq = 0;
+ 	int qp_type, max_send_sge, max_recv_sge, ret;
+@@ -457,8 +500,7 @@ static struct ehca_qp *internal_create_q
+ 
+ 	/* handle SRQ base QPs */
+ 	if (init_attr->srq) {
+-		struct ehca_qp *my_srq =
+-			container_of(init_attr->srq, struct ehca_qp, ib_srq);
++		my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+ 
+ 		has_srq = 1;
+ 		parms.ext_type = EQPT_SRQBASE;
+@@ -716,15 +758,19 @@ static struct ehca_qp *internal_create_q
+ 				 "and pages ret=%i", ret);
+ 			goto create_qp_exit2;
+ 		}
+-		nr_qes = my_qp->ipz_squeue.queue_length /
++
++		my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+ 			 my_qp->ipz_squeue.qe_size;
+-		my_qp->sq_map = vmalloc(nr_qes *
++		my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+ 					sizeof(struct ehca_qmap_entry));
+-		if (!my_qp->sq_map) {
++		if (!my_qp->sq_map.map) {
+ 			ehca_err(pd->device, "Couldn't allocate squeue "
+ 				 "map ret=%i", ret);
+ 			goto create_qp_exit3;
+ 		}
++		INIT_LIST_HEAD(&my_qp->sq_err_node);
++		/* to avoid the generation of bogus flush CQEs */
++		reset_queue_map(&my_qp->sq_map);
+ 	}
+ 
+ 	if (HAS_RQ(my_qp)) {
+@@ -736,6 +782,25 @@ static struct ehca_qp *internal_create_q
+ 				 "and pages ret=%i", ret);
+ 			goto create_qp_exit4;
+ 		}
++
++		my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
++			my_qp->ipz_rqueue.qe_size;
++		my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
++				sizeof(struct ehca_qmap_entry));
++		if (!my_qp->rq_map.map) {
++			ehca_err(pd->device, "Couldn't allocate rqueue "
++					"map ret=%i", ret);
++			goto create_qp_exit5;
++		}
++		INIT_LIST_HEAD(&my_qp->rq_err_node);
++		/* to avoid the generation of bogus flush CQEs */
++		reset_queue_map(&my_qp->rq_map);
++	} else if (init_attr->srq) {
++		/* this is a base QP, use the queue map of the SRQ */
++		my_qp->rq_map = my_srq->rq_map;
++		INIT_LIST_HEAD(&my_qp->rq_err_node);
++
++		my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+ 	}
+ 
+ 	if (is_srq) {
+@@ -799,7 +864,7 @@ static struct ehca_qp *internal_create_q
+ 		if (ret) {
+ 			ehca_err(pd->device,
+ 				 "Couldn't assign qp to send_cq ret=%i", ret);
+-			goto create_qp_exit6;
++			goto create_qp_exit7;
+ 		}
+ 	}
+ 
+@@ -825,25 +890,29 @@ static struct ehca_qp *internal_create_q
+ 		if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+ 			ehca_err(pd->device, "Copy to udata failed");
+ 			ret = -EINVAL;
+-			goto create_qp_exit7;
++			goto create_qp_exit8;
+ 		}
+ 	}
+ 
+ 	return my_qp;
+ 
+-create_qp_exit7:
++create_qp_exit8:
+ 	ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+ 
+-create_qp_exit6:
++create_qp_exit7:
+ 	kfree(my_qp->mod_qp_parm);
+ 
++create_qp_exit6:
++	if (HAS_RQ(my_qp))
++		vfree(my_qp->rq_map.map);
++
+ create_qp_exit5:
+ 	if (HAS_RQ(my_qp))
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+ 
+ create_qp_exit4:
+ 	if (HAS_SQ(my_qp))
+-		vfree(my_qp->sq_map);
++		vfree(my_qp->sq_map.map);
+ 
+ create_qp_exit3:
+ 	if (HAS_SQ(my_qp))
+@@ -1035,6 +1104,101 @@ static int prepare_sqe_rts(struct ehca_q
+ 	return 0;
+ }
+ 
++static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
++			  struct ehca_queue_map *qmap)
++{
++	void *wqe_v;
++	u64 q_ofs;
++	u32 wqe_idx;
++
++	/* convert real to abs address */
++	wqe_p = wqe_p & (~(1UL << 63));
++
++	wqe_v = abs_to_virt(wqe_p);
++
++	if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
++		ehca_gen_err("Invalid offset for calculating left cqes "
++				"wqe_p=%#lx wqe_v=%p\n", wqe_p, wqe_v);
++		return -EFAULT;
++	}
++
++	wqe_idx = q_ofs / ipz_queue->qe_size;
++	if (wqe_idx < qmap->tail)
++		qmap->left_to_poll = (qmap->entries - qmap->tail) + wqe_idx;
++	else
++		qmap->left_to_poll = wqe_idx - qmap->tail;
++
++	return 0;
++}
++
++static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
++{
++	u64 h_ret;
++	void *send_wqe_p, *recv_wqe_p;
++	int ret;
++	unsigned long flags;
++	int qp_num = my_qp->ib_qp.qp_num;
++
++	/* this hcall is not supported on base QPs */
++	if (my_qp->ext_type != EQPT_SRQBASE) {
++		/* get send and receive wqe pointer */
++		h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
++				my_qp->ipz_qp_handle, &my_qp->pf,
++				&send_wqe_p, &recv_wqe_p, 4);
++		if (h_ret != H_SUCCESS) {
++			ehca_err(&shca->ib_device, "disable_and_get_wqe() "
++				 "failed ehca_qp=%p qp_num=%x h_ret=%li",
++				 my_qp, qp_num, h_ret);
++			return ehca2ib_return_code(h_ret);
++		}
++
++		/*
++		 * acquire lock to ensure that nobody is polling the cq which
++		 * could mean that the qmap->tail pointer is in an
++		 * inconsistent state.
++		 */
++		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
++		ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
++				&my_qp->sq_map);
++		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
++		if (ret)
++			return ret;
++
++
++		spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
++		ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
++				&my_qp->rq_map);
++		spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
++		if (ret)
++			return ret;
++	} else {
++		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
++		my_qp->sq_map.left_to_poll = 0;
++		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
++
++		spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
++		my_qp->rq_map.left_to_poll = 0;
++		spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
++	}
++
++	/* this assures flush cqes being generated only for pending wqes */
++	if ((my_qp->sq_map.left_to_poll == 0) &&
++				(my_qp->rq_map.left_to_poll == 0)) {
++		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
++		ehca_add_to_err_list(my_qp, 1);
++		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
++
++		if (HAS_RQ(my_qp)) {
++			spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
++			ehca_add_to_err_list(my_qp, 0);
++			spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
++					flags);
++		}
++	}
++
++	return 0;
++}
++
+ /*
+  * internal_modify_qp with circumvention to handle aqp0 properly
+  * smi_reset2init indicates if this is an internal reset-to-init-call for
+@@ -1539,10 +1703,27 @@ static int internal_modify_qp(struct ib_
+ 			goto modify_qp_exit2;
+ 		}
+ 	}
++	if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)) {
++		ret = check_for_left_cqes(my_qp, shca);
++		if (ret)
++			goto modify_qp_exit2;
++	}
+ 
+ 	if (statetrans == IB_QPST_ANY2RESET) {
+ 		ipz_qeit_reset(&my_qp->ipz_rqueue);
+ 		ipz_qeit_reset(&my_qp->ipz_squeue);
++
++		if (qp_cur_state == IB_QPS_ERR) {
++			del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
++
++			if (HAS_RQ(my_qp))
++				del_from_err_list(my_qp->recv_cq,
++						  &my_qp->rq_err_node);
++		}
++		reset_queue_map(&my_qp->sq_map);
++
++		if (HAS_RQ(my_qp))
++			reset_queue_map(&my_qp->rq_map);
+ 	}
+ 
+ 	if (attr_mask & IB_QP_QKEY)
+@@ -1958,6 +2139,16 @@ static int internal_destroy_qp(struct ib
+ 	idr_remove(&ehca_qp_idr, my_qp->token);
+ 	write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+ 
++	/*
++	 * SRQs will never get into an error list and do not have a recv_cq,
++	 * so we need to skip them here.
++	 */
++	if (HAS_RQ(my_qp) && !IS_SRQ(my_qp))
++		del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
++
++	if (HAS_SQ(my_qp))
++		del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
++
+ 	/* now wait until all pending events have completed */
+ 	wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+ 
+@@ -1983,7 +2174,7 @@ static int internal_destroy_qp(struct ib
+ 	if (qp_type == IB_QPT_GSI) {
+ 		struct ib_event event;
+ 		ehca_info(dev, "device %s: port %x is inactive.",
+-			  shca->ib_device.name, port_num);
++				shca->ib_device.name, port_num);
+ 		event.device = &shca->ib_device;
+ 		event.event = IB_EVENT_PORT_ERR;
+ 		event.element.port_num = port_num;
+@@ -1991,11 +2182,15 @@ static int internal_destroy_qp(struct ib
+ 		ib_dispatch_event(&event);
+ 	}
+ 
+-	if (HAS_RQ(my_qp))
++	if (HAS_RQ(my_qp)) {
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
++
++		vfree(my_qp->rq_map.map);
++	}
+ 	if (HAS_SQ(my_qp)) {
+ 		ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+-		vfree(my_qp->sq_map);
++
++		vfree(my_qp->sq_map.map);
+ 	}
+ 	kmem_cache_free(qp_cache, my_qp);
+ 	atomic_dec(&shca->num_qps);
+--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_iverbs.h
++++ infiniband.git/drivers/infiniband/hw/ehca/ehca_iverbs.h
+@@ -197,6 +197,8 @@ void ehca_poll_eqs(unsigned long data);
+ int ehca_calc_ipd(struct ehca_shca *shca, int port,
+ 		  enum ib_rate path_rate, u32 *ipd);
+ 
++void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
++
+ #ifdef CONFIG_PPC_64K_PAGES
+ void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+ void ehca_free_fw_ctrlblock(void *ptr);
+--
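
For reference, the behaviour added by this patch is what a kernel verbs
consumer relies on when tearing down a connection: after the QP is moved to
the error state, ib_poll_cq() is expected to report every outstanding work
request with IB_WC_WR_FLUSH_ERR. The sketch below only illustrates that
expectation and is not part of the patch; the function name and the
fixed-size WC array are arbitrary, and error handling is omitted.

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

/*
 * Illustration only, not part of the patch: drain a CQ after forcing the
 * associated QP into the error state. With flush CQE generation, every WR
 * that was still outstanding is reported with IB_WC_WR_FLUSH_ERR.
 */
static void drain_after_error(struct ib_qp *qp, struct ib_cq *cq)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
	struct ib_wc wc[16];
	int n, i;

	/* force the QP into the error state */
	if (ib_modify_qp(qp, &attr, IB_QP_STATE))
		return;

	/* pending WRs now show up as flush CQEs */
	while ((n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc)) > 0)
		for (i = 0; i < n; i++)
			if (wc[i].status == IB_WC_WR_FLUSH_ERR)
				pr_info("flushed wr_id=0x%llx\n",
					(unsigned long long)wc[i].wr_id);
}

On HCAs that flush in hardware this loop behaves the same way; with this
patch, ehca matches that behaviour by synthesizing the flush CQEs in
ehca_poll_cq() from the per-CQ error lists.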
-- 
1.5.5