[openib-general] [PATCH] handle QP0/1 send queue overrun

Sean Hefty mshefty at ichips.intel.com
Tue Nov 9 17:12:53 PST 2004


The following patch adds support for handling QP0/1 send queue overrun
(a rough sketch of the queuing approach follows the list below), along
with a couple of related fixes:

* It includes the patch provided by Roland that is needed in order to 
configure the fabric.
* The code no longer modifies the user's send_wr structures when sending 
a MAD.
* Work requests for sent MADs are copied in order to handle both queuing 
and error recovery (once that is added).
* The receive-side code was slightly restructured to use a single 
function to repost receives.  If a receive cannot be posted for some 
reason (e.g. lack of memory), the code will now try to refill the receive 
queue the next time an additional receive is posted.  (This will also 
make it possible for the code to be lazier about reposting receives, 
which would allow for better batching of completions.)
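
To make the send-side queuing easier to follow without reading the whole 
diff, here is a rough, self-contained user-space sketch of the idea.  The 
struct and function names below are simplified stand-ins rather than the 
real mad_priv.h definitions, and locking and the actual ib_post_send() 
call are left out:

/*
 * Illustrative sketch only: simplified stand-ins for the real structures,
 * no locking, and printf() in place of ib_post_send().
 */
#include <stdio.h>
#include <stdlib.h>

struct send_wr {
	int id;
	struct send_wr *next;
};

struct send_queue {
	int count;			/* WRs currently posted to the QP */
	int max_active;			/* send queue depth */
	struct send_wr *overflow_head;	/* WRs waiting for a free slot */
	struct send_wr *overflow_tail;
};

/* Send path: post if there is room, otherwise queue on the overflow list. */
static void queue_or_post(struct send_queue *sq, struct send_wr *wr)
{
	if (sq->count < sq->max_active) {
		sq->count++;
		printf("posting WR %d\n", wr->id);	/* ib_post_send() here */
		free(wr);
	} else {
		wr->next = NULL;
		if (sq->overflow_tail)
			sq->overflow_tail->next = wr;
		else
			sq->overflow_head = wr;
		sq->overflow_tail = wr;
		printf("WR %d queued on overflow list\n", wr->id);
	}
}

/* Completion path: one slot freed up, so drain one queued WR if any. */
static void complete_one(struct send_queue *sq)
{
	struct send_wr *wr = sq->overflow_head;

	sq->count--;
	if (!wr)
		return;
	sq->overflow_head = wr->next;
	if (!sq->overflow_head)
		sq->overflow_tail = NULL;
	sq->count++;
	printf("posting queued WR %d\n", wr->id);	/* ib_post_send() here */
	free(wr);
}

int main(void)
{
	struct send_queue sq = { .count = 0, .max_active = 2 };
	int i;

	for (i = 0; i < 4; i++) {
		struct send_wr *wr = calloc(1, sizeof *wr);

		if (!wr)
			return 1;
		wr->id = i;
		queue_or_post(&sq, wr);
	}
	for (i = 0; i < 4; i++)
		complete_one(&sq);
	return 0;
}

In the actual patch, the count and max_active fields live in struct 
ib_mad_queue and overflow_list lives in struct ib_mad_qp_info; the send 
path corresponds to ib_send_mad(), and the completion side that drains 
the overflow list corresponds to ib_mad_send_done_handler().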

Also, I switched my mailer, so I apologize in advance if I hose up my patch.

- Sean



Index: core/agent.c
===================================================================
--- core/agent.c	(revision 1186)
+++ core/agent.c	(working copy)
@@ -117,9 +117,9 @@
  	/* PCI mapping */
  	gather_list.addr = pci_map_single(mad_agent->device->dma_device,
  					  &mad->mad,
-					  sizeof(struct ib_mad),
+					  sizeof mad->mad,
  					  PCI_DMA_TODEVICE);
-	gather_list.length = sizeof(struct ib_mad);
+	gather_list.length = sizeof mad->mad;
  	gather_list.lkey = (*port_priv->mr).lkey;

  	send_wr.next = NULL;
@@ -182,8 +182,7 @@
  		spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
  		pci_unmap_single(mad_agent->device->dma_device,
  				 pci_unmap_addr(agent_send_wr, mapping),
-				 sizeof(struct ib_mad_private) -
-				 sizeof(struct ib_mad_private_header),
+				 sizeof mad->mad,
  				 PCI_DMA_TODEVICE);
  		ib_destroy_ah(agent_send_wr->ah);
  		kfree(agent_send_wr);
@@ -272,7 +271,7 @@
  	/* Unmap PCI */
  	pci_unmap_single(mad_agent->device->dma_device,
  			 pci_unmap_addr(agent_send_wr, mapping),
-			 sizeof(struct ib_mad),
+			 sizeof agent_send_wr->mad->mad,
  			 PCI_DMA_TODEVICE);

  	ib_destroy_ah(agent_send_wr->ah);
Index: core/mad.c
===================================================================
--- core/mad.c	(revision 1186)
+++ core/mad.c	(working copy)
@@ -83,9 +83,8 @@
  static int add_mad_reg_req(struct ib_mad_reg_req *mad_reg_req,
  			   struct ib_mad_agent_private *priv);
  static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
-static int ib_mad_post_receive_mad(struct ib_mad_qp_info *qp_info,
-				   struct ib_mad_private *mad);
-static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad);
  static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
  static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
  				    struct ib_mad_send_wc *mad_send_wc);
@@ -345,24 +344,11 @@
  }
  EXPORT_SYMBOL(ib_unregister_mad_agent);

-static void queue_mad(struct ib_mad_queue *mad_queue,
-		      struct ib_mad_list_head *mad_list)
-{
-	unsigned long flags;
-
-	mad_list->mad_queue = mad_queue;
-	spin_lock_irqsave(&mad_queue->lock, flags);
-	list_add_tail(&mad_list->list, &mad_queue->list);
-	mad_queue->count++;
-	spin_unlock_irqrestore(&mad_queue->lock, flags);
-}
-
  static void dequeue_mad(struct ib_mad_list_head *mad_list)
  {
  	struct ib_mad_queue *mad_queue;
  	unsigned long flags;

-	BUG_ON(!mad_list->mad_queue);
  	mad_queue = mad_list->mad_queue;
  	spin_lock_irqsave(&mad_queue->lock, flags);
  	list_del(&mad_list->list);
@@ -481,24 +467,35 @@
  }

  static int ib_send_mad(struct ib_mad_agent_private *mad_agent_priv,
-		       struct ib_mad_send_wr_private *mad_send_wr,
-		       struct ib_send_wr *send_wr,
-		       struct ib_send_wr **bad_send_wr)
+		       struct ib_mad_send_wr_private *mad_send_wr)
  {
  	struct ib_mad_qp_info *qp_info;
+	struct ib_send_wr *bad_send_wr;
+	unsigned long flags;
  	int ret;

  	/* Replace user's WR ID with our own to find WR upon completion */
  	qp_info = mad_agent_priv->qp_info;
-	mad_send_wr->wr_id = send_wr->wr_id;
-	send_wr->wr_id = (unsigned long)&mad_send_wr->mad_list;
-	queue_mad(&qp_info->send_queue, &mad_send_wr->mad_list);
+	mad_send_wr->wr_id = mad_send_wr->send_wr.wr_id;
+	mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+	mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;

-	ret = ib_post_send(mad_agent_priv->agent.qp, send_wr, bad_send_wr);
-	if (ret) {
-		printk(KERN_NOTICE PFX "ib_post_send failed ret = %d\n", ret);
-		dequeue_mad(&mad_send_wr->mad_list);
-		*bad_send_wr = send_wr;
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	if (qp_info->send_queue.count++ < qp_info->send_queue.max_active) {
+		list_add_tail(&mad_send_wr->mad_list.list,
+			      &qp_info->send_queue.list);
+		spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+		ret = ib_post_send(mad_agent_priv->agent.qp,
+				   &mad_send_wr->send_wr, &bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			dequeue_mad(&mad_send_wr->mad_list);
+		}
+	} else {
+		list_add_tail(&mad_send_wr->mad_list.list,
+			      &qp_info->overflow_list);
+		spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+		ret = 0;
  	}
  	return ret;
  }
@@ -511,9 +508,8 @@
  		     struct ib_send_wr *send_wr,
  		     struct ib_send_wr **bad_send_wr)
  {
-	int ret;
-	struct ib_send_wr	*cur_send_wr, *next_send_wr;
-	struct ib_mad_agent_private	*mad_agent_priv;
+	int ret = -EINVAL;
+	struct ib_mad_agent_private *mad_agent_priv;

  	/* Validate supplied parameters */
  	if (!bad_send_wr)
@@ -522,6 +518,9 @@
  	if (!mad_agent || !send_wr )
  		goto error2;

+	if (send_wr->num_sge > IB_MAD_SEND_REQ_MAX_SG)
+		goto error2;
+
  	if (!mad_agent->send_handler ||
  	    (send_wr->wr.ud.timeout_ms && !mad_agent->recv_handler))
  		goto error2;
@@ -531,30 +530,31 @@
  				      agent);

  	/* Walk list of send WRs and post each on send list */
-	cur_send_wr = send_wr;
-	while (cur_send_wr) {
+	while (send_wr) {
  		unsigned long			flags;
+		struct ib_send_wr		*next_send_wr;
  		struct ib_mad_send_wr_private	*mad_send_wr;
  		struct ib_smp			*smp;

-		if (!cur_send_wr->wr.ud.mad_hdr) {
-			*bad_send_wr = cur_send_wr;
+		/*
+		 * Save pointer to next work request to post in case the
+		 * current one completes, and the user modifies the work
+		 * request associated with the completion.
+		 */
+		if (!send_wr->wr.ud.mad_hdr) {
  			printk(KERN_ERR PFX "MAD header must be supplied "
-			       "in WR %p\n", cur_send_wr);
-			goto error1;
+			       "in WR %p\n", send_wr);
+			goto error2;
  		}
+		next_send_wr = (struct ib_send_wr *)send_wr->next;

-		next_send_wr = (struct ib_send_wr *)cur_send_wr->next;
-
-		smp = (struct ib_smp *)cur_send_wr->wr.ud.mad_hdr;
+		smp = (struct ib_smp *)send_wr->wr.ud.mad_hdr;
  		if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-			ret = handle_outgoing_smp(mad_agent, smp, cur_send_wr);
-			if (ret < 0) {	/* error */
-				*bad_send_wr = cur_send_wr;
-				goto error1;
-			} else if (ret == 1) {	/* locally consumed */
+			ret = handle_outgoing_smp(mad_agent, smp, send_wr);
+			if (ret < 0)		/* error */
+				goto error2;
+			else if (ret == 1)	/* locally consumed */
  				goto next;
-			}
  		}

  		/* Allocate MAD send WR tracking structure */
@@ -562,16 +562,21 @@
  				      (in_atomic() || irqs_disabled()) ?
  				      GFP_ATOMIC : GFP_KERNEL);
  		if (!mad_send_wr) {
-			*bad_send_wr = cur_send_wr;
  			printk(KERN_ERR PFX "No memory for "
  			       "ib_mad_send_wr_private\n");
-			return -ENOMEM;	
+			ret = -ENOMEM;
+			goto error2;
  		}

+		mad_send_wr->send_wr = *send_wr;
+		mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+		memcpy(mad_send_wr->sg_list, send_wr->sg_list,
+		       sizeof *send_wr->sg_list * send_wr->num_sge);
+		mad_send_wr->send_wr.next = NULL;
  		mad_send_wr->tid = send_wr->wr.ud.mad_hdr->tid;
  		mad_send_wr->agent = mad_agent;
  		/* Timeout will be updated after send completes */
-		mad_send_wr->timeout = msecs_to_jiffies(cur_send_wr->wr.
+		mad_send_wr->timeout = msecs_to_jiffies(send_wr->wr.
  							ud.timeout_ms);
  		/* One reference for each work request to QP + response */
  		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
@@ -584,31 +589,24 @@
  			      &mad_agent_priv->send_list);
  		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);

-		cur_send_wr->next = NULL;
-		ret = ib_send_mad(mad_agent_priv, mad_send_wr,
-				  cur_send_wr, bad_send_wr);
+		ret = ib_send_mad(mad_agent_priv, mad_send_wr);
  		if (ret) {
-			/* Handle QP overrun separately... -ENOMEM */
-			/* Handle posting when QP is in error state... */
-
  			/* Fail send request */
  			spin_lock_irqsave(&mad_agent_priv->lock, flags);
  			list_del(&mad_send_wr->agent_list);
  			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
  			atomic_dec(&mad_agent_priv->refcount);
-			return ret;		
+			goto error2;
  		}
  next:
-		cur_send_wr = next_send_wr;
+		send_wr = next_send_wr;
  	}
-
  	return 0;	

  error2:
  	*bad_send_wr = send_wr;
  error1:
-	return -EINVAL;
+	return ret;
  }
  EXPORT_SYMBOL(ib_post_send_mad);

@@ -1125,7 +1123,6 @@

  	/* Give driver "right of first refusal" on incoming MAD */
  	if (port_priv->device->process_mad) {
-		struct ib_grh *grh;
  		int ret;

  		if (!response) {
@@ -1144,20 +1141,8 @@
  						     &response->mad.mad);
  		if (ret & IB_MAD_RESULT_SUCCESS) {
  			if (ret & IB_MAD_RESULT_REPLY) {
-				if (response->mad.mad.mad_hdr.mgmt_class ==
-				    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-					if (!smi_handle_dr_smp_recv(
-					    (struct ib_smp *)&response->mad.mad,
-					    port_priv->device->node_type,
-					    port_priv->port_num,
-					    port_priv->device->phys_port_cnt)) {
-						goto out;
-					}
-				}
  				/* Send response */
-				grh = (void *)recv->header.recv_buf.mad -
-				      sizeof(struct ib_grh);
-				if (!agent_send(response, grh, wc,
+				if (!agent_send(response, &recv->grh, wc,
  						port_priv->device,
  						port_priv->port_num))
  					response = NULL;
@@ -1178,13 +1163,14 @@
  		 */
  		recv = NULL;
  	}
-
  out:
-	if (recv)
-		kmem_cache_free(ib_mad_cache, recv);
-
  	/* Post another receive request for this QP */
-	ib_mad_post_receive_mad(qp_info, response);
+	if (response) {
+		ib_mad_post_receive_mads(qp_info, response);
+		if (recv)
+			kmem_cache_free(ib_mad_cache, recv);
+	} else
+		ib_mad_post_receive_mads(qp_info, recv);
  }

  static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
@@ -1291,16 +1277,51 @@
  static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
  				     struct ib_wc *wc)
  {
-	struct ib_mad_send_wr_private	*mad_send_wr;
+	struct ib_mad_send_wr_private	*mad_send_wr, *queued_send_wr;
  	struct ib_mad_list_head		*mad_list;
+	struct ib_mad_qp_info		*qp_info;
+	struct ib_mad_queue		*send_queue;
+	struct ib_send_wr		*bad_send_wr;
+	unsigned long flags;
+	int ret;

  	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
  	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
  				   mad_list);
-	dequeue_mad(mad_list);
-	/* Restore client wr_id in WC */
+	send_queue = mad_list->mad_queue;
+	qp_info = send_queue->qp_info;
+
+retry:
+	queued_send_wr = NULL;
+	spin_lock_irqsave(&send_queue->lock, flags);
+	list_del(&mad_list->list);
+
+	/* Move queued send to the send queue. */
+	if (send_queue->count-- > send_queue->max_active) {
+		mad_list = container_of(qp_info->overflow_list.next,
+					struct ib_mad_list_head, list);
+		queued_send_wr = container_of(mad_list,
+					struct ib_mad_send_wr_private,
+					mad_list);
+		list_del(&mad_list->list);
+		list_add_tail(&mad_list->list, &send_queue->list);
+	}
+	spin_unlock_irqrestore(&send_queue->lock, flags);
+
+	/* Restore client wr_id in WC and complete send. */
  	wc->wr_id = mad_send_wr->wr_id;
  	ib_mad_complete_send_wr(mad_send_wr, (struct ib_mad_send_wc*)wc);
+
+	if (queued_send_wr) {
+		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+				&bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			mad_send_wr = queued_send_wr;
+			wc->status = IB_WC_LOC_QP_OP_ERR;
+			goto retry;
+		}
+	}
  }

  /*
@@ -1492,88 +1513,74 @@
  	queue_work(port_priv->wq, &port_priv->work);
  }

-static int ib_mad_post_receive_mad(struct ib_mad_qp_info *qp_info,
-				   struct ib_mad_private *mad)
+/*
+ * Allocate receive MADs and post receive WRs for them.
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
  {
+	unsigned long flags;
+	int post, ret;
  	struct ib_mad_private *mad_priv;
  	struct ib_sge sg_list;
-	struct ib_recv_wr recv_wr;
-	struct ib_recv_wr *bad_recv_wr;
-	int ret;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;

-	if (mad)
-		mad_priv = mad;
-	else {
-		/*
-		 * Allocate memory for receive buffer.
-		 * This is for both MAD and private header
-		 * which contains the receive tracking structure.
-		 * By prepending this header, there is one rather
-		 * than two memory allocations.
-		 */
-		mad_priv = kmem_cache_alloc(ib_mad_cache,
-					    (in_atomic() || irqs_disabled()) ?
-					    GFP_ATOMIC : GFP_KERNEL);
-		if (!mad_priv) {
-			printk(KERN_ERR PFX "No memory for receive buffer\n");
-			return -ENOMEM;
-		}
-	}
-
-	/* Setup scatter list */
-	sg_list.addr = pci_map_single(qp_info->port_priv->device->dma_device,
-				      &mad_priv->grh,
-				      sizeof *mad_priv -
-					sizeof mad_priv->header,
-				      PCI_DMA_FROMDEVICE);
+	/* Initialize common scatter list fields. */
  	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
  	sg_list.lkey = (*qp_info->port_priv->mr).lkey;

-	/* Setup receive WR */
+	/* Initialize common receive WR fields. */
  	recv_wr.next = NULL;
  	recv_wr.sg_list = &sg_list;
  	recv_wr.num_sge = 1;
  	recv_wr.recv_flags = IB_RECV_SIGNALED;
-	recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
-	pci_unmap_addr_set(&mad_priv->header, mapping, sg_list.addr);
-
-	/* Post receive WR. */
-	queue_mad(&qp_info->recv_queue, &mad_priv->header.mad_list);
-	ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
-	if (ret) {
-		dequeue_mad(&mad_priv->header.mad_list);
-		pci_unmap_single(qp_info->port_priv->device->dma_device,
-				 pci_unmap_addr(&mad_priv->header, mapping),
-				 sizeof *mad_priv - sizeof mad_priv->header,
-				 PCI_DMA_FROMDEVICE);
-
-		kmem_cache_free(ib_mad_cache, mad_priv);
-		printk(KERN_NOTICE PFX "ib_post_recv WRID 0x%Lx "
-		       "failed ret = %d\n",
-		       (unsigned long long) recv_wr.wr_id, ret);
-		return -EINVAL;
-	}
-
-	return 0;
-}

-/*
- * Allocate receive MADs and post receive WRs for them
- */
-static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info)
-{
-	int i, ret;
-
-	for (i = 0; i < IB_MAD_QP_RECV_SIZE; i++) {
-		ret = ib_mad_post_receive_mad(qp_info, NULL);
+	do {
+		/* Allocate and map receive buffer. */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				printk(KERN_ERR PFX "No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		sg_list.addr = pci_map_single(qp_info->port_priv->
+						device->dma_device,
+					&mad_priv->grh,
+					sizeof *mad_priv -
+						sizeof mad_priv->header,
+					PCI_DMA_FROMDEVICE);
+		pci_unmap_addr_set(&mad_priv->header, mapping, sg_list.addr);
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR. */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
  		if (ret) {
-			printk(KERN_ERR PFX "receive post %d failed "
-				"on %s port %d\n", i + 1,
-				qp_info->port_priv->device->name,
-				qp_info->port_priv->port_num);
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			pci_unmap_single(qp_info->port_priv->device->dma_device,
+					 pci_unmap_addr(&mad_priv->header,
+							mapping),
+					 sizeof *mad_priv -
+					   sizeof mad_priv->header,
+					 PCI_DMA_FROMDEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
  			break;
  		}
-	}
+	} while (post);
  	return ret;
  }

@@ -1625,6 +1632,7 @@
  	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
  	INIT_LIST_HEAD(&qp_info->send_queue.list);
  	qp_info->send_queue.count = 0;
+	INIT_LIST_HEAD(&qp_info->overflow_list);
  	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
  }

@@ -1789,7 +1797,7 @@
  	}

  	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
-		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i]);
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
  		if (ret) {
  			printk(KERN_ERR PFX "Couldn't post receive "
  			       "requests\n");
@@ -1851,6 +1859,7 @@
  	qp_info->port_priv = port_priv;
  	init_mad_queue(qp_info, &qp_info->send_queue);
  	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);

  	memset(&qp_init_attr, 0, sizeof qp_init_attr);
  	qp_init_attr.send_cq = port_priv->cq;
@@ -1870,6 +1879,9 @@
  		ret = PTR_ERR(qp_info->qp);
  		goto error;		
  	}
+	/* Use minimum queue sizes unless the CQ is resized. */
+	qp_info->send_queue.max_active = IB_MAD_QP_SEND_SIZE;
+	qp_info->recv_queue.max_active = IB_MAD_QP_RECV_SIZE;
  	return 0;

  error:
Index: core/mad_priv.h
===================================================================
--- core/mad_priv.h	(revision 1186)
+++ core/mad_priv.h	(working copy)
@@ -122,6 +122,8 @@
  	struct ib_mad_list_head mad_list;
  	struct list_head agent_list;
  	struct ib_mad_agent *agent;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
  	u64 wr_id;			/* client WR ID */
  	u64 tid;
  	unsigned long timeout;
@@ -141,6 +143,7 @@
  	spinlock_t lock;
  	struct list_head list;
  	int count;
+	int max_active;
  	struct ib_mad_qp_info *qp_info;
  };

@@ -149,7 +152,7 @@
  	struct ib_qp *qp;
  	struct ib_mad_queue send_queue;
  	struct ib_mad_queue recv_queue;
-	/* struct ib_mad_queue overflow_queue; */
+	struct list_head overflow_list;
  };

  struct ib_mad_port_private {

-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: diffs
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20041109/348c57b2/attachment.ksh>

