[openib-general] [PATCH 3 of 3] mad: large RMPP support, Round 2

Jack Morgenstein jackm at mellanox.co.il
Sun Feb 12 07:30:36 PST 2006


Implement large RMPP support:
Send side: split a multipacket MAD buffer into a list of segments
(multipacket_list), and send these using a gather list of size 2.
Also, save a pointer to the last sent segment, and retrieve requested
segments by walking the list starting at that segment.
Finally, save a pointer to the last-acked segment. When retrying,
retrieve segments for resending relative to this pointer; when
updating the last ack, start the walk at this pointer.

The list scan to get the next segment is thus reduced from O(N^2) to O(N).
In the normal flow, the segment list is scanned only twice (once to
retrieve the next segment to send, once to update the last-ack pointer).

Signed-off-by: Jack Morgenstein <jackm at mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: src/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- src.orig/drivers/infiniband/core/mad_rmpp.c	2006-02-12 16:30:44.624175000 +0200
+++ src/drivers/infiniband/core/mad_rmpp.c	2006-02-12 16:30:53.114901000 +0200
@@ -535,6 +535,7 @@ start_rmpp(struct ib_mad_agent_private *
 static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	struct ib_rmpp_mad *rmpp_mad;
+	struct ib_mad_multipacket_seg *seg;
 	int timeout;
 	u32 paylen;
 
@@ -547,14 +548,16 @@ static int send_next_seg(struct ib_mad_s
 		paylen = mad_send_wr->total_seg * IB_MGMT_RMPP_DATA -
 			 mad_send_wr->pad;
 		rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
-		mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
 	} else {
-		mad_send_wr->send_wr.num_sge = 2;
-		mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
-		mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
-						 mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+		seg = ib_rmpp_get_multipacket_seg(mad_send_wr,
+						  mad_send_wr->seg_num);
+		if (!seg) {
+			printk(KERN_ERR PFX "send_next_seg: "
+			       "could not find segment %d\n",
+			       mad_send_wr->seg_num);
+			return -EINVAL;
+		}
+		mad_send_wr->mad_payload = seg->data;
 		rmpp_mad->rmpp_hdr.paylen_newwin = 0;
 	}
 
@@ -600,6 +603,28 @@ out:
 	spin_unlock_irqrestore(&agent->lock, flags);
 }
 
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr)
+{
+	struct ib_mad_multipacket_seg *seg;
+
+	if (wr->last_ack < 2)
+		return;
+	else if (!wr->last_ack_seg)
+		list_for_each_entry(seg, &wr->multipacket_list, list) {
+			if (wr->last_ack == seg->num) {
+				wr->last_ack_seg = seg;
+				break;
+			}
+		}
+	else
+		list_for_each_entry(seg, &wr->last_ack_seg->list, list) {
+			if (wr->last_ack == seg->num) {
+				wr->last_ack_seg = seg;
+				break;
+			}
+		}
+}
+
 static void process_rmpp_ack(struct ib_mad_agent_private *agent,
 			     struct ib_mad_recv_wc *mad_recv_wc)
 {
@@ -647,6 +672,7 @@ static void process_rmpp_ack(struct ib_m
 
 	if (seg_num > mad_send_wr->last_ack) {
 		mad_send_wr->last_ack = seg_num;
+		adjust_last_ack(mad_send_wr);
 		mad_send_wr->retries = mad_send_wr->send_buf.retries;
 	}
 	mad_send_wr->newwin = newwin;
@@ -793,7 +819,7 @@ out:
 int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	struct ib_rmpp_mad *rmpp_mad;
-	int i, total_len, ret;
+	int ret;
 
 	rmpp_mad = mad_send_wr->send_buf.mad;
 	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
@@ -803,20 +829,16 @@ int ib_send_rmpp_mad(struct ib_mad_send_
 	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
 		return IB_RMPP_RESULT_INTERNAL;
 
-	if (mad_send_wr->send_wr.num_sge > 1)
-		return -EINVAL;		/* TODO: support num_sge > 1 */
+	if (mad_send_wr->send_wr.num_sge != 2)
+		return -EINVAL;
 
 	mad_send_wr->seg_num = 1;
 	mad_send_wr->newwin = 1;
 	mad_send_wr->data_offset = data_offset(rmpp_mad->mad_hdr.mgmt_class);
 
-	total_len = 0;
-	for (i = 0; i < mad_send_wr->send_wr.num_sge; i++)
-		total_len += mad_send_wr->send_wr.sg_list[i].length;
-
-        mad_send_wr->total_seg = (total_len - mad_send_wr->data_offset) /
+	mad_send_wr->total_seg = (mad_send_wr->total_length - mad_send_wr->data_offset) /
 			(sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
-	mad_send_wr->pad = total_len - IB_MGMT_RMPP_HDR -
+	mad_send_wr->pad = mad_send_wr->total_length - IB_MGMT_RMPP_HDR -
 			   be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
 
 	/* We need to wait for the final ACK even if there isn't a response */
@@ -880,6 +902,8 @@ int ib_retry_rmpp(struct ib_mad_send_wr_
 		return IB_RMPP_RESULT_PROCESSED;
 
 	mad_send_wr->seg_num = mad_send_wr->last_ack + 1;
+	mad_send_wr->seg_num_seg = mad_send_wr->last_ack_seg;
+
 	ret = send_next_seg(mad_send_wr);
 	if (ret)
 		return IB_RMPP_RESULT_PROCESSED;
Index: src/drivers/infiniband/core/mad.c
===================================================================
--- src.orig/drivers/infiniband/core/mad.c	2006-02-12 16:30:29.940545000 +0200
+++ src/drivers/infiniband/core/mad.c	2006-02-12 16:30:53.131904000 +0200
@@ -779,6 +779,54 @@ static int get_buf_length(int hdr_len, i
 	return hdr_len + data_len + pad;
 }
 
+static void free_send_multipacket_list(struct ib_mad_send_wr_private *
+				       mad_send_wr)
+{
+	struct ib_mad_multipacket_seg *s, *t;
+
+	list_for_each_entry_safe(s, t, &mad_send_wr->multipacket_list, list) {
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+
+static inline int alloc_send_rmpp_segs(struct ib_mad_send_wr_private *send_wr,
+				       int message_size, int hdr_len,
+				       int data_len, u8 rmpp_version,
+				       gfp_t gfp_mask)
+{
+	struct ib_mad_multipacket_seg *seg;
+	struct ib_rmpp_mad *rmpp_mad = send_wr->send_buf.mad;
+	int seg_size, i = 2;
+
+	rmpp_mad->rmpp_hdr.paylen_newwin =
+			cpu_to_be32(hdr_len - IB_MGMT_RMPP_HDR + data_len);
+	rmpp_mad->rmpp_hdr.rmpp_version = rmpp_version;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	send_wr->total_length = message_size;
+	/* allocate RMPP buffers */
+	message_size -= sizeof(struct ib_mad);
+	seg_size = sizeof(struct ib_mad) - hdr_len;
+	while (message_size > 0) {
+		seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) + seg_size,
+				     gfp_mask);
+		if (!seg) {
+			printk(KERN_ERR "ib_create_send_mad: RMPP mem "
+			       "alloc failed for len %zd, gfp %#x\n",
+			       sizeof(struct ib_mad_multipacket_seg) + seg_size,
+			       gfp_mask);
+			free_send_multipacket_list(send_wr);
+			return -ENOMEM;
+		}
+		seg->size = seg_size;
+		seg->num = i++;
+		list_add_tail(&seg->list, &send_wr->multipacket_list);
+		message_size -= seg_size;
+	}
+	return 0;
+}
+
 struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 					    u32 remote_qpn, u16 pkey_index,
 					    int rmpp_active,
@@ -787,53 +835,54 @@ struct ib_mad_send_buf * ib_create_send_
 {
 	struct ib_mad_agent_private *mad_agent_priv;
 	struct ib_mad_send_wr_private *mad_send_wr;
-	int length, buf_size;
+	int length, message_size, ret;
 	void *buf;
 
 	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
 				      agent);
-	buf_size = get_buf_length(hdr_len, data_len);
+	message_size = get_buf_length(hdr_len, data_len);
 
 	if ((!mad_agent->rmpp_version &&
-	     (rmpp_active || buf_size > sizeof(struct ib_mad))) ||
-	    (!rmpp_active && buf_size > sizeof(struct ib_mad)))
+	     (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+	    (!rmpp_active && message_size > sizeof(struct ib_mad)))
 		return ERR_PTR(-EINVAL);
 
-	length = sizeof *mad_send_wr + buf_size;
-	if (length >= PAGE_SIZE)
-		buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		buf = kmalloc(length, gfp_mask);
+	length = sizeof *mad_send_wr + message_size;
+	buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
 
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	memset(buf, 0, length);
-
-	mad_send_wr = buf + buf_size;
+	mad_send_wr = buf + sizeof(struct ib_mad);
+	INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
 	mad_send_wr->send_buf.mad = buf;
+	mad_send_wr->mad_payload = buf + hdr_len;
 
 	mad_send_wr->mad_agent_priv = mad_agent_priv;
-	mad_send_wr->sg_list[0].length = buf_size;
+	mad_send_wr->sg_list[0].length = hdr_len;
 	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
 
 	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
 	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
-	mad_send_wr->send_wr.num_sge = 1;
+	mad_send_wr->send_wr.num_sge = 2;
 	mad_send_wr->send_wr.opcode = IB_WR_SEND;
 	mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
 	mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
 	mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
 	mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+	mad_send_wr->last_ack_seg = NULL;
+	mad_send_wr->seg_num_seg = NULL;
 
 	if (rmpp_active) {
-		struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
-		rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(hdr_len -
-						   IB_MGMT_RMPP_HDR + data_len);
-		rmpp_mad->rmpp_hdr.rmpp_version = mad_agent->rmpp_version;
-		rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
-		ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr,
-				  IB_MGMT_RMPP_FLAG_ACTIVE);
+		ret = alloc_send_rmpp_segs(mad_send_wr, message_size, hdr_len,
+					   data_len, mad_agent->rmpp_version,
+					   gfp_mask);
+		if (ret) {
+			kfree(buf);
+			return ERR_PTR(ret);
+		}
 	}
 
 	mad_send_wr->send_buf.mad_agent = mad_agent;
@@ -842,23 +891,71 @@ struct ib_mad_send_buf * ib_create_send_
 }
 EXPORT_SYMBOL(ib_create_send_mad);
 
+struct ib_mad_multipacket_seg
+*ib_rmpp_get_multipacket_seg(struct ib_mad_send_wr_private *wr, int seg_num)
+{
+	struct ib_mad_multipacket_seg *seg;
+
+	if (seg_num == 2) {
+		wr->seg_num_seg =
+			container_of(wr->multipacket_list.next,
+				     struct ib_mad_multipacket_seg, list);
+		return wr->seg_num_seg;
+	}
+
+	/* get first list entry if was not already done */
+	if (!wr->seg_num_seg)
+		wr->seg_num_seg =
+			container_of(wr->multipacket_list.next,
+				     struct ib_mad_multipacket_seg, list);
+
+	if (wr->seg_num_seg->num == seg_num)
+		return wr->seg_num_seg;
+	else if (wr->seg_num_seg->num < seg_num) {
+		list_for_each_entry(seg, &wr->seg_num_seg->list, list) {
+			if (seg->num == seg_num) {
+				wr->seg_num_seg = seg;
+				return wr->seg_num_seg;
+			}
+		}
+		return NULL;
+	} else {
+		list_for_each_entry_reverse(seg, &wr->seg_num_seg->list, list) {
+			if (seg->num == seg_num) {
+				wr->seg_num_seg = seg;
+				return wr->seg_num_seg;
+			}
+		}
+		return NULL;
+	}
+	return NULL;
+}
+
+struct ib_mad_multipacket_seg
+*ib_mad_get_multipacket_seg(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+	struct ib_mad_send_wr_private *wr;
+
+	if (seg_num < 2)
+		return NULL;
+
+	wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf);
+	return ib_rmpp_get_multipacket_seg(wr, seg_num);
+}
+EXPORT_SYMBOL(ib_mad_get_multipacket_seg);
+
 void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
-	void *mad_send_wr;
-	int length;
+	struct ib_mad_send_wr_private *mad_send_wr;
 
 	mad_agent_priv = container_of(send_buf->mad_agent,
 				      struct ib_mad_agent_private, agent);
 	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
 				   send_buf);
 
-	length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
-	if (length >= PAGE_SIZE)
-		free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		kfree(send_buf->mad);
-
+	free_send_multipacket_list(mad_send_wr);
+	kfree(send_buf->mad);
 	if (atomic_dec_and_test(&mad_agent_priv->refcount))
 		wake_up(&mad_agent_priv->wait);
 }
@@ -881,10 +978,17 @@ int ib_send_mad(struct ib_mad_send_wr_pr
 
 	mad_agent = mad_send_wr->send_buf.mad_agent;
 	sge = mad_send_wr->sg_list;
-	sge->addr = dma_map_single(mad_agent->device->dma_device,
-				   mad_send_wr->send_buf.mad, sge->length,
-				   DMA_TO_DEVICE);
-	pci_unmap_addr_set(mad_send_wr, mapping, sge->addr);
+	sge[0].addr = dma_map_single(mad_agent->device->dma_device,
+				     mad_send_wr->send_buf.mad,
+				     sge[0].length,
+				     DMA_TO_DEVICE);
+	pci_unmap_addr_set(mad_send_wr, header_mapping, sge[0].addr);
+
+	sge[1].addr = dma_map_single(mad_agent->device->dma_device,
+				     mad_send_wr->mad_payload,
+				     sge[1].length,
+				     DMA_TO_DEVICE);
+	pci_unmap_addr_set(mad_send_wr, payload_mapping, sge[1].addr);
 
 	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
 	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
@@ -901,11 +1005,15 @@ int ib_send_mad(struct ib_mad_send_wr_pr
 		list_add_tail(&mad_send_wr->mad_list.list, list);
 	}
 	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
-	if (ret)
+	if (ret) {
 		dma_unmap_single(mad_agent->device->dma_device,
-				 pci_unmap_addr(mad_send_wr, mapping),
-				 sge->length, DMA_TO_DEVICE);
+				 pci_unmap_addr(mad_send_wr, header_mapping),
+				 sge[0].length, DMA_TO_DEVICE);
 
+		dma_unmap_single(mad_agent->device->dma_device,
+				 pci_unmap_addr(mad_send_wr, payload_mapping),
+				 sge[1].length, DMA_TO_DEVICE);
+	}
 	return ret;
 }
 
@@ -1876,8 +1984,11 @@ static void ib_mad_send_done_handler(str
 
 retry:
 	dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
-			 pci_unmap_addr(mad_send_wr, mapping),
+			 pci_unmap_addr(mad_send_wr, header_mapping),
 			 mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+	dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
+			 pci_unmap_addr(mad_send_wr, payload_mapping),
+			 mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
 	queued_send_wr = NULL;
 	spin_lock_irqsave(&send_queue->lock, flags);
 	list_del(&mad_list->list);
Index: src/drivers/infiniband/core/user_mad.c
===================================================================
--- src.orig/drivers/infiniband/core/user_mad.c	2006-02-12 16:30:44.636158000 +0200
+++ src/drivers/infiniband/core/user_mad.c	2006-02-12 16:30:53.142901000 +0200
@@ -255,10 +255,11 @@ static void send_handler(struct ib_mad_a
 	ib_free_send_mad(packet->msg);
 
 	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
-		timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+		timeout = kzalloc(sizeof *timeout + sizeof(struct ib_mad),
+				  GFP_KERNEL);
 		if (!timeout)
 			goto out;
-
+		INIT_LIST_HEAD(&timeout->seg_list);
 		timeout->length 	= IB_MGMT_MAD_HDR;
 		timeout->mad.hdr.id 	= packet->mad.hdr.id;
 		timeout->mad.hdr.status = ETIMEDOUT;
@@ -266,7 +267,7 @@ static void send_handler(struct ib_mad_a
 		       sizeof (struct ib_mad_hdr));
 
 		if (queue_packet(file, agent, timeout))
-			kfree(timeout);
+			free_packet(timeout);
 	}
 out:
 	kfree(packet);
@@ -409,6 +410,8 @@ static ssize_t ib_umad_write(struct file
 	__be64 *tid;
 	int ret, length, hdr_len, copy_offset;
 	int rmpp_active, has_rmpp_header;
+	int s, seg_num;
+	struct ib_mad_multipacket_seg *seg;
 
 	if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
 		return -EINVAL;
@@ -485,6 +488,11 @@ static ssize_t ib_umad_write(struct file
 		goto err_ah;
 	}
 
+	if (!rmpp_active && length > sizeof(struct ib_mad)) {
+		ret = -EINVAL;
+		goto err_ah;
+	}
+
 	packet->msg = ib_create_send_mad(agent,
 					 be32_to_cpu(packet->mad.hdr.qpn),
 					 0, rmpp_active,
@@ -502,14 +510,32 @@ static ssize_t ib_umad_write(struct file
 
 	/* Copy MAD headers (RMPP header in place) */
 	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
-	/* Now, copy rest of message from user into send buffer */
+	/* complete copying first 256 bytes of message into send buffer */
 	if (copy_from_user(packet->msg->mad + copy_offset,
 			   buf + sizeof (struct ib_user_mad) + copy_offset,
-			   length - copy_offset)) {
+			   min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
 		ret = -EFAULT;
 		goto err_msg;
 	}
 
+	/* if RMPP, copy rest of send message from user to multipacket list */
+	length -= sizeof(struct ib_mad);
+	if (length > 0) {
+		buf +=  sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+		for (seg_num = 2; length > 0; ++seg_num, buf += s, length -= s) {
+			seg = ib_mad_get_multipacket_seg(packet->msg, seg_num);
+			BUG_ON(!seg);
+			s = min_t(int, length, seg->size);
+			if (copy_from_user(seg->data, buf, s)) {
+				ret = -EFAULT;
+				goto err_msg;
+			}
+		}
+		/* Pad last segment with zeroes. */
+		if (seg->size - s)
+			memset(seg->data + s, 0, seg->size - s);
+	}
+
 	/*
 	 * If userspace is generating a request that will generate a
 	 * response, we need to make sure the high-order part of the


