[openib-general] [PATCH 3 of 3] mad: large RMPP support

Jack Morgenstein jackm at mellanox.co.il
Mon Feb 6 23:41:33 PST 2006


patch 3 of 3

---

Large RMPP support, send side: split a multipacket MAD buffer into a list of
segments (multipacket_list) and send these using a gather list of size 2.

Signed-off-by: Jack Morgenstein <jackm at mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

Index: last_stable/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- last_stable.orig/drivers/infiniband/core/mad_rmpp.c
+++ last_stable/drivers/infiniband/core/mad_rmpp.c
@@ -570,16 +532,23 @@ start_rmpp(struct ib_mad_agent_private *
 	return mad_recv_wc;
 }
 
-static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
+static inline void *get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
 {
-	return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset +
-	       (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) *
-	       (mad_send_wr->seg_num - 1);
+	struct ib_mad_multipacket_seg *seg;
+	int i = 2;
+
+	list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) {
+		if (i == mad_send_wr->seg_num)
+			return seg->data;
+		i++;
+	}
+	return NULL;
 }
 
-static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	struct ib_rmpp_mad *rmpp_mad;
+	void *next_data;
 	int timeout;
 	u32 paylen;
 
@@ -592,14 +561,14 @@ static int send_next_seg(struct ib_mad_s
 		paylen = mad_send_wr->total_seg * IB_MGMT_RMPP_DATA -
 			 mad_send_wr->pad;
 		rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
-		mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
 	} else {
-		mad_send_wr->send_wr.num_sge = 2;
-		mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
-		mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
-						 mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+		next_data = get_seg_addr(mad_send_wr);
+		if (!next_data) {
+			printk(KERN_ERR PFX "send_next_seg: "
+			       "could not find next segment\n");
+			return -EINVAL;
+		}
+		mad_send_wr->send_buf.mad_payload = next_data;
 		rmpp_mad->rmpp_hdr.paylen_newwin = 0;
 	}
 
@@ -838,7 +807,7 @@ out:
 int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	struct ib_rmpp_mad *rmpp_mad;
-	int i, total_len, ret;
+	int ret;
 
 	rmpp_mad = mad_send_wr->send_buf.mad;
 	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
@@ -848,20 +817,16 @@ int ib_send_rmpp_mad(struct ib_mad_send_
 	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
 		return IB_RMPP_RESULT_INTERNAL;
 
-	if (mad_send_wr->send_wr.num_sge > 1)
-		return -EINVAL;		/* TODO: support num_sge > 1 */
+	if (mad_send_wr->send_wr.num_sge != 2)
+		return -EINVAL;
 
 	mad_send_wr->seg_num = 1;
 	mad_send_wr->newwin = 1;
 	mad_send_wr->data_offset = data_offset(rmpp_mad->mad_hdr.mgmt_class);
 
-	total_len = 0;
-	for (i = 0; i < mad_send_wr->send_wr.num_sge; i++)
-		total_len += mad_send_wr->send_wr.sg_list[i].length;
-
-        mad_send_wr->total_seg = (total_len - mad_send_wr->data_offset) /
+	mad_send_wr->total_seg = (mad_send_wr->total_length - mad_send_wr->data_offset) /
 			(sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
-	mad_send_wr->pad = total_len - IB_MGMT_RMPP_HDR -
+	mad_send_wr->pad = mad_send_wr->total_length - IB_MGMT_RMPP_HDR -
 			   be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
 
 	/* We need to wait for the final ACK even if there isn't a response */
Index: last_stable/drivers/infiniband/core/mad.c
===================================================================
--- last_stable.orig/drivers/infiniband/core/mad.c
+++ last_stable/drivers/infiniband/core/mad.c
@@ -779,6 +779,17 @@ static int get_buf_length(int hdr_len, i
 	return hdr_len + data_len + pad;
 }
 
+static void free_send_multipacket_list(struct ib_mad_send_wr_private *
+				       mad_send_wr)
+{
+	struct ib_mad_multipacket_seg *s, *t;
+
+	list_for_each_entry_safe(s, t, &mad_send_wr->multipacket_list, list) {
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+
 struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 					    u32 remote_qpn, u16 pkey_index,
 					    int rmpp_active,
@@ -787,39 +798,38 @@ struct ib_mad_send_buf * ib_create_send_
 {
 	struct ib_mad_agent_private *mad_agent_priv;
 	struct ib_mad_send_wr_private *mad_send_wr;
-	int length, buf_size;
+	int length, message_size, seg_size;
 	void *buf;
 
 	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
 				      agent);
-	buf_size = get_buf_length(hdr_len, data_len);
+	message_size = get_buf_length(hdr_len, data_len);
 
 	if ((!mad_agent->rmpp_version &&
-	     (rmpp_active || buf_size > sizeof(struct ib_mad))) ||
-	    (!rmpp_active && buf_size > sizeof(struct ib_mad)))
+	     (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+	    (!rmpp_active && message_size > sizeof(struct ib_mad)))
 		return ERR_PTR(-EINVAL);
 
-	length = sizeof *mad_send_wr + buf_size;
-	if (length >= PAGE_SIZE)
-		buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		buf = kmalloc(length, gfp_mask);
+	length = sizeof *mad_send_wr + message_size;
+	buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
 
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	memset(buf, 0, length);
-
-	mad_send_wr = buf + buf_size;
+	mad_send_wr = buf + sizeof(struct ib_mad);
+	INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
 	mad_send_wr->send_buf.mad = buf;
+	mad_send_wr->send_buf.mad_payload = buf + hdr_len;
 
 	mad_send_wr->mad_agent_priv = mad_agent_priv;
-	mad_send_wr->sg_list[0].length = buf_size;
+	mad_send_wr->sg_list[0].length = hdr_len;
 	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
 
 	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
 	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
-	mad_send_wr->send_wr.num_sge = 1;
+	mad_send_wr->send_wr.num_sge = 2;
 	mad_send_wr->send_wr.opcode = IB_WR_SEND;
 	mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
 	mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
@@ -827,6 +837,7 @@ struct ib_mad_send_buf * ib_create_send_
 	mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
 
 	if (rmpp_active) {
+		struct ib_mad_multipacket_seg *seg;
 		struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
 		rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(hdr_len -
 						   IB_MGMT_RMPP_HDR + data_len);
@@ -834,6 +845,27 @@ struct ib_mad_send_buf * ib_create_send_
 		rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
 		ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr,
 				  IB_MGMT_RMPP_FLAG_ACTIVE);
+		mad_send_wr->total_length = message_size;
+		/* allocate RMPP buffers */
+		message_size -= sizeof(struct ib_mad);
+		seg_size = sizeof(struct ib_mad) - hdr_len;
+		while (message_size > 0) {
+			seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) +
+				      seg_size, gfp_mask);
+			if (!seg) {
+				printk(KERN_ERR "ib_create_send_mad: RMPP mem "
+				       "alloc failed for len %zd, gfp %#x\n",
+				       sizeof(struct ib_mad_multipacket_seg) +
+				       seg_size, gfp_mask);
+				free_send_multipacket_list(mad_send_wr);
+				kfree(buf);
+				return ERR_PTR(-ENOMEM);
+			}
+			seg->size = seg_size;
+			list_add_tail(&seg->list,
+				      &mad_send_wr->multipacket_list);
+			message_size -= seg_size;
+		}
 	}
 
 	mad_send_wr->send_buf.mad_agent = mad_agent;
@@ -842,23 +874,36 @@ struct ib_mad_send_buf * ib_create_send_
 }
 EXPORT_SYMBOL(ib_create_send_mad);
 
+struct ib_mad_multipacket_seg *ib_get_multipacket_seg(struct ib_mad_send_buf *
+						      send_buf, int seg_num)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_multipacket_seg *seg;
+	int i = 2;
+
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+	list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) {
+		if (i == seg_num)
+			return seg;
+		i++;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(ib_get_multipacket_seg);
+
 void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
-	void *mad_send_wr;
-	int length;
+	struct ib_mad_send_wr_private *mad_send_wr;
 
 	mad_agent_priv = container_of(send_buf->mad_agent,
 				      struct ib_mad_agent_private, agent);
 	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
 				   send_buf);
 
-	length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
-	if (length >= PAGE_SIZE)
-		free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		kfree(send_buf->mad);
-
+	free_send_multipacket_list(mad_send_wr);
+	kfree(send_buf->mad);
 	if (atomic_dec_and_test(&mad_agent_priv->refcount))
 		wake_up(&mad_agent_priv->wait);
 }
@@ -881,10 +926,17 @@ int ib_send_mad(struct ib_mad_send_wr_pr
 
 	mad_agent = mad_send_wr->send_buf.mad_agent;
 	sge = mad_send_wr->sg_list;
-	sge->addr = dma_map_single(mad_agent->device->dma_device,
-				   mad_send_wr->send_buf.mad, sge->length,
-				   DMA_TO_DEVICE);
-	pci_unmap_addr_set(mad_send_wr, mapping, sge->addr);
+	sge[0].addr = dma_map_single(mad_agent->device->dma_device,
+				     mad_send_wr->send_buf.mad,
+				     sge[0].length,
+				     DMA_TO_DEVICE);
+	pci_unmap_addr_set(mad_send_wr, header_mapping, sge[0].addr);
+
+	sge[1].addr = dma_map_single(mad_agent->device->dma_device,
+				     mad_send_wr->send_buf.mad_payload,
+				     sge[1].length,
+				     DMA_TO_DEVICE);
+	pci_unmap_addr_set(mad_send_wr, payload_mapping, sge[1].addr);
 
 	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
 	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
@@ -901,11 +953,15 @@ int ib_send_mad(struct ib_mad_send_wr_pr
 		list_add_tail(&mad_send_wr->mad_list.list, list);
 	}
 	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
-	if (ret)
+	if (ret) {
 		dma_unmap_single(mad_agent->device->dma_device,
-				 pci_unmap_addr(mad_send_wr, mapping),
-				 sge->length, DMA_TO_DEVICE);
+				 pci_unmap_addr(mad_send_wr, header_mapping),
+				 sge[0].length, DMA_TO_DEVICE);
 
+		dma_unmap_single(mad_agent->device->dma_device,
+				 pci_unmap_addr(mad_send_wr, payload_mapping),
+				 sge[1].length, DMA_TO_DEVICE);
+	}
 	return ret;
 }
 
@@ -1876,8 +1932,11 @@ static void ib_mad_send_done_handler(str
 
 retry:
 	dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
-			 pci_unmap_addr(mad_send_wr, mapping),
+			 pci_unmap_addr(mad_send_wr, header_mapping),
 			 mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+	dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
+			 pci_unmap_addr(mad_send_wr, payload_mapping),
+			 mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
 	queued_send_wr = NULL;
 	spin_lock_irqsave(&send_queue->lock, flags);
 	list_del(&mad_list->list);
Index: last_stable/drivers/infiniband/core/user_mad.c
===================================================================
--- last_stable.orig/drivers/infiniband/core/user_mad.c
+++ last_stable/drivers/infiniband/core/user_mad.c
@@ -187,7 +270,7 @@ static void send_handler(struct ib_mad_a
 	ib_free_send_mad(packet->msg);
 
 	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
-		timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+		timeout = alloc_packet();
 		if (!timeout)
 			goto out;
 
@@ -198,40 +281,12 @@ static void send_handler(struct ib_mad_a
 		       sizeof (struct ib_mad_hdr));
 
 		if (queue_packet(file, agent, timeout))
-			kfree(timeout);
+			free_packet(timeout);
 	}
 out:
 	kfree(packet);
 }
 
-static struct ib_umad_packet *alloc_packet(int buf_size)
-{
-	struct ib_umad_packet *packet;
-	int length = sizeof *packet + buf_size;
-
-	if (length >= PAGE_SIZE)
-		packet = (void *)__get_free_pages(GFP_KERNEL, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		packet = kmalloc(length, GFP_KERNEL);
-
-	if (!packet)
-		return NULL;
-
-	memset(packet, 0, length);
-	return packet;
-}
-
-static void free_packet(struct ib_umad_packet *packet)
-{
-	int length = packet->length + sizeof *packet;
-	if (length >= PAGE_SIZE)
-		free_pages((unsigned long) packet, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		kfree(packet);
-}
-
-
-
 static void recv_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_recv_wc *mad_recv_wc)
 {
@@ -339,6 +422,8 @@ static ssize_t ib_umad_write(struct file
 	__be64 *tid;
 	int ret, length, hdr_len, copy_offset;
 	int rmpp_active, has_rmpp_header;
+	int s, seg_num;
+	struct ib_mad_multipacket_seg *seg;
 
 	if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
 		return -EINVAL;
@@ -415,6 +500,11 @@ static ssize_t ib_umad_write(struct file
 		goto err_ah;
 	}
 
+	if (!rmpp_active && length > sizeof(struct ib_mad)) {
+		ret = -EINVAL;
+		goto err_ah;
+	}
+
 	packet->msg = ib_create_send_mad(agent,
 					 be32_to_cpu(packet->mad.hdr.qpn),
 					 0, rmpp_active,
@@ -432,14 +522,32 @@ static ssize_t ib_umad_write(struct file
 
 	/* Copy MAD headers (RMPP header in place) */
 	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
-	/* Now, copy rest of message from user into send buffer */
+	/* complete copying first 256 bytes of message into send buffer */
 	if (copy_from_user(packet->msg->mad + copy_offset,
 			   buf + sizeof (struct ib_user_mad) + copy_offset,
-			   length - copy_offset)) {
+			   min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
 		ret = -EFAULT;
 		goto err_msg;
 	}
 
+	/* if RMPP, copy rest of send message from user to multipacket list */
+	length -= sizeof(struct ib_mad);
+	if (length > 0) {
+		buf +=  sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+		for (seg_num = 2; length > 0; ++seg_num, buf += s, length -= s) {
+			seg = ib_get_multipacket_seg(packet->msg, seg_num);
+			BUG_ON(!seg);
+			s = min_t(int, length, seg->size);
+			if (copy_from_user(seg->data, buf, s)) {
+				ret = -EFAULT;
+				goto err_msg;
+			}
+		}
+		/* Pad last segment with zeroes. */
+		if (seg->size - s)
+			memset(seg->data + s, 0, seg->size - s);
+	}
+
 	/*
 	 * If userspace is generating a request that will generate a
 	 * response, we need to make sure the high-order part of the



More information about the general mailing list