[openib-general] [PATCH] core: segmented rmpp sends

Michael S. Tsirkin mst at mellanox.co.il
Tue Dec 6 10:48:14 PST 2005


With the following in place we are able to perform very large RMPP
transfers. Please comment.
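
To illustrate (a sketch only, not part of the patch): with this in
place, an in-kernel caller building a large SA-class RMPP send would
create the send buffer as before and append the remaining payload as
segments. "agent", "qpn", "data" and "len" below are assumed context,
and error handling is abbreviated:

	static struct ib_mad_send_buf *build_large_send(struct ib_mad_agent *agent,
							u32 qpn, void *data, int len)
	{
		/* 256-byte MAD minus the 56-byte SA header: 200 bytes/segment */
		int payload = sizeof(struct ib_mad) - IB_MGMT_SA_HDR;
		struct ib_mad_send_buf *msg;
		struct ib_mad_multipacket_seg *seg;
		int copied, chunk;

		msg = ib_create_send_mad(agent, qpn, 0, 1 /* rmpp_active */,
					 IB_MGMT_SA_HDR, len, GFP_KERNEL);
		if (IS_ERR(msg))
			return msg;

		/* the first MAD's worth of payload lives in the send buffer */
		copied = min(len, payload);
		memcpy((u8 *) msg->mad + IB_MGMT_SA_HDR, data, copied);

		while (copied < len) {
			chunk = min(len - copied, payload);
			seg = kmalloc(sizeof *seg + payload, GFP_KERNEL);
			if (!seg) {
				ib_free_send_mad(msg); /* frees appended segments */
				return ERR_PTR(-ENOMEM);
			}
			memcpy(seg->data, (u8 *) data + copied, chunk);
			ib_append_to_multipacket_mad(msg, seg);
			copied += chunk;
		}
		return msg;
	}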

---

Modify the RMPP MAD support to accept a linked list of segments
instead of a large physically contiguous buffer.
The list is kept in the mad_send_wr private data and is constructed
with the new ib_append_to_multipacket_mad API call.
Modify user_mad.c to allocate large MADs for send/receive in chunks.

Signed-off-by: Jack Morgenstein <jackm at mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
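
To make the chunk sizing concrete (a worked example, not part of the
patch): a MAD is a fixed 256 bytes, so each segment carries
sizeof(struct ib_mad) minus the class header returned by data_offset()
below, e.g. 256 - 56 = 200 payload bytes for SA-class MADs:

	/* Worked example: segments needed for "paylen" bytes of RMPP payload.
	 * Header sizes match data_offset() in the user_mad.c hunk; vendor
	 * range 2 classes would use IB_MGMT_VENDOR_HDR instead.
	 */
	static int rmpp_segs_needed(u8 mgmt_class, int paylen)
	{
		int hdr = (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ?
			  IB_MGMT_SA_HDR : IB_MGMT_RMPP_HDR;
		int per_seg = sizeof(struct ib_mad) - hdr; /* 200 for SA */

		return (paylen + per_seg - 1) / per_seg;
	}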

Index: latest/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad_rmpp.c
+++ latest/drivers/infiniband/core/mad_rmpp.c
@@ -433,44 +433,6 @@ static struct ib_mad_recv_wc * complete_
 	return rmpp_wc;
 }
 
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf)
-{
-	struct ib_mad_recv_buf *seg_buf;
-	struct ib_rmpp_mad *rmpp_mad;
-	void *data;
-	int size, len, offset;
-	u8 flags;
-
-	len = mad_recv_wc->mad_len;
-	if (len <= sizeof(struct ib_mad)) {
-		memcpy(buf, mad_recv_wc->recv_buf.mad, len);
-		return;
-	}
-
-	offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
-
-	list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
-		rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
-		flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
-
-		if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
-			data = rmpp_mad;
-			size = sizeof(*rmpp_mad);
-		} else {
-			data = (void *) rmpp_mad + offset;
-			if (flags & IB_MGMT_RMPP_FLAG_LAST)
-				size = len;
-			else
-				size = sizeof(*rmpp_mad) - offset;
-		}
-
-		memcpy(buf, data, size);
-		len -= size;
-		buf += size;
-	}
-}
-EXPORT_SYMBOL(ib_coalesce_recv_mad);
-
 static struct ib_mad_recv_wc *
 continue_rmpp(struct ib_mad_agent_private *agent,
 	      struct ib_mad_recv_wc *mad_recv_wc)
@@ -570,16 +532,26 @@ start_rmpp(struct ib_mad_agent_private *
 	return mad_recv_wc;
 }
 
-static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
+static inline void * get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
 {
-	return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset +
-	       (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) *
-	       (mad_send_wr->seg_num - 1);
+	struct ib_mad_multipacket_seg *seg;
+	int i = 2;
+
+	if (list_empty(&mad_send_wr->multipacket_list))
+		return NULL;
+
+	list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) {
+		if (i == mad_send_wr->seg_num)
+			return seg->data;
+		i++;
+	}
+	return NULL;
 }
 
-static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	struct ib_rmpp_mad *rmpp_mad;
+	void *next_data;
 	int timeout;
 	u32 paylen;
 
@@ -594,12 +566,14 @@ static int send_next_seg(struct ib_mad_s
 		rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
 		mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
 	} else {
-		mad_send_wr->send_wr.num_sge = 2;
-		mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
-		mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
-						 mad_send_wr->data_offset;
-		mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+		next_data = get_seg_addr(mad_send_wr);
+		if (!next_data) {
+			printk(KERN_ERR PFX "send_next_seg: "
+			       "could not find next segment\n");
+			return -EINVAL;
+		}
+		memcpy((void *)rmpp_mad + mad_send_wr->data_offset, next_data,
+		       sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
 		rmpp_mad->rmpp_hdr.paylen_newwin = 0;
 	}
 
Index: latest/drivers/infiniband/include/rdma/ib_mad.h
===================================================================
--- latest.orig/drivers/infiniband/include/rdma/ib_mad.h
+++ latest/drivers/infiniband/include/rdma/ib_mad.h
@@ -141,6 +141,11 @@ struct ib_rmpp_hdr {
 	__be32	paylen_newwin;
 };
 
+struct ib_mad_multipacket_seg {
+	struct list_head list;
+	u8 data[0];
+};
+
 typedef u64 __bitwise ib_sa_comp_mask;
 
 #define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
@@ -485,17 +490,6 @@ int ib_unregister_mad_agent(struct ib_ma
 int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 		     struct ib_mad_send_buf **bad_send_buf);
 
-/**
- * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer.
- * @mad_recv_wc: Work completion information for a received MAD.
- * @buf: User-provided data buffer to receive the coalesced buffers.  The
- *   referenced buffer should be at least the size of the mad_len specified
- *   by @mad_recv_wc.
- *
- * This call copies a chain of received MAD segments into a single data buffer,
- * removing duplicated headers.
- */
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf);
 
 /**
  * ib_free_recv_mad - Returns data buffers used to receive a MAD.
@@ -601,6 +595,18 @@ struct ib_mad_send_buf * ib_create_send_
 					    gfp_t gfp_mask);
 
 /**
+ * ib_append_to_multipacket_mad - Append a segment to a multipacket RMPP
+ *   MAD send buffer.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg: segment to append to linked list (already filled with data).
+ *
+ * This routine appends a segment of a multipacket RMPP message
+ * (copied from user space) to a MAD for sending.
+ */
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf * send_buf,
+				  struct ib_mad_multipacket_seg *seg);
+
+/**
  * ib_free_send_mad - Returns data buffers used to send a MAD.
  * @send_buf: Previously allocated send data buffer.
  */
Index: latest/drivers/infiniband/core/mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad.c
+++ latest/drivers/infiniband/core/mad.c
@@ -792,17 +792,13 @@ struct ib_mad_send_buf * ib_create_send_
 		return ERR_PTR(-EINVAL);
 
 	length = sizeof *mad_send_wr + buf_size;
-	if (length >= PAGE_SIZE)
-		buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		buf = kmalloc(length, gfp_mask);
+	buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
 
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	memset(buf, 0, length);
-
-	mad_send_wr = buf + buf_size;
+	mad_send_wr = buf + sizeof(struct ib_mad);
+	INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
 	mad_send_wr->send_buf.mad = buf;
 
 	mad_send_wr->mad_agent_priv = mad_agent_priv;
@@ -834,23 +830,33 @@ struct ib_mad_send_buf * ib_create_send_
 }
 EXPORT_SYMBOL(ib_create_send_mad);
 
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf * send_buf,
+				struct ib_mad_multipacket_seg *seg)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+	list_add_tail(&seg->list, &mad_send_wr->multipacket_list);
+}
+EXPORT_SYMBOL(ib_append_to_multipacket_mad);
+
 void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
-	void *mad_send_wr;
-	int length;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_multipacket_seg *seg, *tmp;
 
 	mad_agent_priv = container_of(send_buf->mad_agent,
 				      struct ib_mad_agent_private, agent);
 	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
 				   send_buf);
 
-	length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
-	if (length >= PAGE_SIZE)
-		free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		kfree(send_buf->mad);
-
+	list_for_each_entry_safe(seg, tmp, &mad_send_wr->multipacket_list, list) {
+		list_del(&seg->list);
+		kfree(seg);
+	}
+	kfree(send_buf->mad);
 	if (atomic_dec_and_test(&mad_agent_priv->refcount))
 		wake_up(&mad_agent_priv->wait);
 }
Index: latest/drivers/infiniband/core/mad_priv.h
===================================================================
--- latest.orig/drivers/infiniband/core/mad_priv.h
+++ latest/drivers/infiniband/core/mad_priv.h
@@ -130,6 +130,7 @@ struct ib_mad_send_wr_private {
 	enum ib_wc_status status;
 
 	/* RMPP control */
+	struct list_head multipacket_list;
 	int last_ack;
 	int seg_num;
 	int newwin;
Index: latest/drivers/infiniband/core/user_mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/user_mad.c
+++ latest/drivers/infiniband/core/user_mad.c
@@ -123,6 +123,7 @@ struct ib_umad_packet {
 	struct ib_mad_send_buf *msg;
 	struct list_head   list;
 	int		   length;
+	struct list_head   seg_list;
 	struct ib_user_mad mad;
 };
 
@@ -176,6 +177,87 @@ static int queue_packet(struct ib_umad_f
 	return ret;
 }
 
+static int data_offset(u8 mgmt_class)
+{
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+		return IB_MGMT_SA_HDR;
+	else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+		 (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return IB_MGMT_VENDOR_HDR;
+	else
+		return IB_MGMT_RMPP_HDR;
+}
+
+static int copy_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+			  struct ib_umad_packet *packet)
+{
+	struct ib_mad_recv_buf *seg_buf;
+	struct ib_rmpp_mad *rmpp_mad;
+	void *data;
+	struct ib_mad_multipacket_seg *seg;
+	int size, len, offset;
+	u8 flags;
+
+	len = mad_recv_wc->mad_len;
+	if (len <= sizeof(struct ib_mad)) {
+		memcpy(&packet->mad.data, mad_recv_wc->recv_buf.mad, len);
+		return 0;
+	}
+
+	offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+
+	list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
+		rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
+		flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
+
+		if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
+			size = sizeof(*rmpp_mad);
+			memcpy(&packet->mad.data, rmpp_mad, size);
+		} else {
+			data = (void *) rmpp_mad + offset;
+			if (flags & IB_MGMT_RMPP_FLAG_LAST)
+				size = len;
+			else
+				size = sizeof(*rmpp_mad) - offset;
+			seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) +
+				      sizeof(struct ib_rmpp_mad) - offset,
+				      GFP_KERNEL);
+			if (!seg)
+				return -ENOMEM;
+			memcpy(seg->data, data, size);
+			list_add_tail(&seg->list, &packet->seg_list);
+		}
+		len -= size;
+	}
+	return 0;
+}
+
+static struct ib_umad_packet *alloc_packet(void)
+{
+	struct ib_umad_packet *packet;
+	int length = sizeof *packet + sizeof(struct ib_mad);
+
+	packet = kzalloc(length, GFP_KERNEL);
+	if (!packet) {
+		printk(KERN_ERR "alloc_packet: mem alloc failed for length %d\n",
+		       length);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&packet->seg_list);
+	return packet;
+}
+
+static void free_packet(struct ib_umad_packet *packet)
+{
+	struct ib_mad_multipacket_seg *seg, *tmp;
+
+	list_for_each_entry_safe(seg, tmp, &packet->seg_list, list) {
+		list_del(&seg->list);
+		kfree(seg);
+	}
+	kfree(packet);
+}
+
 static void send_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_send_wc *send_wc)
 {
@@ -187,7 +269,7 @@ static void send_handler(struct ib_mad_a
 	ib_free_send_mad(packet->msg);
 
 	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
-		timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+		timeout = alloc_packet();
 		if (!timeout)
 			goto out;
 
@@ -198,40 +280,14 @@ static void send_handler(struct ib_mad_a
 		       sizeof (struct ib_mad_hdr));
 
 		if (!queue_packet(file, agent, timeout))
-				return;
+			return;
+		else
+			free_packet(timeout);
 	}
 out:
 	kfree(packet);
 }
 
-static struct ib_umad_packet *alloc_packet(int buf_size)
-{
-	struct ib_umad_packet *packet;
-	int length = sizeof *packet + buf_size;
-
-	if (length >= PAGE_SIZE)
-		packet = (void *)__get_free_pages(GFP_KERNEL, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		packet = kmalloc(length, GFP_KERNEL);
-
-	if (!packet)
-		return NULL;
-
-	memset(packet, 0, length);
-	return packet;
-}
-
-static void free_packet(struct ib_umad_packet *packet)
-{
-	int length = packet->length + sizeof *packet;
-	if (length >= PAGE_SIZE)
-		free_pages((unsigned long) packet, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-	else
-		kfree(packet);
-}
-
-
-
 static void recv_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_recv_wc *mad_recv_wc)
 {
@@ -243,13 +299,16 @@ static void recv_handler(struct ib_mad_a
 		goto out;
 
 	length = mad_recv_wc->mad_len;
-	packet = alloc_packet(length);
+	packet = alloc_packet();
 	if (!packet)
 		goto out;
 
 	packet->length = length;
 
-	ib_coalesce_recv_mad(mad_recv_wc, packet->mad.data);
+	if (copy_recv_mad(mad_recv_wc, packet)) {
+		free_packet(packet);
+		goto out;
+	}
 
 	packet->mad.hdr.status    = 0;
 	packet->mad.hdr.length    = length + sizeof (struct ib_user_mad);
@@ -278,6 +337,7 @@ static ssize_t ib_umad_read(struct file 
 			    size_t count, loff_t *pos)
 {
 	struct ib_umad_file *file = filp->private_data;
+	struct ib_mad_multipacket_seg *seg;
 	struct ib_umad_packet *packet;
 	ssize_t ret;
 
@@ -304,18 +364,42 @@ static ssize_t ib_umad_read(struct file 
 
 	spin_unlock_irq(&file->recv_lock);
 
-	if (count < packet->length + sizeof (struct ib_user_mad)) {
-		/* Return length needed (and first RMPP segment) if too small */
-		if (copy_to_user(buf, &packet->mad,
-				 sizeof (struct ib_user_mad) + sizeof (struct ib_mad)))
-			ret = -EFAULT;
-		else
-			ret = -ENOSPC;
-	} else if (copy_to_user(buf, &packet->mad,
-				packet->length + sizeof (struct ib_user_mad)))
+	if (copy_to_user(buf, &packet->mad,
+			 sizeof(struct ib_user_mad) + sizeof(struct ib_mad))) {
 		ret = -EFAULT;
-	else
+		goto err;
+	}
+
+	if (count < packet->length + sizeof (struct ib_user_mad))
+		/* User buffer too small. Return first RMPP segment (which
+		 * includes RMPP message length).
+		 */
+		ret = -ENOSPC;
+	else if (packet->length <= sizeof(struct ib_mad))
+		ret = packet->length + sizeof(struct ib_user_mad);
+	else {
+		int len = packet->length - sizeof(struct ib_mad);
+		struct ib_rmpp_mad *rmpp_mad =
+				(struct ib_rmpp_mad *) packet->mad.data;
+		int max_seg_payload = sizeof(struct ib_mad) -
+				      data_offset(rmpp_mad->mad_hdr.mgmt_class);
+		int seg_payload;
+		/* multipacket RMPP MAD message. Copy remainder of message.
+		 * Note that last segment may have a shorter payload.
+		 */
+		buf += sizeof(struct ib_user_mad) + sizeof(struct ib_mad);
+		list_for_each_entry(seg, &packet->seg_list, list) {
+			seg_payload = min_t(int, len, max_seg_payload);
+			if (copy_to_user(buf, seg->data, seg_payload)) {
+				ret = -EFAULT;
+				goto err;
+			}
+			buf += seg_payload;
+			len -= seg_payload;
+		}
 		ret = packet->length + sizeof (struct ib_user_mad);
+	}
+err:
 	if (ret < 0) {
 		/* Requeue packet */
 		spin_lock_irq(&file->recv_lock);
@@ -339,6 +423,8 @@ static ssize_t ib_umad_write(struct file
 	__be64 *tid;
 	int ret, length, hdr_len, copy_offset;
 	int rmpp_active, has_rmpp_header;
+	int max_seg_payload;
+	struct ib_mad_multipacket_seg *seg;
 
 	if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
 		return -EINVAL;
@@ -415,6 +501,11 @@ static ssize_t ib_umad_write(struct file
 		goto err_ah;
 	}
 
+	if (!rmpp_active && length > sizeof(struct ib_mad)) {
+		ret = -EINVAL;
+		goto err_ah;
+	}
+
 	packet->msg = ib_create_send_mad(agent,
 					 be32_to_cpu(packet->mad.hdr.qpn),
 					 0, rmpp_active,
@@ -432,12 +523,39 @@ static ssize_t ib_umad_write(struct file
 
 	/* Copy MAD headers (RMPP header in place) */
 	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
-	/* Now, copy rest of message from user into send buffer */
+	/* complete copying first 256 bytes of message into send buffer */
 	if (copy_from_user(packet->msg->mad + copy_offset,
 			   buf + sizeof (struct ib_user_mad) + copy_offset,
-			   length - copy_offset)) {
+			   min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
 		ret = -EFAULT;
-		goto err_msg;
+		goto err_ah;
+	}
+
+	/* if multipacket, copy remainder of send message from user to multipacket list */
+	length -= sizeof(struct ib_mad);
+	buf +=  sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+	max_seg_payload = sizeof(struct ib_mad) -
+			  data_offset(rmpp_mad->mad_hdr.mgmt_class);
+	while (length > 0) {
+		int seg_payload = min_t(int, length, max_seg_payload);
+		seg = kzalloc(sizeof(struct ib_mad_multipacket_seg) +
+			      max_seg_payload, GFP_KERNEL);
+		if (!seg) {
+			printk(KERN_ERR "ib_umad_write: "
+			       "mem alloc failed for length %zu\n",
+			       sizeof(struct ib_mad_multipacket_seg) +
+			       max_seg_payload);
+			ret = -ENOMEM;
+			goto err_msg;
+		}
+
+		if (copy_from_user(seg->data, buf, seg_payload)) {
+			ret = -EFAULT;
+			goto err_msg;
+		}
+		ib_append_to_multipacket_mad(packet->msg, seg);
+		buf += seg_payload;
+		length -= seg_payload;
 	}
 
 	/*
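
Note that on the read() side the contract is unchanged: a user buffer
that is too small gets the first 256-byte segment plus -ENOSPC, and the
packet is requeued, so userspace can pull the full length out of the
RMPP header and retry. A rough userspace sketch, where
mad_len_from_first_seg() is a hypothetical length-parsing helper, not
anything this patch provides:

	#include <errno.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* hypothetical: derive total umad length from the first segment */
	extern size_t mad_len_from_first_seg(const char *buf);

	static ssize_t read_large_mad(int fd, char **bufp, size_t size)
	{
		char *buf = malloc(size), *tmp;
		ssize_t n;

		if (!buf)
			return -1;
		while ((n = read(fd, buf, size)) < 0 && errno == ENOSPC) {
			size = mad_len_from_first_seg(buf);
			tmp = realloc(buf, size);
			if (!tmp) {
				free(buf);
				return -1;
			}
			buf = tmp;
		}
		*bufp = buf;
		return n;
	}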

-- 
MST
