[openib-general] [PATCH] core: segmented rmpp sends
Michael S. Tsirkin
mst at mellanox.co.il
Tue Dec 6 10:48:14 PST 2005
With the following in place, we are able to perform very large RMPP
transfers. Please comment.
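
For reference, here is the read-side contract that user_mad.c keeps
(see its hunks below): when the user buffer is too small, the kernel
copies the ib_user_mad header plus the first 256-byte segment (which
carries the RMPP payload length), sets hdr.length to the full size,
requeues the packet, and fails with ENOSPC. A minimal userspace sketch
of consuming that follows; the include path and helper name are
illustrative, not part of this patch.

#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <infiniband/user_mad.h>  /* assumed home of struct ib_user_mad */

/* Read one MAD of arbitrary size from an already-open umad fd. */
static struct ib_user_mad *read_whole_mad(int fd, ssize_t *len)
{
        size_t size = sizeof(struct ib_user_mad) + 256;
        struct ib_user_mad *umad = malloc(size);
        struct ib_user_mad *bigger;
        ssize_t n;

        if (!umad)
                return NULL;
        n = read(fd, umad, size);
        if (n < 0 && errno == ENOSPC) {
                /* hdr.length = header size + full RMPP message length */
                size = umad->hdr.length;
                bigger = realloc(umad, size);
                if (!bigger) {
                        free(umad);
                        return NULL;
                }
                umad = bigger;
                n = read(fd, umad, size);  /* packet was requeued */
        }
        if (n < 0) {
                free(umad);
                return NULL;
        }
        *len = n;
        return umad;
}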
---
Modify the RMPP MAD support to accept a linked list of segments
instead of a large, physically contiguous buffer.
The list is kept in the mad_send_wr private data and is constructed
with the new ib_append_to_multipacket_mad() API call.
Modify user_mad.c to allocate large MADs for send/receive in chunks.
Signed-off-by: Jack Morgenstein <jackm at mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
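
To illustrate the new in-kernel API: segment 1 (the first 256 bytes)
stays in the ib_mad_send_buf itself, each further segment is allocated
separately, filled, and queued with ib_append_to_multipacket_mad(), and
ib_free_send_mad() frees the queued segments together with the buffer.
A minimal sketch follows; build_rmpp_send() and its parameters are
invented for the example, while the MAD calls are the ones added or
changed below.

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <rdma/ib_mad.h>

static struct ib_mad_send_buf *build_rmpp_send(struct ib_mad_agent *agent,
                                               u32 remote_qpn, int hdr_len,
                                               int data_len, void *src)
{
        struct ib_mad_send_buf *msg;
        struct ib_mad_multipacket_seg *seg;
        int seg_payload = sizeof(struct ib_mad) - hdr_len;
        int left;

        msg = ib_create_send_mad(agent, remote_qpn, 0, 1 /* rmpp_active */,
                                 hdr_len, data_len, GFP_KERNEL);
        if (IS_ERR(msg))
                return msg;

        /* First 256 bytes (headers plus start of payload) go in msg->mad. */
        memcpy(msg->mad, src, sizeof(struct ib_mad));
        src += sizeof(struct ib_mad);

        /* Assumes a multipacket message: hdr_len + data_len > 256. */
        for (left = hdr_len + data_len - 256; left > 0;
             left -= seg_payload) {
                seg = kzalloc(sizeof *seg + seg_payload, GFP_KERNEL);
                if (!seg) {
                        ib_free_send_mad(msg); /* also frees queued segs */
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(seg->data, src, min_t(int, left, seg_payload));
                ib_append_to_multipacket_mad(msg, seg);
                src += seg_payload;
        }
        return msg;
}

The caller then posts the result with ib_post_send_mad() as before.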
Index: latest/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad_rmpp.c
+++ latest/drivers/infiniband/core/mad_rmpp.c
@@ -433,44 +433,6 @@ static struct ib_mad_recv_wc * complete_
return rmpp_wc;
}
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf)
-{
- struct ib_mad_recv_buf *seg_buf;
- struct ib_rmpp_mad *rmpp_mad;
- void *data;
- int size, len, offset;
- u8 flags;
-
- len = mad_recv_wc->mad_len;
- if (len <= sizeof(struct ib_mad)) {
- memcpy(buf, mad_recv_wc->recv_buf.mad, len);
- return;
- }
-
- offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
-
- list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
- rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
- flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
-
- if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
- data = rmpp_mad;
- size = sizeof(*rmpp_mad);
- } else {
- data = (void *) rmpp_mad + offset;
- if (flags & IB_MGMT_RMPP_FLAG_LAST)
- size = len;
- else
- size = sizeof(*rmpp_mad) - offset;
- }
-
- memcpy(buf, data, size);
- len -= size;
- buf += size;
- }
-}
-EXPORT_SYMBOL(ib_coalesce_recv_mad);
-
static struct ib_mad_recv_wc *
continue_rmpp(struct ib_mad_agent_private *agent,
struct ib_mad_recv_wc *mad_recv_wc)
@@ -570,16 +532,26 @@ start_rmpp(struct ib_mad_agent_private *
return mad_recv_wc;
}
-static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
+static inline void *get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
{
- return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset +
- (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) *
- (mad_send_wr->seg_num - 1);
+ struct ib_mad_multipacket_seg *seg;
+ int i = 2; /* segment 1 lives in the send buffer itself */
+
+ if (list_empty(&mad_send_wr->multipacket_list))
+ return NULL;
+
+ list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) {
+ if (i == mad_send_wr->seg_num)
+ return seg->data;
+ i++;
+ }
+ return NULL;
}
-static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_rmpp_mad *rmpp_mad;
+ void *next_data;
int timeout;
u32 paylen;
@@ -594,12 +566,14 @@ static int send_next_seg(struct ib_mad_s
rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
} else {
- mad_send_wr->send_wr.num_sge = 2;
- mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
- mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
- mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
- mad_send_wr->data_offset;
- mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+ next_data = get_seg_addr(mad_send_wr);
+ if (!next_data) {
+ printk(KERN_ERR PFX "send_next_seg: "
+ "could not find next segment\n");
+ return -EINVAL;
+ }
+ memcpy((void *)rmpp_mad + mad_send_wr->data_offset, next_data,
+ sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
rmpp_mad->rmpp_hdr.paylen_newwin = 0;
}
Index: latest/drivers/infiniband/include/rdma/ib_mad.h
===================================================================
--- latest.orig/drivers/infiniband/include/rdma/ib_mad.h
+++ latest/drivers/infiniband/include/rdma/ib_mad.h
@@ -141,6 +141,11 @@ struct ib_rmpp_hdr {
__be32 paylen_newwin;
};
+struct ib_mad_multipacket_seg {
+ struct list_head list;
+ u8 data[0];
+};
+
typedef u64 __bitwise ib_sa_comp_mask;
#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
@@ -485,17 +490,6 @@ int ib_unregister_mad_agent(struct ib_ma
int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
struct ib_mad_send_buf **bad_send_buf);
-/**
- * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer.
- * @mad_recv_wc: Work completion information for a received MAD.
- * @buf: User-provided data buffer to receive the coalesced buffers. The
- * referenced buffer should be at least the size of the mad_len specified
- * by @mad_recv_wc.
- *
- * This call copies a chain of received MAD segments into a single data buffer,
- * removing duplicated headers.
- */
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf);
/**
* ib_free_recv_mad - Returns data buffers used to receive a MAD.
@@ -601,6 +595,18 @@ struct ib_mad_send_buf * ib_create_send_
gfp_t gfp_mask);
/**
+ * ib_append_to_multipacket_mad - Append a segment of an RMPP multipacket MAD send
+ * to the send buffer.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg: segment to append to linked list (already filled with data).
+ *
+ * This routine appends a segment of a multipacket RMPP message
+ * (copied from user space) to a MAD for sending.
+ */
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_multipacket_seg *seg);
+
+/**
* ib_free_send_mad - Returns data buffers used to send a MAD.
* @send_buf: Previously allocated send data buffer.
*/
Index: latest/drivers/infiniband/core/mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad.c
+++ latest/drivers/infiniband/core/mad.c
@@ -792,17 +792,13 @@ struct ib_mad_send_buf * ib_create_send_
return ERR_PTR(-EINVAL);
length = sizeof *mad_send_wr + buf_size;
- if (length >= PAGE_SIZE)
- buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- buf = kmalloc(length, gfp_mask);
+ buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
if (!buf)
return ERR_PTR(-ENOMEM);
- memset(buf, 0, length);
-
- mad_send_wr = buf + buf_size;
+ mad_send_wr = buf + sizeof(struct ib_mad);
+ INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
mad_send_wr->send_buf.mad = buf;
mad_send_wr->mad_agent_priv = mad_agent_priv;
@@ -834,23 +830,33 @@ struct ib_mad_send_buf * ib_create_send_
}
EXPORT_SYMBOL(ib_create_send_mad);
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_multipacket_seg *seg)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ list_add_tail(&seg->list, &mad_send_wr->multipacket_list);
+}
+EXPORT_SYMBOL(ib_append_to_multipacket_mad);
+
void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
{
struct ib_mad_agent_private *mad_agent_priv;
- void *mad_send_wr;
- int length;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_multipacket_seg *seg, *tmp;
mad_agent_priv = container_of(send_buf->mad_agent,
struct ib_mad_agent_private, agent);
mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
send_buf);
- length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
- if (length >= PAGE_SIZE)
- free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- kfree(send_buf->mad);
-
+ list_for_each_entry_safe(seg, tmp, &mad_send_wr->multipacket_list, list) {
+ list_del(&seg->list);
+ kfree(seg);
+ }
+ kfree(send_buf->mad);
if (atomic_dec_and_test(&mad_agent_priv->refcount))
wake_up(&mad_agent_priv->wait);
}
Index: latest/drivers/infiniband/core/mad_priv.h
===================================================================
--- latest.orig/drivers/infiniband/core/mad_priv.h
+++ latest/drivers/infiniband/core/mad_priv.h
@@ -130,6 +130,7 @@ struct ib_mad_send_wr_private {
enum ib_wc_status status;
/* RMPP control */
+ struct list_head multipacket_list;
int last_ack;
int seg_num;
int newwin;
Index: latest/drivers/infiniband/core/user_mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/user_mad.c
+++ latest/drivers/infiniband/core/user_mad.c
@@ -123,6 +123,7 @@ struct ib_umad_packet {
struct ib_mad_send_buf *msg;
struct list_head list;
int length;
+ struct list_head seg_list;
struct ib_user_mad mad;
};
@@ -176,6 +177,87 @@ static int queue_packet(struct ib_umad_f
return ret;
}
+static int data_offset(u8 mgmt_class)
+{
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ return IB_MGMT_SA_HDR;
+ else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return IB_MGMT_VENDOR_HDR;
+ else
+ return IB_MGMT_RMPP_HDR;
+}
+
+static int copy_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_umad_packet *packet)
+{
+ struct ib_mad_recv_buf *seg_buf;
+ struct ib_rmpp_mad *rmpp_mad;
+ void *data;
+ struct ib_mad_multipacket_seg *seg;
+ int size, len, offset;
+ u8 flags;
+
+ len = mad_recv_wc->mad_len;
+ if (len <= sizeof(struct ib_mad)) {
+ memcpy(packet->mad.data, mad_recv_wc->recv_buf.mad, len);
+ return 0;
+ }
+
+ offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+
+ list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
+ rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
+ flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
+
+ if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
+ size = sizeof(*rmpp_mad);
+ memcpy(packet->mad.data, rmpp_mad, size);
+ } else {
+ data = (void *) rmpp_mad + offset;
+ if (flags & IB_MGMT_RMPP_FLAG_LAST)
+ size = len;
+ else
+ size = sizeof(*rmpp_mad) - offset;
+ seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) +
+ sizeof(struct ib_rmpp_mad) - offset,
+ GFP_KERNEL);
+ if (!seg)
+ return -ENOMEM;
+ memcpy(seg->data, data, size);
+ list_add_tail(&seg->list, &packet->seg_list);
+ }
+ len -= size;
+ }
+ return 0;
+}
+
+static struct ib_umad_packet *alloc_packet(void)
+{
+ struct ib_umad_packet *packet;
+ int length = sizeof *packet + sizeof(struct ib_mad);
+
+ packet = kzalloc(length, GFP_KERNEL);
+ if (!packet) {
+ printk(KERN_ERR "alloc_packet: mem alloc failed for length %d\n",
+ length);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&packet->seg_list);
+ return packet;
+}
+
+static void free_packet(struct ib_umad_packet *packet)
+{
+ struct ib_mad_multipacket_seg *seg, *tmp;
+
+ list_for_each_entry_safe(seg, tmp, &packet->seg_list, list) {
+ list_del(&seg->list);
+ kfree(seg);
+ }
+ kfree(packet);
+}
+
static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *send_wc)
{
@@ -187,7 +269,7 @@ static void send_handler(struct ib_mad_a
ib_free_send_mad(packet->msg);
if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
- timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+ timeout = alloc_packet();
if (!timeout)
goto out;
@@ -198,40 +280,14 @@ static void send_handler(struct ib_mad_a
sizeof (struct ib_mad_hdr));
if (!queue_packet(file, agent, timeout))
- return;
+ return;
+ else
+ free_packet(timeout);
}
out:
kfree(packet);
}
-static struct ib_umad_packet *alloc_packet(int buf_size)
-{
- struct ib_umad_packet *packet;
- int length = sizeof *packet + buf_size;
-
- if (length >= PAGE_SIZE)
- packet = (void *)__get_free_pages(GFP_KERNEL, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- packet = kmalloc(length, GFP_KERNEL);
-
- if (!packet)
- return NULL;
-
- memset(packet, 0, length);
- return packet;
-}
-
-static void free_packet(struct ib_umad_packet *packet)
-{
- int length = packet->length + sizeof *packet;
- if (length >= PAGE_SIZE)
- free_pages((unsigned long) packet, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- kfree(packet);
-}
-
-
-
static void recv_handler(struct ib_mad_agent *agent,
struct ib_mad_recv_wc *mad_recv_wc)
{
@@ -243,13 +299,16 @@ static void recv_handler(struct ib_mad_a
goto out;
length = mad_recv_wc->mad_len;
- packet = alloc_packet(length);
+ packet = alloc_packet();
if (!packet)
goto out;
packet->length = length;
- ib_coalesce_recv_mad(mad_recv_wc, packet->mad.data);
+ if (copy_recv_mad(mad_recv_wc, packet)) {
+ free_packet(packet);
+ goto out;
+ }
packet->mad.hdr.status = 0;
packet->mad.hdr.length = length + sizeof (struct ib_user_mad);
@@ -278,6 +337,7 @@ static ssize_t ib_umad_read(struct file
size_t count, loff_t *pos)
{
struct ib_umad_file *file = filp->private_data;
+ struct ib_mad_multipacket_seg *seg;
struct ib_umad_packet *packet;
ssize_t ret;
@@ -304,18 +364,42 @@ static ssize_t ib_umad_read(struct file
spin_unlock_irq(&file->recv_lock);
- if (count < packet->length + sizeof (struct ib_user_mad)) {
- /* Return length needed (and first RMPP segment) if too small */
- if (copy_to_user(buf, &packet->mad,
- sizeof (struct ib_user_mad) + sizeof (struct ib_mad)))
- ret = -EFAULT;
- else
- ret = -ENOSPC;
- } else if (copy_to_user(buf, &packet->mad,
- packet->length + sizeof (struct ib_user_mad)))
+ if (copy_to_user(buf, &packet->mad,
+ sizeof(struct ib_user_mad) + sizeof(struct ib_mad))) {
ret = -EFAULT;
- else
+ goto err;
+ }
+
+ if (count < packet->length + sizeof (struct ib_user_mad))
+ /* User buffer too small. Return first RMPP segment (which
+ * includes RMPP message length).
+ */
+ ret = -ENOSPC;
+ else if (packet->length <= sizeof(struct ib_mad))
+ ret = packet->length + sizeof(struct ib_user_mad);
+ else {
+ int len = packet->length - sizeof(struct ib_mad);
+ struct ib_rmpp_mad *rmpp_mad =
+ (struct ib_rmpp_mad *) packet->mad.data;
+ int max_seg_payload = sizeof(struct ib_mad) -
+ data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ int seg_payload;
+ /* multipacket RMPP MAD message. Copy remainder of message.
+ * Note that last segment may have a shorter payload.
+ */
+ buf += sizeof(struct ib_user_mad) + sizeof(struct ib_mad);
+ list_for_each_entry(seg, &packet->seg_list, list) {
+ seg_payload = min_t(int, len, max_seg_payload);
+ if (copy_to_user(buf, seg->data, seg_payload)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ buf += seg_payload;
+ len -= seg_payload;
+ }
ret = packet->length + sizeof (struct ib_user_mad);
+ }
+err:
if (ret < 0) {
/* Requeue packet */
spin_lock_irq(&file->recv_lock);
@@ -339,6 +423,8 @@ static ssize_t ib_umad_write(struct file
__be64 *tid;
int ret, length, hdr_len, copy_offset;
int rmpp_active, has_rmpp_header;
+ int max_seg_payload;
+ struct ib_mad_multipacket_seg *seg;
if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
return -EINVAL;
@@ -415,6 +501,11 @@ static ssize_t ib_umad_write(struct file
goto err_ah;
}
+ if (!rmpp_active && length > sizeof(struct ib_mad)) {
+ ret = -EINVAL;
+ goto err_ah;
+ }
+
packet->msg = ib_create_send_mad(agent,
be32_to_cpu(packet->mad.hdr.qpn),
0, rmpp_active,
@@ -432,12 +523,39 @@ static ssize_t ib_umad_write(struct file
/* Copy MAD headers (RMPP header in place) */
memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
- /* Now, copy rest of message from user into send buffer */
+ /* complete copying first 256 bytes of message into send buffer */
if (copy_from_user(packet->msg->mad + copy_offset,
buf + sizeof (struct ib_user_mad) + copy_offset,
- length - copy_offset)) {
+ min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
ret = -EFAULT;
- goto err_msg;
+ goto err_ah;
+ }
+
+ /* if multipacket, copy remainder of send message from user to multipacket list */
+ length -= sizeof(struct ib_mad);
+ buf += sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+ max_seg_payload = sizeof(struct ib_mad) -
+ data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ while (length > 0) {
+ int seg_payload = min_t(int, length, max_seg_payload);
+ seg = kzalloc(sizeof(struct ib_mad_multipacket_seg) +
+ max_seg_payload, GFP_KERNEL);
+ if (!seg) {
+ printk(KERN_ERR "ib_umad_write: "
+ "mem alloc failed for length %zu\n",
+ sizeof(struct ib_mad_multipacket_seg) +
+ max_seg_payload);
+ ret = -ENOMEM;
+ goto err_msg;
+ }
+
+ if (copy_from_user(seg->data, buf, seg_payload)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ ib_append_to_multipacket_mad(packet->msg, seg);
+ buf += seg_payload;
+ length -= seg_payload;
}
/*
--
MST