[openib-general] [PATCH 3 of 3] mad: large RMPP support, Round 2
Jack Morgenstein
jackm at mellanox.co.il
Sun Feb 12 07:30:36 PST 2006
Implement large RMPP support:
Send side: split a multipacket MAD buffer into a list of segments
(multipacket_list) and send these using a gather list of size 2.
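(For readers less familiar with gather lists, here is a minimal
standalone sketch of the idea -- stand-in types, not the kernel's
struct ib_sge or the verbs API: entry 0 covers the MAD/RMPP header
kept in the original send buffer, entry 1 covers the payload of
whichever segment is being sent, so the packet is assembled without
copying segments back into one flat buffer.)

#include <stdio.h>
#include <string.h>

struct sge {			/* stand-in for struct ib_sge */
	const void *addr;
	size_t length;
};

/* "Send" by gathering the entries into one contiguous wire buffer. */
static size_t gather(const struct sge *sg, int num_sge, char *wire)
{
	size_t off = 0;
	int i;

	for (i = 0; i < num_sge; ++i) {
		memcpy(wire + off, sg[i].addr, sg[i].length);
		off += sg[i].length;
	}
	return off;
}

int main(void)
{
	char hdr[8] = "HDR";		/* header part, zero padded */
	char seg[8] = "PAYLOAD";	/* current segment's data */
	struct sge sg[2] = {
		{ hdr, sizeof(hdr) },	/* sg_list[0]: header */
		{ seg, sizeof(seg) },	/* sg_list[1]: segment payload */
	};
	char wire[16];

	printf("packet is %zu bytes\n", gather(sg, 2, wire));
	return 0;
}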
Also, save a pointer to the last sent segment, and retrieve requested
segments by walking the list starting at that segment.
Finally, save a pointer to the last-acked segment. When retrying, retrieve
segments for resending relative to this pointer. When updating the last
ack, start at this pointer.
The list scan to get the next segment is thus reduced from O(N^2) to O(N).
In the normal flow, the segment list is scanned only twice (once to
retrieve the next segment to send, once to update the last-ack pointer).
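(The effect of the cached pointers can be seen in this standalone
sketch -- hypothetical names, singly linked for brevity; the patch
itself keeps two cursors, seg_num_seg and last_ack_seg, on the
kernel's circular list_head list and can also walk backward from the
cursor. Successive in-order lookups resume from the cursor instead of
rescanning from the head, which is what makes the overall send loop
O(N) rather than O(N^2).)

#include <stdio.h>
#include <stdlib.h>

struct seg {
	int num;		/* RMPP segment number, 2..N */
	struct seg *next;	/* NULL-terminated forward list */
};

struct sender {
	struct seg *head;	/* first allocated segment (num == 2) */
	struct seg *cursor;	/* last segment found, or NULL */
};

/* Find segment seg_num, resuming from the cursor when possible. */
static struct seg *get_seg(struct sender *s, int seg_num)
{
	struct seg *p = s->cursor;

	if (!p || p->num > seg_num)	/* restart only when forced to */
		p = s->head;
	for (; p; p = p->next) {
		if (p->num == seg_num) {
			s->cursor = p;	/* O(1) amortized when in order */
			return p;
		}
	}
	return NULL;
}

int main(void)
{
	struct sender s = { NULL, NULL };
	struct seg **tail = &s.head;
	int i;

	for (i = 2; i <= 5; ++i) {	/* build segments 2..5 */
		struct seg *p = malloc(sizeof(*p));
		p->num = i;
		p->next = NULL;
		*tail = p;
		tail = &p->next;
	}
	for (i = 2; i <= 5; ++i)	/* in-order: one pass over the list */
		printf("seg %d %s\n", i, get_seg(&s, i) ? "found" : "missing");
	return 0;
}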
Signed-off-by: Jack Morgenstein <jackm at mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>
Index: src/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- src.orig/drivers/infiniband/core/mad_rmpp.c 2006-02-12 16:30:44.624175000 +0200
+++ src/drivers/infiniband/core/mad_rmpp.c 2006-02-12 16:30:53.114901000 +0200
@@ -535,6 +535,7 @@ start_rmpp(struct ib_mad_agent_private *
static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_rmpp_mad *rmpp_mad;
+ struct ib_mad_multipacket_seg *seg;
int timeout;
u32 paylen;
@@ -547,14 +548,16 @@ static int send_next_seg(struct ib_mad_s
paylen = mad_send_wr->total_seg * IB_MGMT_RMPP_DATA -
mad_send_wr->pad;
rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
- mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
} else {
- mad_send_wr->send_wr.num_sge = 2;
- mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
- mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
- mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
- mad_send_wr->data_offset;
- mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+ seg = ib_rmpp_get_multipacket_seg(mad_send_wr,
+ mad_send_wr->seg_num);
+ if (!seg) {
+ printk(KERN_ERR PFX "send_next_seg: "
+ "could not find segment %d\n",
+ mad_send_wr->seg_num);
+ return -EINVAL;
+ }
+ mad_send_wr->mad_payload = seg->data;
rmpp_mad->rmpp_hdr.paylen_newwin = 0;
}
@@ -600,6 +603,28 @@ out:
spin_unlock_irqrestore(&agent->lock, flags);
}
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr)
+{
+ struct ib_mad_multipacket_seg *seg;
+
+ if (wr->last_ack < 2)
+ return;
+ else if (!wr->last_ack_seg)
+ list_for_each_entry(seg, &wr->multipacket_list, list) {
+ if (wr->last_ack == seg->num) {
+ wr->last_ack_seg = seg;
+ break;
+ }
+ }
+ else
+ list_for_each_entry(seg, &wr->last_ack_seg->list, list) {
+ if (wr->last_ack == seg->num) {
+ wr->last_ack_seg = seg;
+ break;
+ }
+ }
+}
+
static void process_rmpp_ack(struct ib_mad_agent_private *agent,
struct ib_mad_recv_wc *mad_recv_wc)
{
@@ -647,6 +672,7 @@ static void process_rmpp_ack(struct ib_m
if (seg_num > mad_send_wr->last_ack) {
mad_send_wr->last_ack = seg_num;
+ adjust_last_ack(mad_send_wr);
mad_send_wr->retries = mad_send_wr->send_buf.retries;
}
mad_send_wr->newwin = newwin;
@@ -793,7 +819,7 @@ out:
int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
{
struct ib_rmpp_mad *rmpp_mad;
- int i, total_len, ret;
+ int ret;
rmpp_mad = mad_send_wr->send_buf.mad;
if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
@@ -803,20 +829,16 @@ int ib_send_rmpp_mad(struct ib_mad_send_
if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
return IB_RMPP_RESULT_INTERNAL;
- if (mad_send_wr->send_wr.num_sge > 1)
- return -EINVAL; /* TODO: support num_sge > 1 */
+ if (mad_send_wr->send_wr.num_sge != 2)
+ return -EINVAL;
mad_send_wr->seg_num = 1;
mad_send_wr->newwin = 1;
mad_send_wr->data_offset = data_offset(rmpp_mad->mad_hdr.mgmt_class);
- total_len = 0;
- for (i = 0; i < mad_send_wr->send_wr.num_sge; i++)
- total_len += mad_send_wr->send_wr.sg_list[i].length;
-
- mad_send_wr->total_seg = (total_len - mad_send_wr->data_offset) /
+ mad_send_wr->total_seg = (mad_send_wr->total_length - mad_send_wr->data_offset) /
(sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
- mad_send_wr->pad = total_len - IB_MGMT_RMPP_HDR -
+ mad_send_wr->pad = mad_send_wr->total_length - IB_MGMT_RMPP_HDR -
be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
/* We need to wait for the final ACK even if there isn't a response */
@@ -880,6 +902,8 @@ int ib_retry_rmpp(struct ib_mad_send_wr_
return IB_RMPP_RESULT_PROCESSED;
mad_send_wr->seg_num = mad_send_wr->last_ack + 1;
+ mad_send_wr->seg_num_seg = mad_send_wr->last_ack_seg;
+
ret = send_next_seg(mad_send_wr);
if (ret)
return IB_RMPP_RESULT_PROCESSED;
Index: src/drivers/infiniband/core/mad.c
===================================================================
--- src.orig/drivers/infiniband/core/mad.c 2006-02-12 16:30:29.940545000 +0200
+++ src/drivers/infiniband/core/mad.c 2006-02-12 16:30:53.131904000 +0200
@@ -779,6 +779,54 @@ static int get_buf_length(int hdr_len, i
return hdr_len + data_len + pad;
}
+static void free_send_multipacket_list(struct ib_mad_send_wr_private *
+ mad_send_wr)
+{
+ struct ib_mad_multipacket_seg *s, *t;
+
+ list_for_each_entry_safe(s, t, &mad_send_wr->multipacket_list, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static inline int alloc_send_rmpp_segs(struct ib_mad_send_wr_private *send_wr,
+ int message_size, int hdr_len,
+ int data_len, u8 rmpp_version,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_multipacket_seg *seg;
+ struct ib_rmpp_mad *rmpp_mad = send_wr->send_buf.mad;
+ int seg_size, i = 2;
+
+ rmpp_mad->rmpp_hdr.paylen_newwin =
+ cpu_to_be32(hdr_len - IB_MGMT_RMPP_HDR + data_len);
+ rmpp_mad->rmpp_hdr.rmpp_version = rmpp_version;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ send_wr->total_length = message_size;
+ /* allocate RMPP buffers */
+ message_size -= sizeof(struct ib_mad);
+ seg_size = sizeof(struct ib_mad) - hdr_len;
+ while (message_size > 0) {
+ seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) + seg_size,
+ gfp_mask);
+ if (!seg) {
+ printk(KERN_ERR "ib_create_send_mad: RMPP mem "
+ "alloc failed for len %zd, gfp %#x\n",
+ sizeof(struct ib_mad_multipacket_seg) + seg_size,
+ gfp_mask);
+ free_send_multipacket_list(send_wr);
+ return -ENOMEM;
+ }
+ seg->size = seg_size;
+ seg->num = i++;
+ list_add_tail(&seg->list, &send_wr->multipacket_list);
+ message_size -= seg_size;
+ }
+ return 0;
+}
+
struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
u32 remote_qpn, u16 pkey_index,
int rmpp_active,
@@ -787,53 +835,54 @@ struct ib_mad_send_buf * ib_create_send_
{
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
- int length, buf_size;
+ int length, message_size, ret;
void *buf;
mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
agent);
- buf_size = get_buf_length(hdr_len, data_len);
+ message_size = get_buf_length(hdr_len, data_len);
if ((!mad_agent->rmpp_version &&
- (rmpp_active || buf_size > sizeof(struct ib_mad))) ||
- (!rmpp_active && buf_size > sizeof(struct ib_mad)))
+ (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+ (!rmpp_active && message_size > sizeof(struct ib_mad)))
return ERR_PTR(-EINVAL);
- length = sizeof *mad_send_wr + buf_size;
- if (length >= PAGE_SIZE)
- buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- buf = kmalloc(length, gfp_mask);
+ length = sizeof *mad_send_wr + message_size;
+ buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
if (!buf)
return ERR_PTR(-ENOMEM);
- memset(buf, 0, length);
-
- mad_send_wr = buf + buf_size;
+ mad_send_wr = buf + sizeof(struct ib_mad);
+ INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->mad_payload = buf + hdr_len;
mad_send_wr->mad_agent_priv = mad_agent_priv;
- mad_send_wr->sg_list[0].length = buf_size;
+ mad_send_wr->sg_list[0].length = hdr_len;
mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+ mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
- mad_send_wr->send_wr.num_sge = 1;
+ mad_send_wr->send_wr.num_sge = 2;
mad_send_wr->send_wr.opcode = IB_WR_SEND;
mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+ mad_send_wr->last_ack_seg = NULL;
+ mad_send_wr->seg_num_seg = NULL;
if (rmpp_active) {
- struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
- rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(hdr_len -
- IB_MGMT_RMPP_HDR + data_len);
- rmpp_mad->rmpp_hdr.rmpp_version = mad_agent->rmpp_version;
- rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
- ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr,
- IB_MGMT_RMPP_FLAG_ACTIVE);
+ ret = alloc_send_rmpp_segs(mad_send_wr, message_size, hdr_len,
+ data_len, mad_agent->rmpp_version,
+ gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
}
mad_send_wr->send_buf.mad_agent = mad_agent;
@@ -842,23 +891,71 @@ struct ib_mad_send_buf * ib_create_send_
}
EXPORT_SYMBOL(ib_create_send_mad);
+struct ib_mad_multipacket_seg
+*ib_rmpp_get_multipacket_seg(struct ib_mad_send_wr_private *wr, int seg_num)
+{
+ struct ib_mad_multipacket_seg *seg;
+
+ if (seg_num == 2) {
+ wr->seg_num_seg =
+ container_of(wr->multipacket_list.next,
+ struct ib_mad_multipacket_seg, list);
+ return wr->seg_num_seg;
+ }
+
+ /* get first list entry if was not already done */
+ if (!wr->seg_num_seg)
+ wr->seg_num_seg =
+ container_of(wr->multipacket_list.next,
+ struct ib_mad_multipacket_seg, list);
+
+ if (wr->seg_num_seg->num == seg_num)
+ return wr->seg_num_seg;
+ else if (wr->seg_num_seg->num < seg_num) {
+ list_for_each_entry(seg, &wr->seg_num_seg->list, list) {
+ if (seg->num == seg_num) {
+ wr->seg_num_seg = seg;
+ return wr->seg_num_seg;
+ }
+ }
+ return NULL;
+ } else {
+ list_for_each_entry_reverse(seg, &wr->seg_num_seg->list, list) {
+ if (seg->num == seg_num) {
+ wr->seg_num_seg = seg;
+ return wr->seg_num_seg;
+ }
+ }
+ return NULL;
+ }
+ return NULL;
+}
+
+struct ib_mad_multipacket_seg
+*ib_mad_get_multipacket_seg(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *wr;
+
+ if (seg_num < 2)
+ return NULL;
+
+ wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf);
+ return ib_rmpp_get_multipacket_seg(wr, seg_num);
+}
+EXPORT_SYMBOL(ib_mad_get_multipacket_seg);
+
void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
{
struct ib_mad_agent_private *mad_agent_priv;
- void *mad_send_wr;
- int length;
+ struct ib_mad_send_wr_private *mad_send_wr;
mad_agent_priv = container_of(send_buf->mad_agent,
struct ib_mad_agent_private, agent);
mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
send_buf);
- length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
- if (length >= PAGE_SIZE)
- free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
- else
- kfree(send_buf->mad);
-
+ free_send_multipacket_list(mad_send_wr);
+ kfree(send_buf->mad);
if (atomic_dec_and_test(&mad_agent_priv->refcount))
wake_up(&mad_agent_priv->wait);
}
@@ -881,10 +978,17 @@ int ib_send_mad(struct ib_mad_send_wr_pr
mad_agent = mad_send_wr->send_buf.mad_agent;
sge = mad_send_wr->sg_list;
- sge->addr = dma_map_single(mad_agent->device->dma_device,
- mad_send_wr->send_buf.mad, sge->length,
- DMA_TO_DEVICE);
- pci_unmap_addr_set(mad_send_wr, mapping, sge->addr);
+ sge[0].addr = dma_map_single(mad_agent->device->dma_device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ pci_unmap_addr_set(mad_send_wr, header_mapping, sge[0].addr);
+
+ sge[1].addr = dma_map_single(mad_agent->device->dma_device,
+ mad_send_wr->mad_payload,
+ sge[1].length,
+ DMA_TO_DEVICE);
+ pci_unmap_addr_set(mad_send_wr, payload_mapping, sge[1].addr);
spin_lock_irqsave(&qp_info->send_queue.lock, flags);
if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
@@ -901,11 +1005,15 @@ int ib_send_mad(struct ib_mad_send_wr_pr
list_add_tail(&mad_send_wr->mad_list.list, list);
}
spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
- if (ret)
+ if (ret) {
dma_unmap_single(mad_agent->device->dma_device,
- pci_unmap_addr(mad_send_wr, mapping),
- sge->length, DMA_TO_DEVICE);
+ pci_unmap_addr(mad_send_wr, header_mapping),
+ sge[0].length, DMA_TO_DEVICE);
+ dma_unmap_single(mad_agent->device->dma_device,
+ pci_unmap_addr(mad_send_wr, payload_mapping),
+ sge[1].length, DMA_TO_DEVICE);
+ }
return ret;
}
@@ -1876,8 +1984,11 @@ static void ib_mad_send_done_handler(str
retry:
dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
- pci_unmap_addr(mad_send_wr, mapping),
+ pci_unmap_addr(mad_send_wr, header_mapping),
mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device,
+ pci_unmap_addr(mad_send_wr, payload_mapping),
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
queued_send_wr = NULL;
spin_lock_irqsave(&send_queue->lock, flags);
list_del(&mad_list->list);
Index: src/drivers/infiniband/core/user_mad.c
===================================================================
--- src.orig/drivers/infiniband/core/user_mad.c 2006-02-12 16:30:44.636158000 +0200
+++ src/drivers/infiniband/core/user_mad.c 2006-02-12 16:30:53.142901000 +0200
@@ -255,10 +255,11 @@ static void send_handler(struct ib_mad_a
ib_free_send_mad(packet->msg);
if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
- timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+ timeout = kzalloc(sizeof *timeout + sizeof(struct ib_mad),
+ GFP_KERNEL);
if (!timeout)
goto out;
-
+ INIT_LIST_HEAD(&timeout->seg_list);
timeout->length = IB_MGMT_MAD_HDR;
timeout->mad.hdr.id = packet->mad.hdr.id;
timeout->mad.hdr.status = ETIMEDOUT;
@@ -266,7 +267,7 @@ static void send_handler(struct ib_mad_a
sizeof (struct ib_mad_hdr));
if (queue_packet(file, agent, timeout))
- kfree(timeout);
+ free_packet(timeout);
}
out:
kfree(packet);
@@ -409,6 +410,8 @@ static ssize_t ib_umad_write(struct file
__be64 *tid;
int ret, length, hdr_len, copy_offset;
int rmpp_active, has_rmpp_header;
+ int s, seg_num;
+ struct ib_mad_multipacket_seg *seg;
if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
return -EINVAL;
@@ -485,6 +488,11 @@ static ssize_t ib_umad_write(struct file
goto err_ah;
}
+ if (!rmpp_active && length > sizeof(struct ib_mad)) {
+ ret = -EINVAL;
+ goto err_ah;
+ }
+
packet->msg = ib_create_send_mad(agent,
be32_to_cpu(packet->mad.hdr.qpn),
0, rmpp_active,
@@ -502,14 +510,32 @@ static ssize_t ib_umad_write(struct file
/* Copy MAD headers (RMPP header in place) */
memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
- /* Now, copy rest of message from user into send buffer */
+ /* complete copying first 256 bytes of message into send buffer */
if (copy_from_user(packet->msg->mad + copy_offset,
buf + sizeof (struct ib_user_mad) + copy_offset,
- length - copy_offset)) {
+ min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
ret = -EFAULT;
goto err_msg;
}
+ /* if RMPP, copy rest of send message from user to multipacket list */
+ length -= sizeof(struct ib_mad);
+ if (length > 0) {
+ buf += sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+ for (seg_num = 2; length > 0; ++seg_num, buf += s, length -= s) {
+ seg = ib_mad_get_multipacket_seg(packet->msg, seg_num);
+ BUG_ON(!seg);
+ s = min_t(int, length, seg->size);
+ if (copy_from_user(seg->data, buf, s)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ }
+ /* Pad last segment with zeroes. */
+ if (seg->size - s)
+ memset(seg->data + s, 0, seg->size - s);
+ }
+
/*
* If userspace is generating a request that will generate a
* response, we need to make sure the high-order part of the