[ofa-general] [PATCH] ipoib large send offload

Eli Cohen eli at mellanox.co.il
Sun Aug 12 04:44:27 PDT 2007


Add LSO support to ipoib

Signed-off-by: Eli Cohen <eli at mellanox.co.il>

---

Index: linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-08-09 08:56:09.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-08-09 09:33:19.000000000 +0300
@@ -705,7 +705,13 @@ static int ipoib_start_xmit(struct sk_bu
 				goto out;
 			}
 
-			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			if (skb_is_gso(skb))
+				ipoib_send_gso(dev, skb, neigh->ah,
+					       IPOIB_QPN(skb->dst->neighbour->ha));
+			else
+                                ipoib_send(dev, skb, neigh->ah,
+					   IPOIB_QPN(skb->dst->neighbour->ha));
+
 			goto out;
 		}
 
@@ -1186,9 +1192,13 @@ static struct net_device *ipoib_add_port
 		goto event_failed;
 	}
 
-	if (!set_tx_csum(priv->dev))
+	if (!set_tx_csum(priv->dev)) {
 		priv->dev->features |= NETIF_F_SG;
 
+		if (priv->ca->flags & IB_DEVICE_TCP_GSO)
+			priv->dev->features |= NETIF_F_TSO;
+	}
+
 	set_rx_csum(priv->dev);
 
 	result = register_netdev(priv->dev);
Index: linux-2.6.23-rc1/drivers/net/mlx4/fw.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/net/mlx4/fw.c	2007-08-09 08:56:08.000000000 +0300
+++ linux-2.6.23-rc1/drivers/net/mlx4/fw.c	2007-08-09 08:56:11.000000000 +0300
@@ -133,6 +133,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *
 #define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
 #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET		0x2d
 #define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
 #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
@@ -215,6 +216,13 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *
 	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
 	dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+	field &= 0x1f;
+	if (!field)
+		dev_cap->max_gso_sz = 0;
+	else
+		dev_cap->max_gso_sz = 1 << field;
+
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
 	dev_cap->max_rdma_global = 1 << (field & 0x3f);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
@@ -377,6 +385,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *
 		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
 	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
 		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
 
 	dump_dev_cap_flags(dev, dev_cap->flags);
 
Index: linux-2.6.23-rc1/drivers/net/mlx4/fw.h
===================================================================
--- linux-2.6.23-rc1.orig/drivers/net/mlx4/fw.h	2007-08-09 08:41:54.000000000 +0300
+++ linux-2.6.23-rc1/drivers/net/mlx4/fw.h	2007-08-09 08:56:11.000000000 +0300
@@ -96,6 +96,7 @@ struct mlx4_dev_cap {
 	u8  bmme_flags;
 	u32 reserved_lkey;
 	u64 max_icm_sz;
+	int max_gso_sz;
 };
 
 struct mlx4_adapter {
Index: linux-2.6.23-rc1/drivers/net/mlx4/main.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/net/mlx4/main.c	2007-08-09 08:41:54.000000000 +0300
+++ linux-2.6.23-rc1/drivers/net/mlx4/main.c	2007-08-09 08:56:11.000000000 +0300
@@ -158,6 +158,7 @@ static int __devinit mlx4_dev_cap(struct
 	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
 	dev->caps.flags		     = dev_cap->flags;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
 	return 0;
 }
Index: linux-2.6.23-rc1/drivers/infiniband/hw/mlx4/main.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/hw/mlx4/main.c	2007-08-09 08:56:08.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/hw/mlx4/main.c	2007-08-09 08:56:11.000000000 +0300
@@ -101,6 +101,8 @@ static int mlx4_ib_query_device(struct i
 		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
 	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
 		props->device_cap_flags |= IB_DEVICE_IP_CSUM;
+	if (dev->dev->caps.max_gso_sz)
+		props->device_cap_flags |= IB_DEVICE_TCP_GSO;
 
 	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
 		0xffffff;
@@ -572,6 +574,8 @@ static void *mlx4_ib_add(struct mlx4_dev
 
 	if (ibdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
 		ibdev->ib_dev.flags |= IB_DEVICE_IP_CSUM;
+        if (ibdev->dev->caps.max_gso_sz)
+		ibdev->ib_dev.flags |= IB_DEVICE_TCP_GSO;
 
 	if (init_node_data(ibdev))
 		goto err_map;
Index: linux-2.6.23-rc1/drivers/infiniband/hw/mlx4/qp.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/hw/mlx4/qp.c	2007-08-09 08:56:10.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/hw/mlx4/qp.c	2007-08-09 09:03:35.000000000 +0300
@@ -65,6 +65,7 @@ struct mlx4_ib_sqp {
 
 static const __be32 mlx4_ib_opcode[] = {
 	[IB_WR_SEND]			= __constant_cpu_to_be32(MLX4_OPCODE_SEND),
+	[IB_WR_LSO]			= __constant_cpu_to_be32(MLX4_OPCODE_LSO),
 	[IB_WR_SEND_WITH_IMM]		= __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
 	[IB_WR_RDMA_WRITE]		= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
 	[IB_WR_RDMA_WRITE_WITH_IMM]	= __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
@@ -740,7 +741,8 @@ static int __mlx4_ib_modify_qp(struct ib
 	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
 	else if (ibqp->qp_type == IB_QPT_UD)
-		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+		context->mtu_msgmax = (IB_MTU_4096 << 5) |
+		ilog2(dev->dev->caps.max_gso_sz);
 	else if (attr_mask & IB_QP_PATH_MTU) {
 		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
 			printk(KERN_ERR "path MTU (%u) is invalid\n",
@@ -1312,6 +1314,28 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
 			set_datagram_seg(wqe, wr);
 			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
 			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+			if (wr->opcode == IB_WR_LSO) {
+				int halign;
+
+				memcpy(((struct mlx4_lso_seg *)wqe)->header,
+				       wr->wr.ud.header, wr->wr.ud.hlen);
+				wmb();
+				((struct mlx4_lso_seg *)wqe)->mss_hdr_size =
+					cpu_to_be32(((wr->wr.ud.mss - wr->wr.ud.hlen) << 16) |
+						    wr->wr.ud.hlen);
+
+				halign = ALIGN(wr->wr.ud.hlen, 16);
+				wqe += halign;
+				size += halign >> 4;
+
+				if (unlikely(wr->num_sge > qp->sq.max_gs - (halign >> 4))) {
+					err = -EINVAL;
+					*bad_wr = wr;
+					goto out;
+				}
+			}
+
 			break;
 
 		case IB_QPT_SMI:
@@ -1365,6 +1389,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
 			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
 
+
 		/*
 		 * We can improve latency by not stamping the last
 		 * send queue WQE until after ringing the doorbell, so
Index: linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-08-09 08:56:09.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib.h	2007-08-09 08:56:11.000000000 +0300
@@ -375,6 +375,10 @@ int ipoib_add_pkey_attr(struct net_devic
 
 void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn);
+
+void ipoib_send_gso(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn);
+
 void ipoib_reap_ah(struct work_struct *work);
 
 void ipoib_flush_paths(struct net_device *dev);
Index: linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-08-09 08:56:09.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-08-09 09:33:01.000000000 +0300
@@ -38,6 +38,7 @@
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/ip.h>
+#include <linux/tcp.h>
 
 #include <rdma/ib_cache.h>
 
@@ -255,15 +256,22 @@ repost:
 }
 
 static int dma_unmap_list(struct ib_device *ca, struct ipoib_mapping_st *map,
-			   u16 n)
+			   u16 n, int gso)
 {
-	int i, len;
+	int i, len, first;
 
 	BUG_ON(!n);
-	ib_dma_unmap_single(ca, map[0].addr, map[0].size, DMA_TO_DEVICE);
-	len = map[0].size;
+	if (!gso) {
+		ib_dma_unmap_single(ca, map[0].addr, map[0].size, DMA_TO_DEVICE);
+		len = map[0].size;
+		first = 1;
+	}
+	else {
+		len = 0;
+		first = 0;
+	}
 
-	for (i = 1; i < n; ++i) {
+	for (i = first; i < n; ++i) {
 		ib_dma_unmap_page(ca, map[i].addr, map[i].size,
 				  DMA_TO_DEVICE);
 		len += map[i].size;
@@ -282,6 +290,7 @@ static void ipoib_ib_handle_tx_wc(struct
 	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
 		       wr_id, wc->status);
 
+
 	if (unlikely(wr_id >= ipoib_sendq_size)) {
 		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
 			   wr_id, ipoib_sendq_size);
@@ -289,8 +298,14 @@ static void ipoib_ib_handle_tx_wc(struct
 	}
 
 	tx_req = &priv->tx_ring[wr_id];
-	priv->stats.tx_bytes += dma_unmap_list(priv->ca, tx_req->mapping,
-					       skb_shinfo(tx_req->skb)->nr_frags + 1);
+
+	if (skb_is_gso(tx_req->skb))
+		priv->stats.tx_bytes += dma_unmap_list(priv->ca, tx_req->mapping,
+						       skb_shinfo(tx_req->skb)->nr_frags, 1);
+	else
+		priv->stats.tx_bytes += dma_unmap_list(priv->ca, tx_req->mapping,
+						       skb_shinfo(tx_req->skb)->nr_frags + 1, 0);
+
 	++priv->stats.tx_packets;
 
 	dev_kfree_skb_any(tx_req->skb);
@@ -373,7 +388,8 @@ void ipoib_ib_completion(struct ib_cq *c
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    unsigned int wr_id,
 			    struct ib_ah *address, u32 qpn,
-			    struct ipoib_mapping_st *mapping, int ngather)
+			    struct ipoib_mapping_st *mapping, int ngather,
+			    void *lso_header, int h_len)
 {
 	struct ib_send_wr *bad_wr;
 	int i;
@@ -388,9 +404,89 @@ static inline int post_send(struct ipoib
 	priv->tx_wr.wr.ud.remote_qpn  = qpn;
 	priv->tx_wr.wr.ud.ah 	      = address;
 
+	if (lso_header) {
+		priv->tx_wr.wr.ud.mss = priv->dev->mtu;
+		priv->tx_wr.wr.ud.header = lso_header;
+		priv->tx_wr.wr.ud.hlen = h_len;
+		priv->tx_wr.opcode 	= IB_WR_LSO;
+	}
+	else
+		priv->tx_wr.opcode 	= IB_WR_SEND;
+
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
 
+
+void ipoib_send_gso(struct net_device *dev, struct sk_buff *skb,
+		    struct ipoib_ah *address, u32 qpn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_tx_buf *tx_req;
+	struct skb_frag_struct *frag;
+	u64 addr;
+	unsigned short i;
+
+	ipoib_dbg_data(priv, "sending gso packet, length=%d address=%p qpn=0x%06x\n",
+		       skb->len, address, qpn);
+
+	if (unlikely((skb_headlen(skb) - IPOIB_ENCAP_LEN) !=
+	    ((ip_hdr(skb)->ihl + tcp_hdr(skb)->doff) << 2))) {
+		ipoib_warn(priv, "headlen (%d) does not match ip (%d)and "
+			   "tcp headers(%d), dropping skb\n",
+			   skb_headlen(skb) - IPOIB_ENCAP_LEN, ip_hdr(skb)->ihl << 2,
+			   tcp_hdr(skb)->doff << 2);
+		++priv->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	/*
+	 * We put the skb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
+	tx_req->skb = skb;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		frag = &skb_shinfo(skb)->frags[i];
+		addr = ib_dma_map_page(priv->ca, frag->page, frag->page_offset,
+				       frag->size, DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, addr)))
+			goto map_err;
+
+		tx_req->mapping[i].addr = addr;
+		tx_req->mapping[i].size = frag->size;
+//		printk("%s: [%d] addr = 0x%llx, size = %d\n", __func__, i, addr, frag->size);
+	}
+
+	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+			       address->ah, qpn, tx_req->mapping,
+			       skb_shinfo(skb)->nr_frags,
+			       skb->data, skb_headlen(skb)))) {
+		ipoib_warn(priv, "post_send failed\n");
+		goto map_err;
+	} else {
+		dev->trans_start = jiffies;
+
+		address->last_send = priv->tx_head;
+		++priv->tx_head;
+
+		if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
+			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+			netif_stop_queue(dev);
+			set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+		}
+	}
+	return;
+
+map_err:
+	dma_unmap_list(priv->ca, tx_req->mapping, i, 1);
+	dev_kfree_skb_any(skb);
+}
+
 void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		struct ipoib_ah *address, u32 qpn)
 {
@@ -444,7 +540,7 @@ void ipoib_send(struct net_device *dev, 
 	}
 
 	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn, tx_req->mapping, skb_shinfo(skb)->nr_frags + 1))) {
+			       address->ah, qpn, tx_req->mapping, skb_shinfo(skb)->nr_frags + 1, NULL, 0))) {
 		ipoib_warn(priv, "post_send failed\n");
 		goto map_err;
 	} else {
@@ -462,7 +558,7 @@ void ipoib_send(struct net_device *dev, 
 	return;
 
 map_err:
-	dma_unmap_list(priv->ca, tx_req->mapping, i + 1);
+	dma_unmap_list(priv->ca, tx_req->mapping, i + 1, 0);
 	dev_kfree_skb_any(skb);
 }
 
@@ -657,7 +753,7 @@ int ipoib_ib_dev_stop(struct net_device 
 				tx_req = &priv->tx_ring[priv->tx_tail &
 							(ipoib_sendq_size - 1)];
 				dma_unmap_list(priv->ca, tx_req->mapping,
-					       skb_shinfo(tx_req->skb)->nr_frags + 1);
+					       skb_shinfo(tx_req->skb)->nr_frags + 1, skb_is_gso(tx_req->skb));
 				dev_kfree_skb_any(tx_req->skb);
 				++priv->tx_tail;
 			}
Index: linux-2.6.23-rc1/include/linux/mlx4/device.h
===================================================================
--- linux-2.6.23-rc1.orig/include/linux/mlx4/device.h	2007-08-09 08:41:54.000000000 +0300
+++ linux-2.6.23-rc1/include/linux/mlx4/device.h	2007-08-09 08:56:11.000000000 +0300
@@ -177,6 +177,7 @@ struct mlx4_caps {
 	u32			flags;
 	u16			stat_rate_support;
 	u8			port_width_cap[MLX4_MAX_PORTS + 1];
+	int			max_gso_sz;
 };
 
 struct mlx4_buf_list {
Index: linux-2.6.23-rc1/include/linux/mlx4/qp.h
===================================================================
--- linux-2.6.23-rc1.orig/include/linux/mlx4/qp.h	2007-08-09 08:56:08.000000000 +0300
+++ linux-2.6.23-rc1/include/linux/mlx4/qp.h	2007-08-09 08:56:11.000000000 +0300
@@ -215,6 +215,11 @@ struct mlx4_wqe_datagram_seg {
 	__be32			reservd[2];
 };
 
+struct mlx4_lso_seg {
+	__be32                  mss_hdr_size;
+	__be32                  header[0];
+};
+
 struct mlx4_wqe_bind_seg {
 	__be32			flags1;
 	__be32			flags2;
Index: linux-2.6.23-rc1/include/rdma/ib_verbs.h
===================================================================
--- linux-2.6.23-rc1.orig/include/rdma/ib_verbs.h	2007-08-09 08:56:08.000000000 +0300
+++ linux-2.6.23-rc1/include/rdma/ib_verbs.h	2007-08-09 08:56:11.000000000 +0300
@@ -94,7 +94,8 @@ enum ib_device_cap_flags {
 	IB_DEVICE_ZERO_STAG		= (1<<15),
 	IB_DEVICE_SEND_W_INV		= (1<<16),
 	IB_DEVICE_MEM_WINDOW		= (1<<17),
-	IB_DEVICE_IP_CSUM               = (1<<18)
+	IB_DEVICE_IP_CSUM               = (1<<18),
+	IB_DEVICE_TCP_GSO               = (1<<19)
 };
 
 enum ib_atomic_cap {
@@ -606,6 +607,7 @@ enum ib_wr_opcode {
 	IB_WR_RDMA_WRITE,
 	IB_WR_RDMA_WRITE_WITH_IMM,
 	IB_WR_SEND,
+	IB_WR_LSO,
 	IB_WR_SEND_WITH_IMM,
 	IB_WR_RDMA_READ,
 	IB_WR_ATOMIC_CMP_AND_SWP,
@@ -648,6 +650,9 @@ struct ib_send_wr {
 		} atomic;
 		struct {
 			struct ib_ah *ah;
+			void   *header;
+			int     hlen;
+			int     mss;
 			u32	remote_qpn;
 			u32	remote_qkey;
 			u16	pkey_index; /* valid for GSI only */
Index: linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- linux-2.6.23-rc1.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-08-09 08:56:07.000000000 +0300
+++ linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-08-09 08:56:11.000000000 +0300
@@ -200,7 +200,6 @@ int ipoib_transport_dev_init(struct net_
 	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
 		priv->tx_sge[i].lkey 	= priv->mr->lkey;
 
-	priv->tx_wr.opcode 	= IB_WR_SEND;
 	priv->tx_wr.sg_list 	= priv->tx_sge;
 	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
 




More information about the general mailing list