[ofa-general] 4K MTU patch review
Eli Cohen
eli at mellanox.co.il
Wed Feb 6 09:45:40 PST 2008
Hi Shirley,
I have created the following patch based on your patch series such that
it touches less of the previous code and is thus smaller. Please review
it and see if you think we can make any use of it.
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h 2008-02-06 18:57:34.847708000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h 2008-02-06 19:22:02.372113000 +0200
@@ -56,11 +56,11 @@
/* constants */
enum {
- IPOIB_PACKET_SIZE = 2048,
- IPOIB_BUF_SIZE = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
IPOIB_ENCAP_LEN = 4,
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+ IPOIB_UD_RX_SG = 2, /* for 4K MTU */
+
IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
@@ -141,9 +141,9 @@ struct ipoib_mcast {
struct net_device *dev;
};
-struct ipoib_rx_buf {
+struct ipoib_sg_rx_buf {
struct sk_buff *skb;
- u64 mapping;
+ u64 mapping[IPOIB_UD_RX_SG];
};
struct ipoib_tx_buf {
@@ -337,7 +337,7 @@ struct ipoib_dev_priv {
struct net_device *dev;
struct ib_recv_wr rx_wr_draft[UD_POST_RCV_COUNT];
- struct ib_sge sglist_draft[UD_POST_RCV_COUNT];
+ struct ib_sge sglist_draft[UD_POST_RCV_COUNT][IPOIB_UD_RX_SG];
unsigned int rx_outst;
struct napi_struct napi;
@@ -378,7 +378,7 @@ struct ipoib_dev_priv {
unsigned int admin_mtu;
unsigned int mcast_mtu;
- struct ipoib_rx_buf *rx_ring;
+ struct ipoib_sg_rx_buf *rx_ring;
spinlock_t tx_lock;
struct ipoib_tx_buf *tx_ring;
@@ -412,6 +412,7 @@ struct ipoib_dev_priv {
struct ipoib_ethtool_st etool;
struct timer_list poll_timer;
struct ib_ah *own_ah;
+ int max_ib_mtu;
};
struct ipoib_ah {
@@ -452,6 +453,28 @@ struct ipoib_neigh {
struct list_head list;
};
+/*
+ * UD sizing helpers derived from an integer IB MTU.  The argument is
+ * parenthesized so that expression arguments (e.g. "a ? b : c") expand
+ * with the intended precedence.
+ */
+/* IB MTU minus the IPoIB encapsulation header length. */
+#define IPOIB_UD_MTU(ib_mtu) ((ib_mtu) - IPOIB_ENCAP_LEN)
+/* IB MTU plus IB_GRH_BYTES. */
+#define IPOIB_UD_BUF_SIZE(ib_mtu) ((ib_mtu) + IB_GRH_BYTES)
+
+/*
+ * Return 1 when the UD receive buffer for the given IB MTU is larger than
+ * PAGE_SIZE, 0 otherwise.
+ */
+static inline int ipoib_ud_need_sg(int ib_mtu)
+{
+	if (IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Undo the DMA mappings of one UD receive buffer.
+ *
+ * Without scatter/gather only mapping[0] is unmapped, with a length of
+ * IPOIB_UD_BUF_SIZE(priv->max_ib_mtu).  With scatter/gather, mapping[0] is
+ * unmapped with a length of IPOIB_UD_HEAD_SIZE and mapping[1] is unmapped
+ * as a PAGE_SIZE page.
+ */
+static inline void ipoib_sg_dma_unmap_rx(struct ipoib_dev_priv *priv,
+					 u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (!ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
+		return;
+	}
+
+	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+			    DMA_FROM_DEVICE);
+	ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, DMA_FROM_DEVICE);
+}
+
/*
* We stash a pointer to our private neighbour information after our
* hardware address in neigh->ha. The ALIGN() expression here makes
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-02-06 18:57:34.007682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-02-06 19:23:08.903503000 +0200
@@ -96,14 +96,35 @@ static void clean_pending_receives(struc
for (i = 0; i < priv->rx_outst; ++i) {
id = priv->rx_wr_draft[i].wr_id & ~IPOIB_OP_RECV;
- ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
- IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ /* i indexes rx_wr_draft; the ring slot being released is id */
+ ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
dev_kfree_skb_any(priv->rx_ring[id].skb);
priv->rx_ring[id].skb = NULL;
}
priv->rx_outst = 0;
}
+/*
+ * Account for "length" received bytes in skb.
+ *
+ * Without scatter/gather the whole length is added to the linear area with
+ * skb_put().  With scatter/gather, at most IPOIB_UD_HEAD_SIZE bytes are
+ * added to the linear area and the remainder is recorded as the size of
+ * frags[0] and added to data_len, len and truesize.
+ */
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb, unsigned int length)
+{
+	skb_frag_t *page_frag;
+	unsigned int head_len;
+	unsigned int frag_len;
+
+	if (!ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		skb_put(skb, length);
+		return;
+	}
+
+	page_frag = &skb_shinfo(skb)->frags[0];
+
+	/* put header into skb */
+	head_len = min(length, (unsigned)IPOIB_UD_HEAD_SIZE);
+	__skb_put(skb, head_len);
+
+	/* whatever is left of the packet is accounted to the page fragment */
+	frag_len = length - head_len;
+
+	page_frag->size = frag_len;
+	skb->data_len += frag_len;
+	skb->len += frag_len;
+	skb->truesize += frag_len;
+}
+
static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -111,7 +132,9 @@ static int ipoib_ib_post_receive(struct
int ret = 0;
int i = priv->rx_outst;
- priv->sglist_draft[i].addr = priv->rx_ring[id].mapping;
+ priv->sglist_draft[i][0].addr = priv->rx_ring[id].mapping[0];
+ priv->sglist_draft[i][1].addr = priv->rx_ring[id].mapping[1];
+
priv->rx_wr_draft[i].wr_id = id | IPOIB_OP_RECV;
if (++priv->rx_outst == UD_POST_RCV_COUNT) {
ret = ib_post_recv(priv->qp, priv->rx_wr_draft, &bad_wr);
@@ -120,8 +143,8 @@ static int ipoib_ib_post_receive(struct
ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
while (bad_wr) {
id = bad_wr->wr_id & ~IPOIB_OP_RECV;
- ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
- IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ /*
+ * Unmap the ring slot named by the failed WR (id), not the
+ * draft index i.
+ * NOTE(review): no "bad_wr = bad_wr->next" is visible in this
+ * hunk's context - confirm that the loop terminates.
+ */
+ ipoib_sg_dma_unmap_rx(priv,
+ priv->rx_ring[id].mapping);
dev_kfree_skb_any(priv->rx_ring[id].skb);
priv->rx_ring[id].skb = NULL;
}
@@ -136,11 +159,17 @@ static int ipoib_alloc_rx_skb(struct net
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
- u64 addr;
+ int buf_size;
+ u64 *mapping = priv->rx_ring[id].mapping;
+
+ if (ipoib_ud_need_sg(priv->max_ib_mtu))
+ buf_size = IPOIB_UD_HEAD_SIZE;
+ else
+ buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
- if (!skb)
- return -ENOMEM;
+ skb = dev_alloc_skb(buf_size + 4);
+ if (unlikely(!skb))
+ return -ENOMEM;
/*
* IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -149,17 +178,33 @@ static int ipoib_alloc_rx_skb(struct net
*/
skb_reserve(skb, 4);
- addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
- DMA_FROM_DEVICE);
- if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
- dev_kfree_skb_any(skb);
- return -EIO;
+ mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+ goto out_free;
+
+ if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+ struct page *page = alloc_page(GFP_ATOMIC);
+
+ if (!page)
+ goto partial_error;
+
+ skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+ mapping[1] = ib_dma_map_page(priv->ca, page, 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+ goto partial_error;
}
- priv->rx_ring[id].skb = skb;
- priv->rx_ring[id].mapping = addr;
+ priv->rx_ring[id].skb = skb;
+ return 0;
- return 0;
+partial_error:
+ ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+
+out_free:
+ dev_kfree_skb_any(skb);
+ return -ENOMEM;
}
static int ipoib_ib_post_receives(struct net_device *dev)
@@ -186,7 +231,6 @@ static void ipoib_ib_handle_rx_wc(struct
struct ipoib_dev_priv *priv = netdev_priv(dev);
unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
struct sk_buff *skb;
- u64 addr;
ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
wr_id, wc->status);
@@ -198,15 +242,13 @@ static void ipoib_ib_handle_rx_wc(struct
}
skb = priv->rx_ring[wr_id].skb;
- addr = priv->rx_ring[wr_id].mapping;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
ipoib_warn(priv, "failed recv event "
"(status=%d, wrid=%d vend_err %x)\n",
wc->status, wr_id, wc->vendor_err);
- ib_dma_unmap_single(priv->ca, addr,
- IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
dev_kfree_skb_any(skb);
priv->rx_ring[wr_id].skb = NULL;
return;
@@ -231,9 +273,9 @@ static void ipoib_ib_handle_rx_wc(struct
ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
wc->byte_len, wc->slid);
- ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
- skb_put(skb, wc->byte_len);
+ ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
skb_pull(skb, IB_GRH_BYTES);
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -828,15 +870,13 @@ int ipoib_ib_dev_stop(struct net_device
* all our pending work requests.
*/
for (i = 0; i < ipoib_recvq_size; ++i) {
- struct ipoib_rx_buf *rx_req;
+ struct ipoib_sg_rx_buf *rx_req;
rx_req = &priv->rx_ring[i];
if (rx_req->skb) {
- ib_dma_unmap_single(priv->ca,
- rx_req->mapping,
- IPOIB_BUF_SIZE,
- DMA_FROM_DEVICE);
+ ipoib_sg_dma_unmap_rx(priv,
+ priv->rx_ring[i].mapping);
dev_kfree_skb_any(rx_req->skb);
rx_req->skb = NULL;
}
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c 2008-02-06 18:57:29.657377000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c 2008-02-06 19:25:56.553407000 +0200
@@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
return 0;
}
- if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+ if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
return -EINVAL;
priv->admin_mtu = new_mtu;
@@ -981,10 +981,6 @@ static void ipoib_setup(struct net_devic
dev->tx_queue_len = ipoib_sendq_size * 2;
dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
- /* MTU will be reset when mcast join happens */
- dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
- priv->mcast_mtu = priv->admin_mtu = dev->mtu;
-
memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
netif_carrier_off(dev);
@@ -1130,6 +1126,7 @@ static struct net_device *ipoib_add_port
struct ib_device *hca, u8 port)
{
struct ipoib_dev_priv *priv;
+ struct ib_port_attr attr;
int result = -ENOMEM;
priv = ipoib_intf_alloc(format);
@@ -1140,6 +1137,18 @@ static struct net_device *ipoib_add_port
priv->dev->features |= NETIF_F_HIGHDMA;
+ if (!ib_query_port(hca, port, &attr))
+ priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+ else {
+ printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+ hca->name, port);
+ goto device_init_failed;
+ }
+
+ /* MTU will be reset when mcast join happens */
+ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+ priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+
result = ib_query_pkey(hca, port, 0, &priv->pkey);
if (result) {
printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2008-02-06 18:56:03.372135000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2008-02-06 19:22:02.386120000 +0200
@@ -567,8 +567,7 @@ void ipoib_mcast_join_task(struct work_s
return;
}
- priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
- IPOIB_ENCAP_LEN;
+ priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
if (!ipoib_cm_admin_enabled(dev))
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2008-02-06 18:57:34.010682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2008-02-06 19:22:02.373118000 +0200
@@ -151,7 +151,7 @@ int ipoib_transport_dev_init(struct net_
.max_send_wr = ipoib_sendq_size,
.max_recv_wr = ipoib_recvq_size,
.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
- .max_recv_sge = 1
+ .max_recv_sge = IPOIB_UD_RX_SG
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_UD,
@@ -227,16 +227,27 @@ int ipoib_transport_dev_init(struct net_
priv->tx_wr.send_flags = IB_SEND_SIGNALED;
for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
- priv->sglist_draft[i].length = IPOIB_BUF_SIZE;
- priv->sglist_draft[i].lkey = priv->mr->lkey;
-
- priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i];
- priv->rx_wr_draft[i].num_sge = 1;
+ priv->sglist_draft[i][0].lkey = priv->mr->lkey;
+ priv->sglist_draft[i][1].lkey = priv->mr->lkey;
+ priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i][0];
if (i < UD_POST_RCV_COUNT - 1)
priv->rx_wr_draft[i].next = &priv->rx_wr_draft[i + 1];
}
priv->rx_wr_draft[i].next = NULL;
+ if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+ for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+ priv->sglist_draft[i][0].length = IPOIB_UD_HEAD_SIZE;
+ priv->sglist_draft[i][1].length = PAGE_SIZE;
+ priv->rx_wr_draft[i].num_sge = IPOIB_UD_RX_SG;
+ }
+ } else {
+ for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+ priv->sglist_draft[i][0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+ priv->rx_wr_draft[i].num_sge = 1;
+ }
+ }
+
return 0;
out_free_scq:
More information about the general
mailing list