[ofa-general] Re: [PATCH 1 of 2] IB/mlx4: For 64-bit systems, use large virtually contiguous queue buffers (vmap)

Roland Dreier rdreier at cisco.com
Wed Feb 6 21:18:36 PST 2008


OK, I applied this along with a couple of cleanup patches of my own.
I decided to use the vmap()ed buffer for CQ and SRQ access too, since I
think the CPU's MMU should be faster than walking our own page tables
in software.
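
Just to make that tradeoff concrete, here's a minimal stand-alone sketch
(not the driver code; struct demo_buf and the demo_entry_* helpers are
made-up names for illustration) of the two lookup schemes.  With a
vmap()ed, virtually contiguous buffer the lookup is plain pointer
arithmetic and the MMU resolves the page; the 2-layer scheme has to index
our own page table in software on every access:

#include <linux/mm.h>

struct demo_buf {
	void	 *direct;	/* vmap()ed, virtually contiguous */
	void	**page_list;	/* one kernel pointer per PAGE_SIZE chunk */
};

/* direct indexing: the CPU's MMU translates the address for us */
static inline void *demo_entry_vmap(struct demo_buf *buf, int offset)
{
	return buf->direct + offset;
}

/* 2-layer scheme: walk our own page table in software first */
static inline void *demo_entry_2level(struct demo_buf *buf, int offset)
{
	return buf->page_list[offset >> PAGE_SHIFT] +
		(offset & (PAGE_SIZE - 1));
}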

Here's what I applied (I still need to test tomorrow so there may be
silly bugs):

commit b57aacfa7a95328f469d0360e49289b023c47e9e
Author: Roland Dreier <rolandd at cisco.com>
Date:   Wed Feb 6 21:17:59 2008 -0800

    mlx4_core: Clean up struct mlx4_buf
    
    Now that struct mlx4_buf.u is a struct instead of a union because of
    the vmap() changes, there's no point in having the wrapper struct at
    all.  So move .direct and .page_list directly into struct mlx4_buf and
    get rid of a bunch of unnecessary ".u"s.
    
    Signed-off-by: Roland Dreier <rolandd at cisco.com>
---
 drivers/net/mlx4/alloc.c    |   40 ++++++++++++++++++++--------------------
 drivers/net/mlx4/mr.c       |    4 ++--
 include/linux/mlx4/device.h |   10 ++++------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index 2da2c2e..521dc03 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -116,40 +116,40 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		buf->nbufs        = 1;
 		buf->npages       = 1;
 		buf->page_shift   = get_order(size) + PAGE_SHIFT;
-		buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+		buf->direct.buf   = dma_alloc_coherent(&dev->pdev->dev,
 						       size, &t, GFP_KERNEL);
-		if (!buf->u.direct.buf)
+		if (!buf->direct.buf)
 			return -ENOMEM;
 
-		buf->u.direct.map = t;
+		buf->direct.map = t;
 
 		while (t & ((1 << buf->page_shift) - 1)) {
 			--buf->page_shift;
 			buf->npages *= 2;
 		}
 
-		memset(buf->u.direct.buf, 0, size);
+		memset(buf->direct.buf, 0, size);
 	} else {
 		int i;
 
 		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 		buf->npages      = buf->nbufs;
 		buf->page_shift  = PAGE_SHIFT;
-		buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list,
+		buf->page_list   = kzalloc(buf->nbufs * sizeof *buf->page_list,
 					   GFP_KERNEL);
-		if (!buf->u.page_list)
+		if (!buf->page_list)
 			return -ENOMEM;
 
 		for (i = 0; i < buf->nbufs; ++i) {
-			buf->u.page_list[i].buf =
+			buf->page_list[i].buf =
 				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
 						   &t, GFP_KERNEL);
-			if (!buf->u.page_list[i].buf)
+			if (!buf->page_list[i].buf)
 				goto err_free;
 
-			buf->u.page_list[i].map = t;
+			buf->page_list[i].map = t;
 
-			memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
+			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
 		}
 
 		if (BITS_PER_LONG == 64) {
@@ -158,10 +158,10 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 			if (!pages)
 				goto err_free;
 			for (i = 0; i < buf->nbufs; ++i)
-				pages[i] = virt_to_page(buf->u.page_list[i].buf);
-			buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+				pages[i] = virt_to_page(buf->page_list[i].buf);
+			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
 			kfree(pages);
-			if (!buf->u.direct.buf)
+			if (!buf->direct.buf)
 				goto err_free;
 		}
 	}
@@ -180,18 +180,18 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
 	int i;
 
 	if (buf->nbufs == 1)
-		dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
-				  buf->u.direct.map);
+		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+				  buf->direct.map);
 	else {
 		if (BITS_PER_LONG == 64)
-			vunmap(buf->u.direct.buf);
+			vunmap(buf->direct.buf);
 
 		for (i = 0; i < buf->nbufs; ++i)
-			if (buf->u.page_list[i].buf)
+			if (buf->page_list[i].buf)
 				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
-						  buf->u.page_list[i].buf,
-						  buf->u.page_list[i].map);
-		kfree(buf->u.page_list);
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
 	}
 }
 EXPORT_SYMBOL_GPL(mlx4_buf_free);
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index 9c9e308..679dfdb 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -419,9 +419,9 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 
 	for (i = 0; i < buf->npages; ++i)
 		if (buf->nbufs == 1)
-			page_list[i] = buf->u.direct.map + (i << buf->page_shift);
+			page_list[i] = buf->direct.map + (i << buf->page_shift);
 		else
-			page_list[i] = buf->u.page_list[i].map;
+			page_list[i] = buf->page_list[i].map;
 
 	err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
 
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 6316077..4210ac4 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -189,10 +189,8 @@ struct mlx4_buf_list {
 };
 
 struct mlx4_buf {
-	struct {
-		struct mlx4_buf_list	direct;
-		struct mlx4_buf_list   *page_list;
-	} u;
+	struct mlx4_buf_list	direct;
+	struct mlx4_buf_list   *page_list;
 	int			nbufs;
 	int			npages;
 	int			page_shift;
@@ -311,9 +309,9 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
 static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
 {
 	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
-		return buf->u.direct.buf + offset;
+		return buf->direct.buf + offset;
 	else
-		return buf->u.page_list[offset >> PAGE_SHIFT].buf +
+		return buf->page_list[offset >> PAGE_SHIFT].buf +
 			(offset & (PAGE_SIZE - 1));
 }
 

commit 313abe55a87bc10e55d00f337d609e17ad5f8c9a
Author: Jack Morgenstein <jackm at dev.mellanox.co.il>
Date:   Mon Jan 28 10:40:51 2008 +0200

    mlx4_core: For 64-bit systems, vmap() kernel queue buffers
    
    Since kernel virtual memory is not a problem on 64-bit systems, there
    is no reason to use our own 2-layer page mapping scheme for large
    kernel queue buffers on such systems.  Instead, map the page list to a
    single virtually contiguous buffer with vmap(), so that we can access
    buffer memory via direct indexing.
    
    Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
    Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>
    Signed-off-by: Roland Dreier <rolandd at cisco.com>
---
 drivers/net/mlx4/alloc.c    |   16 ++++++++++++++++
 include/linux/mlx4/device.h |    4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index b226e01..2da2c2e 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 
 			memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
 		}
+
+		if (BITS_PER_LONG == 64) {
+			struct page **pages;
+			pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; ++i)
+				pages[i] = virt_to_page(buf->u.page_list[i].buf);
+			buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->u.direct.buf)
+				goto err_free;
+		}
 	}
 
 	return 0;
@@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
 		dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
 				  buf->u.direct.map);
 	else {
+		if (BITS_PER_LONG == 64)
+			vunmap(buf->u.direct.buf);
+
 		for (i = 0; i < buf->nbufs; ++i)
 			if (buf->u.page_list[i].buf)
 				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index a0afa75..6316077 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -189,7 +189,7 @@ struct mlx4_buf_list {
 };
 
 struct mlx4_buf {
-	union {
+	struct {
 		struct mlx4_buf_list	direct;
 		struct mlx4_buf_list   *page_list;
 	} u;
@@ -310,7 +310,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
 static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
 {
-	if (buf->nbufs == 1)
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
 		return buf->u.direct.buf + offset;
 	else
 		return buf->u.page_list[offset >> PAGE_SHIFT].buf +

commit 1c69fc2a9012e160c8d459f63df74a6b01db8322
Author: Roland Dreier <rolandd at cisco.com>
Date:   Wed Feb 6 21:07:54 2008 -0800

    IB/mlx4: Consolidate code to get an entry from a struct mlx4_buf
    
    We use struct mlx4_buf for kernel QP, CQ and SRQ buffers, and the code
    to look up an entry is duplicated in get_cqe_from_buf() and the QP and
    SRQ versions of get_wqe().  Factor this out into mlx4_buf_offset().
    
    This will also make it easier to switch over to using vmap() for buffers.
    
    Signed-off-by: Roland Dreier <rolandd at cisco.com>
---
 drivers/infiniband/hw/mlx4/cq.c  |    8 +-------
 drivers/infiniband/hw/mlx4/qp.c  |    6 +-----
 drivers/infiniband/hw/mlx4/srq.c |    8 +-------
 include/linux/mlx4/device.h      |    8 ++++++++
 4 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 7950aa6..8ac7b97 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -64,13 +64,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
 
 static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
 {
-	int offset = n * sizeof (struct mlx4_cqe);
-
-	if (buf->buf.nbufs == 1)
-		return buf->buf.u.direct.buf + offset;
-	else
-		return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-			(offset & (PAGE_SIZE - 1));
+	return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
 }
 
 static void *get_cqe(struct mlx4_ib_cq *cq, int n)
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 8cba9c5..376db73 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -96,11 +96,7 @@ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 
 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
 {
-	if (qp->buf.nbufs == 1)
-		return qp->buf.u.direct.buf + offset;
-	else
-		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-			(offset & (PAGE_SIZE - 1));
+	return mlx4_buf_offset(&qp->buf, offset);
 }
 
 static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index e7e9a3d..beaa3b0 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -38,13 +38,7 @@
 
 static void *get_wqe(struct mlx4_ib_srq *srq, int n)
 {
-	int offset = n << srq->msrq.wqe_shift;
-
-	if (srq->buf.nbufs == 1)
-		return srq->buf.u.direct.buf + offset;
-	else
-		return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-			(offset & (PAGE_SIZE - 1));
+	return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
 }
 
 static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 222815d..a0afa75 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -308,6 +308,14 @@ struct mlx4_init_port_param {
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 		   struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
+{
+	if (buf->nbufs == 1)
+		return buf->u.direct.buf + offset;
+	else
+		return buf->u.page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
 
 int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
 void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);


