[ofa-general] [PATCH 2/3] libmlx4 - Optimize memory allocation of QP buffers with 64K pages

Mon May 18 00:55:16 PDT 2009

QP buffers are allocated with mlx4_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().

  However, this allocation is quite wasteful on architectures using 64K pages
(ia64 for example) because we then hit glibc's MMAP_THRESHOLD malloc
parameter and the chunks are allocated using mmap. Thus we end up allocating:

(requested size rounded to the page size) + (page size) + (malloc overhead)

rounded internally to the page size.

  So for example, if we request a buffer of page_size bytes, we end up
consuming 3 pages. In short, for each QP buffer we allocate, there is an
overhead of 2 pages. This is quite visible, especially on large clusters
where the number of QPs can reach several thousand.

  This patch creates a new function mlx4_alloc_page() for use by
mlx4_alloc_qp_buf() that does an mmap() instead of a posix_memalign() when
the page size is 64K or larger.

Signed-off-by: Sebastien Dugue <sebastien.dugue at bull.net>

---
 src/buf.c  |   40 ++++++++++++++++++++++++++++++++++++++--
 src/mlx4.h |    7 +++++++
 src/qp.c   |    5 +++--
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/buf.c b/src/buf.c
index 0e5f9b6..c8b6823 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
 #endif /* HAVE_CONFIG_H */
 
 #include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
 
 #include "mlx4.h"
 
@@ -69,14 +71,48 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
 	if (ret)
 		free(buf->buf);
 
-	if (!ret)
+	if (!ret) {
 		buf->length = size;
+		buf->type = MLX4_MALIGN;
+	}
 
 	return ret;
 }
 
+#define PAGE_64K	(1UL << 16)
+
+/* Page-aligned allocation that maps pages directly when they are >= 64K. */
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size)
+{
+	size_t map_len = align(size, page_size);
+	int ret;
+
+	/* Use the standard posix_memalign() call for pages < 64K */
+	if (page_size < PAGE_64K)
+		return mlx4_alloc_buf(buf, size, page_size);
+
+	/* Otherwise we can save a lot by using mmap directly */
+	buf->buf = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf->buf == MAP_FAILED)
+		return errno;
+
+	ret = ibv_dontfork_range(buf->buf, size);
+	if (ret) {
+		munmap(buf->buf, map_len);
+		return ret;
+	}
+	buf->length = size;
+	buf->type = MLX4_MMAP;
+	return 0;
+}
+
 void mlx4_free_buf(struct mlx4_buf *buf)
 {
 	ibv_dofork_range(buf->buf, buf->length);
-	free(buf->buf);
+
+	if (buf->type == MLX4_MMAP)
+		munmap(buf->buf, buf->length);	/* mapped by mlx4_alloc_page() */
+	else
+		free(buf->buf);			/* from mlx4_alloc_buf() */
 }
diff --git a/src/mlx4.h b/src/mlx4.h
index 827a201..83547f5 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -161,9 +161,15 @@ struct mlx4_context {
 	pthread_mutex_t			db_list_mutex;
 };
 
+enum mlx4_buf_type {
+	MLX4_MMAP,	/* buffer was mapped with mmap() by mlx4_alloc_page() */
+	MLX4_MALIGN	/* buffer came from mlx4_alloc_buf() */
+};
+
 struct mlx4_buf {
 	void			       *buf;
 	size_t				length;
+	enum mlx4_buf_type		type;	/* tells mlx4_free_buf() to munmap() or free() */
 };
 
 struct mlx4_pd {
@@ -288,6 +294,7 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
 }
 
 int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size);
 void mlx4_free_buf(struct mlx4_buf *buf);
 
 uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
diff --git a/src/qp.c b/src/qp.c
index d194ae3..557e255 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -604,8 +604,9 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 		qp->sq.offset = 0;
 	}
 
-	if (mlx4_alloc_buf(&qp->buf,
-			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+	if (mlx4_alloc_page(&qp->buf,
+			    align(qp->buf_size,
+				  to_mdev(pd->context->device)->page_size),
 			    to_mdev(pd->context->device)->page_size)) {
 		free(qp->sq.wrid);
 		free(qp->rq.wrid);
-- 
1.6.3.rc3.12.gb7937