[ofa-general] [PATCH 1/3] libmthca - Optimize memory allocation of QP buffers with 64K pages

sebastien dugue sebastien.dugue at bull.net
Mon May 18 00:55:25 PDT 2009


  QP buffers are allocated with mthca_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().
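
  For reference, the pre-patch allocation path in src/buf.c looks roughly
like this (a sketch of the existing code, not a verbatim copy):

	int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
	{
		int ret;

		/* Round the request up to whole pages and get page-aligned
		 * memory from glibc. */
		ret = posix_memalign(&buf->buf, page_size,
				     align(size, page_size));
		if (ret)
			return ret;

		/* Exclude the buffer from copy-on-fork. */
		ret = ibv_dontfork_range(buf->buf, size);
		if (ret)
			free(buf->buf);

		if (!ret)
			buf->length = size;

		return ret;
	}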

  However, this allocation is quite wasteful on architectures using 64K
pages (ia64, for example), because we then hit glibc's M_MMAP_THRESHOLD
malloc parameter and the chunk is allocated with mmap(). Thus we end up
allocating:

(requested size rounded to the page size) + (page size) + (malloc overhead)

rounded internally to the page size.

  So, for example, if we request a buffer of page_size bytes, we end up
consuming 3 pages. In short, for each QP buffer we allocate, there is an
overhead of 2 pages. This is especially visible on large clusters, where the
number of QPs can reach several thousand.
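
  To make the 3-page figure concrete, the mapping for a one-page (64K)
request breaks down roughly as follows (illustrative; the exact malloc
bookkeeping overhead varies):

	/*
	 * requested size rounded to the page size:  64K  (1 page)
	 * alignment slack added for posix_memalign: 64K  (1 page)
	 * malloc bookkeeping overhead:              a few dozen bytes
	 *
	 * glibc satisfies the request with mmap() and rounds the mapping
	 * up to the page size, so 64K + 64K + overhead maps 192K,
	 * i.e. 3 pages for 1 page of useful data.
	 */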

  This patch creates a new function, mthca_alloc_page(), used by
mthca_alloc_qp_buf(), that allocates with mmap() instead of posix_memalign()
when the page size is 64K or larger.
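
  A caller uses the new helper exactly like the old one, and mthca_free_buf()
now looks at the buffer's type to pick the matching release call. A minimal
(hypothetical) caller would look like this:

	struct mthca_buf buf;

	/* Falls back to posix_memalign() internally when page_size < 64K. */
	if (mthca_alloc_page(&buf, align(size, page_size), page_size))
		return -1;

	/* ... use buf.buf / buf.length ... */

	mthca_free_buf(&buf);	/* munmap() or free(), depending on buf.type */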

Signed-off-by: Sebastien Dugue <sebastien.dugue at bull.net>

---
 src/buf.c   |   40 ++++++++++++++++++++++++++++++++++++++--
 src/mthca.h |    7 +++++++
 src/qp.c    |    7 ++++---
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/src/buf.c b/src/buf.c
index 6c1be4f..ae37e9c 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
 #endif /* HAVE_CONFIG_H */
 
 #include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
 
 #include "mthca.h"
 
@@ -69,8 +71,38 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
 	if (ret)
 		free(buf->buf);
 
-	if (!ret)
+	if (!ret) {
 		buf->length = size;
+		buf->type = MTHCA_MALIGN;
+	}
+
+	return ret;
+}
+
+#define PAGE_64K	(1UL << 16)
+
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size)
+{
+	int ret;
+
+	/* Use the standard posix_memalign() call for pages < 64K */
+	if (page_size < PAGE_64K)
+		return mthca_alloc_buf(buf, size, page_size);
+
+	/* Otherwise we can save a lot by using mmap directly */
+	buf->buf = mmap(0, align(size, page_size), PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (buf->buf == MAP_FAILED)
+		return errno;
+
+	ret = ibv_dontfork_range(buf->buf, size);
+	if (ret) {
+		munmap(buf->buf, align(size, page_size));
+	} else {
+		buf->length = size;
+		buf->type = MTHCA_MMAP;
+	}
 
 	return ret;
 }
@@ -78,5 +110,9 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
 void mthca_free_buf(struct mthca_buf *buf)
 {
 	ibv_dofork_range(buf->buf, buf->length);
-	free(buf->buf);
+
+	if (buf->type == MTHCA_MMAP)
+		munmap(buf->buf, buf->length);
+	else
+		free(buf->buf);
 }
diff --git a/src/mthca.h b/src/mthca.h
index 66751f3..7db15a7 100644
--- a/src/mthca.h
+++ b/src/mthca.h
@@ -138,9 +138,15 @@ struct mthca_context {
 	int		       qp_table_mask;
 };
 
+enum mthca_buf_type {
+	MTHCA_MMAP,
+	MTHCA_MALIGN
+};
+
 struct mthca_buf {
 	void		       *buf;
 	size_t			length;
+	enum mthca_buf_type	type;
 };
 
 struct mthca_pd {
@@ -291,6 +297,7 @@ static inline int mthca_is_memfree(struct ibv_context *ibctx)
 }
 
 int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size);
 void mthca_free_buf(struct mthca_buf *buf);
 
 int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
diff --git a/src/qp.c b/src/qp.c
index 84dd206..15f4805 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -848,9 +848,10 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 
 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-	if (mthca_alloc_buf(&qp->buf,
-			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
-			    to_mdev(pd->context->device)->page_size)) {
+	if (mthca_alloc_page(&qp->buf,
+			     align(qp->buf_size,
+				   to_mdev(pd->context->device)->page_size),
+			     to_mdev(pd->context->device)->page_size)) {
 		free(qp->wrid);
 		return -1;
 	}
-- 
1.6.3.rc3.12.gb7937



