[ofa-general] [PATCH 1/3] libmthca - Optimize memory allocation of QP buffers with 64K pages
sebastien dugue
sebastien.dugue at bull.net
Mon May 18 00:55:25 PDT 2009
QP buffers are allocated with mthca_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().
However, this allocation is quite wasteful on architectures using 64K pages
(ia64 for example) because we then hit glibc's M_MMAP_THRESHOLD malloc
parameter and chunks are allocated using mmap(). Thus we end up allocating:
(requested size rounded to the page size) + (page size) + (malloc overhead)
rounded internally to the page size.
So for example, if we request a buffer of page_size bytes, we end up
consuming 3 pages. In short, for each QP buffer we allocate, there is an
overhead of 2 pages. This is especially visible on large clusters, where
the number of QPs can reach several thousand.
This patch creates a new function mthca_alloc_page() for use by
mthca_alloc_qp_buf() that does an mmap() instead of a posix_memalign() when
the page size is 64K or larger.
Signed-off-by: Sebastien Dugue <sebastien.dugue at bull.net>
---
src/buf.c | 40 ++++++++++++++++++++++++++++++++++++++--
src/mthca.h | 7 +++++++
src/qp.c | 7 ++++---
3 files changed, 49 insertions(+), 5 deletions(-)
diff --git a/src/buf.c b/src/buf.c
index 6c1be4f..ae37e9c 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
#endif /* HAVE_CONFIG_H */
#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
#include "mthca.h"
@@ -69,8 +71,38 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
if (ret)
free(buf->buf);
- if (!ret)
+ if (!ret) {
buf->length = size;
+ buf->type = MTHCA_MALIGN;
+ }
+
+ return ret;
+}
+
+#define PAGE_64K (1UL << 16)
+
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size)
+{
+ int ret;
+
+ /* Use the standard posix_memalign() call for pages < 64K */
+ if (page_size < PAGE_64K)
+ return mthca_alloc_buf(buf, size, page_size);
+
+ /* Otherwise we can save a lot by using mmap directly */
+ buf->buf = mmap(0 ,align(size, page_size) , PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (buf->buf == MAP_FAILED)
+ return errno;
+
+ ret = ibv_dontfork_range(buf->buf, size);
+ if (ret)
+ munmap(buf->buf, align(size, page_size));
+ else {
+ buf->length = size;
+ buf->type = MTHCA_MMAP;
+ }
return ret;
}
@@ -78,5 +110,9 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
void mthca_free_buf(struct mthca_buf *buf)
{
ibv_dofork_range(buf->buf, buf->length);
- free(buf->buf);
+
+ if ( buf->type == MTHCA_MMAP )
+ munmap(buf->buf, buf->length);
+ else
+ free(buf->buf);
}
diff --git a/src/mthca.h b/src/mthca.h
index 66751f3..7db15a7 100644
--- a/src/mthca.h
+++ b/src/mthca.h
@@ -138,9 +138,15 @@ struct mthca_context {
int qp_table_mask;
};
+enum mthca_buf_type {
+ MTHCA_MMAP,
+ MTHCA_MALIGN
+};
+
struct mthca_buf {
void *buf;
size_t length;
+ enum mthca_buf_type type;
};
struct mthca_pd {
@@ -291,6 +297,7 @@ static inline int mthca_is_memfree(struct ibv_context *ibctx)
}
int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size);
void mthca_free_buf(struct mthca_buf *buf);
int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
diff --git a/src/qp.c b/src/qp.c
index 84dd206..15f4805 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -848,9 +848,10 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
- if (mthca_alloc_buf(&qp->buf,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size),
- to_mdev(pd->context->device)->page_size)) {
+ if (mthca_alloc_page(&qp->buf,
+ align(qp->buf_size,
+ to_mdev(pd->context->device)->page_size),
+ to_mdev(pd->context->device)->page_size)) {
free(qp->wrid);
return -1;
}
--
1.6.3.rc3.12.gb7937
More information about the general
mailing list