[ofa-general] [PATCH 2/3] libmlx4 - Optimize memory allocation of QP buffers with 64K pages
Sebastien Dugue
sebastien.dugue at bull.net
Mon May 18 00:55:16 PDT 2009
QP buffers are allocated with mlx4_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().
However, this allocation is quite wasteful on architectures using 64K pages
(ia64 for example) because we then hit glibc's MMAP_THRESHOLD malloc
parameter and the chunk is allocated with mmap(). Thus we end up allocating:
  (requested size rounded to the page size) + (page size) + (malloc overhead)
rounded internally to the page size.
For example, if we request a buffer of page_size bytes, we end up consuming
3 pages. In short, for each QP buffer we allocate there is an overhead of
2 pages. This is quite visible on large clusters, where the number of QPs
can reach several thousand.
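For reference, the existing allocation path boils down to roughly the
following (a simplified sketch of mlx4_alloc_buf() in src/buf.c, not the
verbatim source; align() and ibv_dontfork_range() are the existing libmlx4
and libibverbs helpers):

/* Sketch of the current path: round the size up to the page size and
 * hand it to posix_memalign().  On a 64K-page machine the rounded
 * request trips glibc's MMAP_THRESHOLD, so glibc serves it with mmap()
 * and the alignment slack plus the malloc header each cost a further
 * page. */
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
{
	int ret;

	ret = posix_memalign(&buf->buf, page_size, align(size, page_size));
	if (ret)
		return ret;

	ret = ibv_dontfork_range(buf->buf, size);
	if (ret)
		free(buf->buf);

	if (!ret)
		buf->length = size;

	return ret;
}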
This patch creates a new function, mlx4_alloc_page(), for use by
mlx4_alloc_qp_buf(); it does an mmap() instead of a posix_memalign() when
the page size is 64K.
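The point is that an anonymous, private mmap() of an already page-aligned
length carries neither alignment slack nor allocator bookkeeping, so a
page_size request consumes exactly one page. A minimal standalone
illustration (not part of the patch; the addresses and page size printed
depend on the machine):

#define _GNU_SOURCE		/* MAP_ANONYMOUS */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;

	/* One page, page aligned, no malloc header, no alignment slack. */
	buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	printf("page size = %ld, mapping at %p (aligned: %s)\n",
	       page_size, buf,
	       ((uintptr_t) buf % page_size) ? "no" : "yes");

	munmap(buf, page_size);
	return 0;
}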
Signed-off-by: Sebastien Dugue <sebastien.dugue at bull.net>
---
src/buf.c | 40 ++++++++++++++++++++++++++++++++++++++--
src/mlx4.h | 7 +++++++
src/qp.c | 5 +++--
3 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/src/buf.c b/src/buf.c
index 0e5f9b6..c8b6823 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
#endif /* HAVE_CONFIG_H */
#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
#include "mlx4.h"
@@ -69,14 +71,48 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
if (ret)
free(buf->buf);
- if (!ret)
+ if (!ret) {
buf->length = size;
+ buf->type = MLX4_MALIGN;
+ }
return ret;
}
+#define PAGE_64K (1UL << 16)
+
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size)
+{
+ int ret;
+
+ /* Use the standard posix_memalign() call for pages < 64K */
+ if (page_size < PAGE_64K)
+ return mlx4_alloc_buf(buf, size, page_size);
+
+ /* Otherwise we can save a lot by using mmap directly */
+ buf->buf = mmap(0, align(size, page_size), PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (buf->buf == MAP_FAILED)
+ return errno;
+
+ ret = ibv_dontfork_range(buf->buf, size);
+ if (ret)
+ munmap(buf->buf, align(size, page_size));
+ else {
+ buf->length = size;
+ buf->type = MLX4_MMAP;
+ }
+
+ return ret;
+ }
+
void mlx4_free_buf(struct mlx4_buf *buf)
{
ibv_dofork_range(buf->buf, buf->length);
- free(buf->buf);
+
+ if (buf->type == MLX4_MMAP)
+ munmap(buf->buf, buf->length);
+ else
+ free(buf->buf);
}
diff --git a/src/mlx4.h b/src/mlx4.h
index 827a201..83547f5 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -161,9 +161,15 @@ struct mlx4_context {
pthread_mutex_t db_list_mutex;
};
+enum mlx4_buf_type {
+ MLX4_MMAP,
+ MLX4_MALIGN
+};
+
struct mlx4_buf {
void *buf;
size_t length;
+ enum mlx4_buf_type type;
};
struct mlx4_pd {
@@ -288,6 +294,7 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
}
int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size);
void mlx4_free_buf(struct mlx4_buf *buf);
uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
diff --git a/src/qp.c b/src/qp.c
index d194ae3..557e255 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -604,8 +604,9 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
qp->sq.offset = 0;
}
- if (mlx4_alloc_buf(&qp->buf,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+ if (mlx4_alloc_page(&qp->buf,
+ align(qp->buf_size,
+ to_mdev(pd->context->device)->page_size),
to_mdev(pd->context->device)->page_size)) {
free(qp->sq.wrid);
free(qp->rq.wrid);
--
1.6.3.rc3.12.gb7937