[openib-general] [resend][RFC][PATCH] adding call to madvise

Gleb Natapov glebn at voltaire.com
Sun May 14 06:42:40 PDT 2006


Hello Roland,

Here is the new version of the patch. It tries to address most of your comments.
I've looked at the possibility of using autoconf to detect the MADV_* defines,
but I haven't found AC_CHECK_DEFUN or anything similar that checks for
available defines. Besides, what should we do if the define is
available? Define HAVE_MADV_DOFORK and check that instead of MADV_DOFORK?
That seems redundant to me.

Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h	(revision 7141)
+++ libibverbs/include/infiniband/verbs.h	(working copy)
@@ -289,6 +289,8 @@ struct ibv_mr {
 	uint32_t		handle;
 	uint32_t		lkey;
 	uint32_t		rkey;
+	void                   *addr;
+	size_t                 length;
 };
 
 struct ibv_global_route {
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c	(revision 7141)
+++ libibverbs/src/verbs.c	(working copy)
@@ -154,10 +154,13 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 {
 	struct ibv_mr *mr;
 
+	ibv_dontfork_range(addr, length);
 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
 	if (mr) {
 		mr->context = pd->context;
 		mr->pd      = pd;
+	} else {
+		ibv_dofork_range(addr, length);
 	}
 
 	return mr;
@@ -165,7 +168,12 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 
 int ibv_dereg_mr(struct ibv_mr *mr)
 {
-	return mr->context->ops.dereg_mr(mr);
+	int rc = mr->context->ops.dereg_mr(mr);
+
+	if (!rc)
+		ibv_dofork_range(mr->addr, mr->length);
+	
+	return rc;
 }
 
 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h	(revision 7141)
+++ libibverbs/src/ibverbs.h	(working copy)
@@ -61,8 +61,8 @@ extern HIDDEN int abi_ver;
 extern HIDDEN int ibverbs_init(struct ibv_device ***list);
 
 extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+extern HIDDEN int ibv_dontfork_range(void *base, size_t size);
+extern HIDDEN int ibv_dofork_range(void *base, size_t size);
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\
 	do {								\
Index: libibverbs/src/cmd.c
===================================================================
--- libibverbs/src/cmd.c	(revision 7141)
+++ libibverbs/src/cmd.c	(working copy)
@@ -238,6 +238,8 @@ int ibv_cmd_reg_mr(struct ibv_pd *pd, vo
 	mr->handle  = resp.mr_handle;
 	mr->lkey    = resp.lkey;
 	mr->rkey    = resp.rkey;
+	mr->addr    = addr;
+	mr->length  = length;
 
 	return 0;
 }
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c	(revision 7141)
+++ libibverbs/src/memory.c	(working copy)
@@ -43,6 +43,13 @@
 
 #include "ibverbs.h"
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
+
 /*
  * We keep a linked list of page ranges that have been locked along with a
  * reference count to manage overlapping registrations, etc.
@@ -136,7 +143,7 @@ static void __mm_remove(struct ibv_mem_n
 		node->next->prev = node->prev;
 }
 
-int ibv_lock_range(void *base, size_t size)
+int ibv_dontfork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -187,8 +194,8 @@ int ibv_lock_range(void *base, size_t si
 
 
 		if (node->refcnt++ == 0) {
-			ret = mlock((void *) node->start,
-				    node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				    node->end - node->start + 1, MADV_DONTFORK);
 			if (ret)
 				goto out;
 		}
@@ -202,7 +209,7 @@ out:
 	return ret;
 }
 
-int ibv_unlock_range(void *base, size_t size)
+int ibv_dofork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -226,8 +233,8 @@ int ibv_unlock_range(void *base, size_t 
 
 	while (node && node->end <= end) {
 		if (--node->refcnt == 0) {
-			ret = munlock((void *) node->start,
-				      node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				      node->end - node->start + 1, MADV_DOFORK);
 		}
 
 		if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
Index: libmthca/src/qp.c
===================================================================
--- libmthca/src/qp.c	(revision 7141)
+++ libmthca/src/qp.c	(working copy)
@@ -819,8 +819,10 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd
 
 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-	if (posix_memalign(&qp->buf, to_mdev(pd->context->device)->page_size,
-			   align(qp->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_memalign_dontfork(&qp->buf,
+				to_mdev(pd->context->device)->page_size,
+			  	align(qp->buf_size,
+				to_mdev(pd->context->device)->page_size))) {
 		free(qp->wrid);
 		return -1;
 	}
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h	(revision 7141)
+++ libmthca/src/mthca.h	(working copy)
@@ -36,6 +36,8 @@
 #ifndef MTHCA_H
 #define MTHCA_H
 
+#include <sys/mman.h>
+
 #include <infiniband/driver.h>
 #include <infiniband/arch.h>
 
@@ -341,4 +343,32 @@ void mthca_free_av(struct mthca_ah *ah);
 int mthca_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int mthca_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
+
+static inline int mthca_memalign_dontfork(void **memptr, size_t alignment,
+					  size_t size)
+{
+	int ret;
+
+	ret = posix_memalign(memptr, alignment, size);
+
+	if (ret)
+		return ret;
+
+	madvise(*memptr, size, MADV_DONTFORK);
+
+	return 0;
+}
+
+static inline void mthca_free_dofork(void *ptr, size_t size)
+{
+	madvise(ptr, size, MADV_DOFORK);
+	free(ptr);
+}
+
 #endif /* MTHCA_H */
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c	(revision 7141)
+++ libmthca/src/verbs.c	(working copy)
@@ -247,7 +247,7 @@ err_unreg:
 	mthca_dereg_mr(cq->mr);
 
 err_buf:
-	free(cq->buf);
+	mthca_free_dofork(cq->buf, cqe * MTHCA_CQ_ENTRY_SIZE);
 
 err:
 	free(cq);
@@ -263,6 +263,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	void *buf;
 	int old_cqe;
 	int ret;
+	size_t length;
 
 	pthread_spin_lock(&cq->lock);
 
@@ -282,7 +283,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 			    cqe * MTHCA_CQ_ENTRY_SIZE,
 			    0, IBV_ACCESS_LOCAL_WRITE);
 	if (!mr) {
-		free(buf);
+		mthca_free_dofork(buf, cqe * MTHCA_CQ_ENTRY_SIZE);
 		ret = ENOMEM;
 		goto out;
 	}
@@ -295,14 +296,15 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
 	if (ret) {
 		mthca_dereg_mr(mr);
-		free(buf);
+		mthca_free_dofork(buf, cqe * MTHCA_CQ_ENTRY_SIZE);
 		goto out;
 	}
 
 	mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
 
+	length = cq->mr->length;
 	mthca_dereg_mr(cq->mr);
-	free(cq->buf);
+	mthca_free_dofork(cq->buf, length);
 
 	cq->buf = buf;
 	cq->mr  = mr;
@@ -315,6 +317,7 @@ out:
 int mthca_destroy_cq(struct ibv_cq *cq)
 {
 	int ret;
+	size_t length;
 
 	ret = ibv_cmd_destroy_cq(cq);
 	if (ret)
@@ -327,9 +330,10 @@ int mthca_destroy_cq(struct ibv_cq *cq)
 			      to_mcq(cq)->arm_db_index);
 	}
 
+	length = to_mcq(cq)->mr->length;
 	mthca_dereg_mr(to_mcq(cq)->mr);
 
-	free(to_mcq(cq)->buf);
+	mthca_free_dofork(to_mcq(cq)->buf, length);
 	free(to_mcq(cq));
 
 	return 0;
@@ -422,7 +426,7 @@ err_unreg:
 
 err_free:
 	free(srq->wrid);
-	free(srq->buf);
+	mthca_free_dofork(srq->buf, srq->buf_size);
 
 err:
 	free(srq);
@@ -461,7 +465,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
 
 	mthca_dereg_mr(to_msrq(srq)->mr);
 
-	free(to_msrq(srq)->buf);
+	mthca_free_dofork(to_msrq(srq)->buf, to_msrq(srq)->buf_size);
 	free(to_msrq(srq)->wrid);
 	free(to_msrq(srq));
 
@@ -566,7 +570,7 @@ err_unreg:
 
 err_free:
 	free(qp->wrid);
-	free(qp->buf);
+	mthca_free_dofork(qp->buf, qp->buf_size);
 
 err:
 	free(qp);
@@ -648,7 +652,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
 
 	mthca_dereg_mr(to_mqp(qp)->mr);
 
-	free(to_mqp(qp)->buf);
+	mthca_free_dofork(to_mqp(qp)->buf, to_mqp(qp)->buf_size);
 	free(to_mqp(qp)->wrid);
 	free(to_mqp(qp));
 
Index: libmthca/src/cq.c
===================================================================
--- libmthca/src/cq.c	(revision 7141)
+++ libmthca/src/cq.c	(working copy)
@@ -606,8 +606,9 @@ void *mthca_alloc_cq_buf(struct mthca_de
 	void *buf;
 	int i;
 
-	if (posix_memalign(&buf, dev->page_size,
-			   align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size)))
+	if (mthca_memalign_dontfork(&buf, dev->page_size,
+				    align(nent * MTHCA_CQ_ENTRY_SIZE,
+				    dev->page_size)))
 		return NULL;
 
 	for (i = 0; i < nent; ++i)
Index: libmthca/src/srq.c
===================================================================
--- libmthca/src/srq.c	(revision 7141)
+++ libmthca/src/srq.c	(working copy)
@@ -291,8 +291,10 @@ int mthca_alloc_srq_buf(struct ibv_pd *p
 
 	srq->buf_size = srq->max << srq->wqe_shift;
 
-	if (posix_memalign(&srq->buf, to_mdev(pd->context->device)->page_size,
-			   align(srq->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_memalign_dontfork(&srq->buf,
+				to_mdev(pd->context->device)->page_size,
+				align(srq->buf_size,
+				to_mdev(pd->context->device)->page_size))) {
 		free(srq->wrid);
 		return -1;
 	}
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c	(revision 7141)
+++ libmthca/src/ah.c	(working copy)
@@ -59,14 +59,14 @@ static struct mthca_ah_page *__add_page(
 	if (!page)
 		return NULL;
 
-	if (posix_memalign(&page->buf, page_size, page_size)) {
+	if (mthca_memalign_dontfork(&page->buf, page_size, page_size)) {
 		free(page);
 		return NULL;
 	}
 
 	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
 	if (!page->mr) {
-		free(page->buf);
+		mthca_free_dofork(page->buf, page_size);
 		free(page);
 		return NULL;
 	}
@@ -175,6 +175,8 @@ void mthca_free_av(struct mthca_ah *ah)
 		page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int)));
 
 		if (!--page->use_cnt) {
+			size_t length;
+
 			if (page->prev)
 				page->prev->next = page->next;
 			else
@@ -182,8 +184,9 @@ void mthca_free_av(struct mthca_ah *ah)
 			if (page->next)
 				page->next->prev = page->prev;
 
+			length = page->mr->length;
 			mthca_dereg_mr(page->mr);
-			free(page->buf);
+			mthca_free_dofork(page->buf, length);
 			free(page);
 		}
 
--
			Gleb.



More information about the general mailing list