[openib-general] [RFC][PATCH] adding call to madvise

Gleb Natapov glebn at voltaire.com
Thu May 4 04:47:15 PDT 2006


Hello Roland,

The included patch adds calls to madvise(MADV_DO[NT]FORK) to libibverbs and
libmthca. In libibverbs it uses memory.c to do reference counting on
overlapping user registrations, and in libmthca it marks all internal qp/cq
memory.

The MADV_DOFORK/MADV_DONTFORK defines have not yet propagated to libc, so I
added them to local header files just to be able to compile. I think the
proper way to handle this is in configure. Suggestions are welcome.

Note that this patch also changes the ABI, since struct ibv_mr is bigger now.

Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h	(revision 6750)
+++ libibverbs/include/infiniband/verbs.h	(working copy)
@@ -289,6 +289,8 @@ struct ibv_mr {
 	uint32_t		handle;
 	uint32_t		lkey;
 	uint32_t		rkey;
+	void                   *addr;
+	size_t                 length;
 };
 
 struct ibv_global_route {
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c	(revision 6750)
+++ libibverbs/src/verbs.c	(working copy)
@@ -154,10 +154,15 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 {
 	struct ibv_mr *mr;
 
+	ibv_dontfork_range(addr, length);
 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
 	if (mr) {
 		mr->context = pd->context;
 		mr->pd      = pd;
+		mr->addr    = addr;
+		mr->length  = length;
+	} else {
+		ibv_dofork_range(addr, length);
 	}
 
 	return mr;
@@ -165,7 +170,12 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 
 int ibv_dereg_mr(struct ibv_mr *mr)
 {
-	return mr->context->ops.dereg_mr(mr);
+	int rc = mr->context->ops.dereg_mr(mr);
+
+	if (!rc)
+		ibv_dofork_range(mr->addr, mr->length);
+	
+	return rc;
 }
 
 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h	(revision 6750)
+++ libibverbs/src/ibverbs.h	(working copy)
@@ -61,8 +61,8 @@ extern HIDDEN int abi_ver;
 extern HIDDEN int ibverbs_init(struct ibv_device ***list);
 
 extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+extern HIDDEN int ibv_dontfork_range(void *base, size_t size);
+extern HIDDEN int ibv_dofork_range(void *base, size_t size);
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\
 	do {								\
@@ -85,4 +85,11 @@ extern HIDDEN int ibv_unlock_range(void 
 		(cmd)->response  = (uintptr_t) (out);			\
 	} while (0)
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
+
 #endif /* IB_VERBS_H */
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c	(revision 6750)
+++ libibverbs/src/memory.c	(working copy)
@@ -136,7 +136,7 @@ static void __mm_remove(struct ibv_mem_n
 		node->next->prev = node->prev;
 }
 
-int ibv_lock_range(void *base, size_t size)
+int ibv_dontfork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -187,8 +187,8 @@ int ibv_lock_range(void *base, size_t si
 
 
 		if (node->refcnt++ == 0) {
-			ret = mlock((void *) node->start,
-				    node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				    node->end - node->start + 1, MADV_DONTFORK);
 			if (ret)
 				goto out;
 		}
@@ -202,7 +202,7 @@ out:
 	return ret;
 }
 
-int ibv_unlock_range(void *base, size_t size)
+int ibv_dofork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -226,8 +226,8 @@ int ibv_unlock_range(void *base, size_t 
 
 	while (node && node->end <= end) {
 		if (--node->refcnt == 0) {
-			ret = munlock((void *) node->start,
-				      node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				      node->end - node->start + 1, MADV_DOFORK);
 		}
 
 		if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h	(revision 6750)
+++ libmthca/src/mthca.h	(working copy)
@@ -341,4 +341,10 @@ void mthca_free_av(struct mthca_ah *ah);
 int mthca_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int mthca_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
 #endif /* MTHCA_H */
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c	(revision 6750)
+++ libmthca/src/verbs.c	(working copy)
@@ -134,6 +134,9 @@ static struct ibv_mr *__mthca_reg_mr(str
 		return NULL;
 	}
 
+	mr->addr = addr;
+	mr->length = length;
+
 	return mr;
 }
 
@@ -188,6 +191,7 @@ struct ibv_cq *mthca_create_cq(struct ib
 	if (!cq->buf)
 		goto err;
 
+	madvise(cq->buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DONTFORK);
 	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
 				cqe * MTHCA_CQ_ENTRY_SIZE,
 				0, IBV_ACCESS_LOCAL_WRITE);
@@ -247,6 +251,7 @@ err_unreg:
 	mthca_dereg_mr(cq->mr);
 
 err_buf:
+	madvise(cq->buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DOFORK);
 	free(cq->buf);
 
 err:
@@ -278,6 +283,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 		goto out;
 	}
 
+	madvise(buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DONTFORK);
 	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
 			    cqe * MTHCA_CQ_ENTRY_SIZE,
 			    0, IBV_ACCESS_LOCAL_WRITE);
@@ -302,12 +308,14 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
 
 	mthca_dereg_mr(cq->mr);
+	madvise(cq->mr->addr, cq->mr->length, MADV_DOFORK);
 	free(cq->buf);
 
 	cq->buf = buf;
 	cq->mr  = mr;
 
 out:
+	madvise(buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DOFORK);
 	pthread_spin_unlock(&cq->lock);
 	return ret;
 }
@@ -328,6 +336,7 @@ int mthca_destroy_cq(struct ibv_cq *cq)
 	}
 
 	mthca_dereg_mr(to_mcq(cq)->mr);
+	madvise(to_mcq(cq)->mr->addr, to_mcq(cq)->mr->length, MADV_DOFORK);
 
 	free(to_mcq(cq)->buf);
 	free(to_mcq(cq));
@@ -381,6 +390,7 @@ struct ibv_srq *mthca_create_srq(struct 
 	if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
 		goto err;
 
+	madvise(srq->buf, srq->buf_size, MADV_DONTFORK);
 	srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
 	if (!srq->mr)
 		goto err_free;
@@ -421,6 +431,7 @@ err_unreg:
 	mthca_dereg_mr(srq->mr);
 
 err_free:
+	madvise(srq->buf, srq->buf_size, MADV_DOFORK);
 	free(srq->wrid);
 	free(srq->buf);
 
@@ -460,6 +471,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
 			      to_msrq(srq)->db_index);
 
 	mthca_dereg_mr(to_msrq(srq)->mr);
+	madvise(to_msrq(srq)->mr->addr, to_msrq(srq)->mr->length, MADV_DOFORK);
 
 	free(to_msrq(srq)->buf);
 	free(to_msrq(srq)->wrid);
@@ -499,6 +511,7 @@ struct ibv_qp *mthca_create_qp(struct ib
 	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
 		goto err_free;
 
+	madvise(qp->buf, qp->buf_size, MADV_DONTFORK);
 	qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
 	if (!qp->mr)
 		goto err_free;
@@ -565,6 +578,7 @@ err_unreg:
 	mthca_dereg_mr(qp->mr);
 
 err_free:
+	madvise(qp->buf, qp->buf_size, MADV_DOFORK);
 	free(qp->wrid);
 	free(qp->buf);
 
@@ -647,6 +661,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
 	}
 
 	mthca_dereg_mr(to_mqp(qp)->mr);
+	madvise(to_mqp(qp)->mr->addr, to_mqp(qp)->mr->length, MADV_DOFORK);
 
 	free(to_mqp(qp)->buf);
 	free(to_mqp(qp)->wrid);
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c	(revision 6750)
+++ libmthca/src/ah.c	(working copy)
@@ -64,8 +64,10 @@ static struct mthca_ah_page *__add_page(
 		return NULL;
 	}
 
+	madvise(page->buf, page_size, MADV_DONTFORK);
 	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
 	if (!page->mr) {
+		madvise(page->buf, page_size, MADV_DOFORK);
 		free(page->buf);
 		free(page);
 		return NULL;
@@ -183,6 +185,7 @@ void mthca_free_av(struct mthca_ah *ah)
 				page->next->prev = page->prev;
 
 			mthca_dereg_mr(page->mr);
+			madvise(page->mr->addr, page->mr->length, MADV_DOFORK);
 			free(page->buf);
 			free(page);
 		}
--
			Gleb.



More information about the general mailing list