[openib-general] [resend][RFC][PATCH] adding call to madvise

Gleb Natapov glebn at voltaire.com
Thu May 11 06:42:17 PDT 2006


Hello

I sent this mail a week ago and got no response. I hope this is not
because of a lack of interest in user space support :)
Anyway, I am reposting it one more time to get feedback and to find a way
to include this patch in openib ASAP, rather than waiting until the madvise
defines propagate to libc.

----- Forwarded message from Gleb Natapov <glebn at voltaire.com> -----

Hello Roland,

The included patch adds calls to madvise(MADV_DO[NT]FORK) to libibverbs and
libmthca. In libibverbs it uses memory.c to do reference counting on
overlapping user registrations, and in libmthca it marks all internal qp/cq
memory.

The MADV_DOFORK/MADV_DONTFORK defines have not yet propagated to libc, so I
added them to local header files just to be able to compile. I think the
proper way to handle this is in configure. Suggestions are welcome.

Note that this patch also changes the ABI, since struct ibv_mr is bigger now.

Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h	(revision 7112)
+++ libibverbs/include/infiniband/verbs.h	(working copy)
@@ -289,6 +289,8 @@
 	uint32_t		handle;
 	uint32_t		lkey;
 	uint32_t		rkey;
+	void                   *addr;
+	size_t                 length;
 };
 
 struct ibv_global_route {
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c	(revision 7112)
+++ libibverbs/src/verbs.c	(working copy)
@@ -154,10 +154,15 @@
 {
 	struct ibv_mr *mr;
 
+	ibv_dontfork_range(addr, length);
 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
 	if (mr) {
 		mr->context = pd->context;
 		mr->pd      = pd;
+		mr->addr    = addr;
+		mr->length  = length;
+	} else {
+		ibv_dofork_range(addr, length);
 	}
 
 	return mr;
@@ -165,7 +170,20 @@
 
 int ibv_dereg_mr(struct ibv_mr *mr)
 {
-	return mr->context->ops.dereg_mr(mr);
+	/*
+	 * Copy the range out of the MR before deregistering: the provider's
+	 * dereg_mr may release the ibv_mr on success, so mr must not be
+	 * dereferenced after the call returns.
+	 */
+	void   *addr   = mr->addr;
+	size_t  length = mr->length;
+	int     rc     = mr->context->ops.dereg_mr(mr);
+
+	/* Drop the don't-fork reference only if deregistration succeeded. */
+	if (!rc)
+		ibv_dofork_range(addr, length);
+
+	return rc;
 }
 
 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h	(revision 7112)
+++ libibverbs/src/ibverbs.h	(working copy)
@@ -61,8 +61,8 @@
 extern HIDDEN int ibverbs_init(struct ibv_device ***list);
 
 extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+extern HIDDEN int ibv_dontfork_range(void *base, size_t size);
+extern HIDDEN int ibv_dofork_range(void *base, size_t size);
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\
 	do {								\
@@ -85,4 +85,11 @@
 		(cmd)->response  = (uintptr_t) (out);			\
 	} while (0)
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
+
 #endif /* IB_VERBS_H */
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c	(revision 7112)
+++ libibverbs/src/memory.c	(working copy)
@@ -136,7 +136,7 @@
 		node->next->prev = node->prev;
 }
 
-int ibv_lock_range(void *base, size_t size)
+int ibv_dontfork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -187,8 +187,8 @@
 
 
 		if (node->refcnt++ == 0) {
-			ret = mlock((void *) node->start,
-				    node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				    node->end - node->start + 1, MADV_DONTFORK);
 			if (ret)
 				goto out;
 		}
@@ -202,7 +202,7 @@
 	return ret;
 }
 
-int ibv_unlock_range(void *base, size_t size)
+int ibv_dofork_range(void *base, size_t size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -226,8 +226,8 @@
 
 	while (node && node->end <= end) {
 		if (--node->refcnt == 0) {
-			ret = munlock((void *) node->start,
-				      node->end - node->start + 1);
+			ret = madvise((void *) node->start,
+				      node->end - node->start + 1, MADV_DOFORK);
 		}
 
 		if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h	(revision 7112)
+++ libmthca/src/mthca.h	(working copy)
@@ -341,4 +341,10 @@
 int mthca_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int mthca_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK 10
+#endif
+#ifndef MADV_DOFORK
+#define MADV_DOFORK 11
+#endif
 #endif /* MTHCA_H */
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c	(revision 7112)
+++ libmthca/src/verbs.c	(working copy)
@@ -134,6 +134,9 @@
 		return NULL;
 	}
 
+	mr->addr = addr;
+	mr->length = length;
+
 	return mr;
 }
 
@@ -188,6 +191,7 @@
 	if (!cq->buf)
 		goto err;
 
+	madvise(cq->buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DONTFORK);
 	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
 				cqe * MTHCA_CQ_ENTRY_SIZE,
 				0, IBV_ACCESS_LOCAL_WRITE);
@@ -247,6 +251,7 @@
 	mthca_dereg_mr(cq->mr);
 
 err_buf:
+	madvise(cq->buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DOFORK);
 	free(cq->buf);
 
 err:
@@ -278,6 +283,7 @@
 		goto out;
 	}
 
+	madvise(buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DONTFORK);
 	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
 			    cqe * MTHCA_CQ_ENTRY_SIZE,
 			    0, IBV_ACCESS_LOCAL_WRITE);
@@ -302,12 +308,14 @@
 	mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
 
 	mthca_dereg_mr(cq->mr);
+	madvise(cq->mr->addr, cq->mr->length, MADV_DOFORK);
 	free(cq->buf);
 
 	cq->buf = buf;
 	cq->mr  = mr;
 
 out:
+	madvise(buf, cqe * MTHCA_CQ_ENTRY_SIZE, MADV_DOFORK);
 	pthread_spin_unlock(&cq->lock);
 	return ret;
 }
@@ -328,6 +336,7 @@
 	}
 
 	mthca_dereg_mr(to_mcq(cq)->mr);
+	madvise(to_mcq(cq)->mr->addr, to_mcq(cq)->mr->length, MADV_DOFORK);
 
 	free(to_mcq(cq)->buf);
 	free(to_mcq(cq));
@@ -381,6 +390,7 @@
 	if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
 		goto err;
 
+	madvise(srq->buf, srq->buf_size, MADV_DONTFORK);
 	srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
 	if (!srq->mr)
 		goto err_free;
@@ -421,6 +431,7 @@
 	mthca_dereg_mr(srq->mr);
 
 err_free:
+	madvise(srq->buf, srq->buf_size, MADV_DOFORK);
 	free(srq->wrid);
 	free(srq->buf);
 
@@ -460,6 +471,7 @@
 			      to_msrq(srq)->db_index);
 
 	mthca_dereg_mr(to_msrq(srq)->mr);
+	madvise(to_msrq(srq)->mr->addr, to_msrq(srq)->mr->length, MADV_DOFORK);
 
 	free(to_msrq(srq)->buf);
 	free(to_msrq(srq)->wrid);
@@ -499,6 +511,7 @@
 	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
 		goto err_free;
 
+	madvise(qp->buf, qp->buf_size, MADV_DONTFORK);
 	qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
 	if (!qp->mr)
 		goto err_free;
@@ -565,6 +578,7 @@
 	mthca_dereg_mr(qp->mr);
 
 err_free:
+	madvise(qp->buf, qp->buf_size, MADV_DOFORK);
 	free(qp->wrid);
 	free(qp->buf);
 
@@ -647,6 +661,7 @@
 	}
 
 	mthca_dereg_mr(to_mqp(qp)->mr);
+	madvise(to_mqp(qp)->mr->addr, to_mqp(qp)->mr->length, MADV_DOFORK);
 
 	free(to_mqp(qp)->buf);
 	free(to_mqp(qp)->wrid);
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c	(revision 7112)
+++ libmthca/src/ah.c	(working copy)
@@ -64,8 +64,10 @@
 		return NULL;
 	}
 
+	madvise(page->buf, page_size, MADV_DONTFORK);
 	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
 	if (!page->mr) {
+		madvise(page->buf, page_size, MADV_DOFORK);
 		free(page->buf);
 		free(page);
 		return NULL;
@@ -183,6 +185,7 @@
 				page->next->prev = page->prev;
 
 			mthca_dereg_mr(page->mr);
+			madvise(page->mr->addr, page->mr->length, MADV_DOFORK);
 			free(page->buf);
 			free(page);
 		}
----- End forwarded message -----

--
			Gleb.



More information about the general mailing list