[openib-general] [PATCH/RFC] libibverbs and libmthca fork support

Roland Dreier rdreier at cisco.com
Mon Jul 31 11:52:18 PDT 2006


Here's an initial cut (based on Gleb Natapov's work) at using
madvise(MADV_DONTFORK) to support fork() from libibverbs.  The main
changes from Gleb's earlier work are:

 - I added code to handle doorbell pages in libmthca.  As far as I can
   see this is necessary -- my tests don't work without it.  Gleb, did
   you ever test your changes on memfree HCAs?

 - I added a new API function, ibv_fork_init(), which must be called
   before everything else if an app expects to do fork().  I did this
   because I wanted a way for apps to know if fork() was expected to
   work or not, and also because the vast majority of apps don't
   fork() and probably don't want to pay the price of an extra system
   call plus RB tree operation for every memory registration.

 - And the bulk of this patch is converting memory.c over to use RB
   trees -- I just couldn't bring myself to use an O(N) algorithm at
   this stage...

Comments welcome...

 - R.


Index: libibverbs/include/infiniband/driver.h
===================================================================
--- libibverbs/include/infiniband/driver.h	(revision 8791)
+++ libibverbs/include/infiniband/driver.h	(working copy)
@@ -135,6 +135,9 @@ int ibv_cmd_destroy_ah(struct ibv_ah *ah
 int ibv_cmd_attach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 int ibv_cmd_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+int ibv_dontfork_range(void *base, size_t size);
+int ibv_dofork_range(void *base, size_t size);
+
 /*
  * sysfs helper functions
  */
Index: libibverbs/include/infiniband/verbs.h
===================================================================
--- libibverbs/include/infiniband/verbs.h	(revision 8791)
+++ libibverbs/include/infiniband/verbs.h	(working copy)
@@ -285,6 +285,8 @@ struct ibv_pd {
 struct ibv_mr {
 	struct ibv_context     *context;
 	struct ibv_pd	       *pd;
+	void		       *addr;
+	size_t			length;
 	uint32_t		handle;
 	uint32_t		lkey;
 	uint32_t		rkey;
@@ -1016,6 +1018,14 @@ int ibv_attach_mcast(struct ibv_qp *qp, 
  */
 int ibv_detach_mcast(struct ibv_qp *qp, union ibv_gid *gid, uint16_t lid);
 
+/**
+ * ibv_fork_init - Prepare data structures so that fork() may be used
+ * safely.  If this function is not called or returns a non-zero
+ * status, then libibverbs data structures are not fork()-safe and the
+ * effect of an application calling fork() is undefined.
+ */
+int ibv_fork_init(void);
+
 END_C_DECLS
 
 #  undef __attribute_const
Index: libibverbs/ChangeLog
===================================================================
--- libibverbs/ChangeLog	(revision 8791)
+++ libibverbs/ChangeLog	(working copy)
@@ -1,3 +1,29 @@
+2006-07-26  Roland Dreier  <rdreier at cisco.com>
+
+	* src/verbs.c (ibv_reg_mr, ibv_dereg_mr): Add calls to
+	ibv_dontfork_range() and ibv_dofork_range() for memory regions
+	registered by library consumers.
+
+	* include/infiniband/verbs.h: Add declaration of ibv_fork_init().
+
+	* include/infiniband/driver.h: Add declarations of
+	ibv_dontfork_range() and ibv_dofork_range().
+
+	* src/memory.c: Rewrite to use a red-black tree instead of a
+	linked list.  Change from doing mlock()/munlock() to
+	madvise(..., MADV_DONTFORK) and madvise(..., MADV_DOFORK), and
+	change the name of the entry points to ibv_dontfork_range() and
+	ibv_dofork_range().  Add ibv_fork_init() for applications to
+	request fork-safe behavior.
+
+	* src/ibverbs.h: Kill off unused declarations.
+
+	* src/init.c (ibverbs_init): Get rid of call to ibv_init_mem_map().
+
+	* include/infiniband/verbs.h: Add addr and length fields to struct
+	ibv_mr so that memory regions can be madvised().  This changes the
+	ABI, since the layout of struct ibv_mr is changed.
+
 2006-07-04  Roland Dreier  <rdreier at cisco.com>
 
 	* include/infiniband/arch.h: Fix typo in sparc mb()
Index: libibverbs/src/libibverbs.map
===================================================================
--- libibverbs/src/libibverbs.map	(revision 8791)
+++ libibverbs/src/libibverbs.map	(working copy)
@@ -74,6 +74,9 @@ IBVERBS_1.0 {
 		mult_to_ibv_rate;
 		ibv_get_sysfs_path;
 		ibv_read_sysfs_file;
+		ibv_fork_init;
+		ibv_dontfork_range;
+		ibv_dofork_range;
 
 	local: *;
 };
Index: libibverbs/src/ibverbs.h
===================================================================
--- libibverbs/src/ibverbs.h	(revision 8791)
+++ libibverbs/src/ibverbs.h	(working copy)
@@ -58,11 +58,7 @@ struct ibv_abi_compat_v2 {
 
 extern HIDDEN int abi_ver;
 
-extern HIDDEN int ibverbs_init(struct ibv_device ***list);
-
-extern HIDDEN int ibv_init_mem_map(void);
-extern HIDDEN int ibv_lock_range(void *base, size_t size);
-extern HIDDEN int ibv_unlock_range(void *base, size_t size);
+HIDDEN int ibverbs_init(struct ibv_device ***list);
 
 #define IBV_INIT_CMD(cmd, size, opcode)					\
 	do {								\
Index: libibverbs/src/verbs.c
===================================================================
--- libibverbs/src/verbs.c	(revision 8791)
+++ libibverbs/src/verbs.c	(working copy)
@@ -155,18 +155,32 @@ struct ibv_mr *ibv_reg_mr(struct ibv_pd 
 {
 	struct ibv_mr *mr;
 
+	if (ibv_dontfork_range(addr, length))
+		return NULL;
+
 	mr = pd->context->ops.reg_mr(pd, addr, length, access);
 	if (mr) {
 		mr->context = pd->context;
 		mr->pd      = pd;
-	}
+		mr->addr    = addr;
+		mr->length  = length;
+	} else
+		ibv_dofork_range(addr, length);
 
 	return mr;
 }
 
 int ibv_dereg_mr(struct ibv_mr *mr)
 {
-	return mr->context->ops.dereg_mr(mr);
+	int ret;
+	void *addr	= mr->addr;
+	size_t length	= mr->length;
+
+	ret = mr->context->ops.dereg_mr(mr);
+	if (!ret)
+		ibv_dofork_range(addr, length);
+
+	return ret;
 }
 
 static struct ibv_comp_channel *ibv_create_comp_channel_v2(struct ibv_context *context)
Index: libibverbs/src/init.c
===================================================================
--- libibverbs/src/init.c	(revision 8791)
+++ libibverbs/src/init.c	(working copy)
@@ -205,9 +205,6 @@ HIDDEN int ibverbs_init(struct ibv_devic
 
 	*list = NULL;
 
-	if (ibv_init_mem_map())
-		return 0;
-
 	find_drivers(default_path);
 
 	/*
Index: libibverbs/src/memory.c
===================================================================
--- libibverbs/src/memory.c	(revision 8791)
+++ libibverbs/src/memory.c	(working copy)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,7 @@
 #  include <config.h>
 #endif /* HAVE_CONFIG_H */
 
+#include <errno.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -44,114 +46,424 @@
 #include "ibverbs.h"
 
 /*
- * We keep a linked list of page ranges that have been locked along with a
- * reference count to manage overlapping registrations, etc.
- *
- * Eventually we should turn this into an RB-tree or something similar
- * to avoid the O(n) cost of registering/unregistering memory.
+ * Most distros' headers don't have these yet.
  */
+#ifndef MADV_DONTFORK
+#define MADV_DONTFORK	10
+#endif
+
+#ifndef MADV_DOFORK
+#define MADV_DOFORK	11
+#endif
 
 struct ibv_mem_node {
-	struct ibv_mem_node *prev, *next;
-	uintptr_t            start, end;
-	int                  refcnt;
+	enum {
+		IBV_RED,
+		IBV_BLACK
+	}			color;
+	struct ibv_mem_node    *parent;
+	struct ibv_mem_node    *left, *right;
+	uintptr_t		start, end;
+	int			refcnt;
 };
 
-static struct {
-	struct ibv_mem_node *first;
-	pthread_mutex_t      mutex;
-	uintptr_t            page_size;
-} mem_map;
+static struct ibv_mem_node *mm_root;
+static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
+static int page_size;
+static int too_late;
 
-int ibv_init_mem_map(void)
+int ibv_fork_init(void)
 {
-	struct ibv_mem_node *node = NULL;
-
-	node = malloc(sizeof *node);
-	if (!node)
-		goto fail;
-
-	node->prev   = node->next = NULL;
-	node->start  = 0;
-	node->end    = UINTPTR_MAX;
-	node->refcnt = 0;
+	void *tmp;
 
-	mem_map.first = node;
+	if (mm_root)
+		return 0;
 
-	mem_map.page_size = sysconf(_SC_PAGESIZE);
-	if (mem_map.page_size < 0)
-		goto fail;
+	if (too_late)
+		return EINVAL;
 
-	if (pthread_mutex_init(&mem_map.mutex, NULL))
-		goto fail;
+	page_size = sysconf(_SC_PAGESIZE);
+	if (page_size < 0)
+		return errno;
+
+	if (posix_memalign(&tmp, page_size, page_size))
+		return ENOMEM;
+	/* Probe a scratch page; it must be freed whether or not the probe works. */
+	if (madvise(tmp, page_size, MADV_DONTFORK) ||
+	    madvise(tmp, page_size, MADV_DOFORK)) {
+		free(tmp);
+		return ENOSYS;
+	}
+	free(tmp);
+	mm_root = malloc(sizeof *mm_root);
+	if (!mm_root)
+		return ENOMEM;
+
+	mm_root->parent = NULL;
+	mm_root->left   = NULL;
+	mm_root->right  = NULL;
+	mm_root->color  = IBV_BLACK;
+	mm_root->start  = 0;
+	mm_root->end    = UINTPTR_MAX;
+	mm_root->refcnt = 0;
 
 	return 0;
+}
 
-fail:
-	if (node)
-		free(node);
+static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
+{
+	if (node->left) {
+		node = node->left;
+		while (node->right)
+			node = node->right;
+	} else {
+		while (node->parent && node == node->parent->left)
+			node = node->parent;
 
-	return -1;
+		node = node->parent;
+	}
+
+	return node;
 }
 
-static struct ibv_mem_node *__mm_find_first(uintptr_t start, uintptr_t end)
+static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
 {
-	struct ibv_mem_node *node = mem_map.first;
+	if (node->right) {
+		node = node->right;
+		while (node->left)
+			node = node->left;
+	} else {
+		while (node->parent && node == node->parent->right)
+			node = node->parent;
 
-	while (node) {
-		if ((node->start <= start && node->end >= start) ||
-		    (node->start <= end   && node->end >= end))
-			break;
-		node = node->next;
+		node = node->parent;
 	}
 
 	return node;
 }
 
-static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
+static void __mm_rotate_right(struct ibv_mem_node *node)
 {
-	return node->prev;
+	struct ibv_mem_node *tmp;
+
+	tmp = node->left;
+
+	node->left = tmp->right;
+	if (node->left)
+		node->left->parent = node;
+
+	if (node->parent) {
+		if (node->parent->right == node)
+			node->parent->right = tmp;
+		else
+			node->parent->left = tmp;
+	} else
+		mm_root = tmp;
+
+	tmp->parent = node->parent;
+
+	tmp->right = node;
+	node->parent = tmp;
 }
 
-static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
+static void __mm_rotate_left(struct ibv_mem_node *node)
+{
+	struct ibv_mem_node *tmp;
+
+	tmp = node->right;
+
+	node->right = tmp->left;
+	if (node->right)
+		node->right->parent = node;
+
+	if (node->parent) {
+		if (node->parent->right == node)
+			node->parent->right = tmp;
+		else
+			node->parent->left = tmp;
+	} else
+		mm_root = tmp;
+
+	tmp->parent = node->parent;
+
+	tmp->left = node;
+	node->parent = tmp;
+}
+
+static int verify(struct ibv_mem_node *node)
+{
+	int hl, hr;
+
+	if (!node)
+		return 1;
+	/* 0 = red-black violation below; otherwise the black height + 1. */
+	hl = verify(node->left);
+	hr = verify(node->right);
+
+	if (!hl || !hr)
+		return 0;
+	if (hl != hr)
+		return 0;
+
+	if (node->color == IBV_RED) {
+		if (node->left && node->left->color != IBV_BLACK)
+			return 0;
+		if (node->right && node->right->color != IBV_BLACK)
+			return 0;
+		return hl;
+	}
+
+	return hl + 1;
+}
+
+static void __mm_add_rebalance(struct ibv_mem_node *node)
 {
-	return node->next;
+	struct ibv_mem_node *parent, *gp, *uncle;
+
+	while (node->parent && node->parent->color == IBV_RED) {
+		parent = node->parent;
+		gp     = node->parent->parent;
+
+		if (parent == gp->left) {
+			uncle = gp->right;
+
+			if (uncle && uncle->color == IBV_RED) {
+				parent->color = IBV_BLACK;
+				uncle->color  = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				node = gp;
+			} else {
+				if (node == parent->right) {
+					__mm_rotate_left(parent);
+					node   = parent;
+					parent = node->parent;
+				}
+
+				parent->color = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				__mm_rotate_right(gp);
+			}
+		} else {
+			uncle = gp->left;
+
+			if (uncle && uncle->color == IBV_RED) {
+				parent->color = IBV_BLACK;
+				uncle->color  = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				node = gp;
+			} else {
+				if (node == parent->left) {
+					__mm_rotate_right(parent);
+					node   = parent;
+					parent = node->parent;
+				}
+
+				parent->color = IBV_BLACK;
+				gp->color     = IBV_RED;
+
+				__mm_rotate_left(gp);
+			}
+		}
+	}
+
+	mm_root->color = IBV_BLACK;
 }
 
-static void __mm_add(struct ibv_mem_node *node,
-		     struct ibv_mem_node *new)
+static void __mm_add(struct ibv_mem_node *new)
 {
-	new->prev  = node;
-	new->next  = node->next;
-	node->next = new;
-	if (new->next)
-		new->next->prev = new;
+	struct ibv_mem_node *node, *parent = NULL;
+
+	node = mm_root;
+	while (node) {
+		parent = node;
+		if (node->start < new->start)
+			node = node->right;
+		else
+			node = node->left;
+	}
+
+	if (parent->start < new->start)
+		parent->right = new;
+	else
+		parent->left = new;
+
+	new->parent = parent;
+	new->left   = NULL;
+	new->right  = NULL;
+
+	new->color = IBV_RED;
+	__mm_add_rebalance(new);
 }
 
 static void __mm_remove(struct ibv_mem_node *node)
 {
-	/* Never have to remove the first node, so we can use prev */
-	node->prev->next = node->next;
-	if (node->next)
-		node->next->prev = node->prev;
+	struct ibv_mem_node *child, *parent, *sib, *tmp;
+	int nodecol;
+
+	if (node->left && node->right) {
+		tmp = node->left;
+		while (tmp->right)
+			tmp = tmp->right;
+
+		nodecol    = tmp->color;
+		child      = tmp->left;
+		tmp->color = node->color;
+
+		if (tmp->parent != node) {
+			parent        = tmp->parent;
+			parent->right = tmp->left;
+			if (tmp->left)
+				tmp->left->parent = parent;
+
+			tmp->left   	   = node->left;
+			node->left->parent = tmp;
+		} else
+			parent = tmp;
+
+		tmp->right          = node->right;
+		node->right->parent = tmp;
+
+		tmp->parent = node->parent;
+		if (node->parent) {
+			if (node->parent->left == node)
+				node->parent->left = tmp;
+			else
+				node->parent->right = tmp;
+		} else
+			mm_root = tmp;
+	} else {
+		nodecol = node->color;
+
+		child  = node->left ? node->left : node->right;
+		parent = node->parent;
+
+		if (child)
+			child->parent = parent;
+		if (parent) {
+			if (parent->left == node)
+				parent->left = child;
+			else
+				parent->right = child;
+		} else
+			mm_root = child;
+	}
+
+	free(node);
+
+	if (nodecol == IBV_RED)
+		return;
+
+	while ((!child || child->color == IBV_BLACK) && child != mm_root) {
+		if (parent->left == child) {
+			sib = parent->right;
+
+			if (sib->color == IBV_RED) {
+				parent->color = IBV_RED;
+				sib->color    = IBV_BLACK;
+				__mm_rotate_left(parent);
+				sib = parent->right;
+			}
+
+			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
+			    (!sib->right || sib->right->color == IBV_BLACK)) {
+				sib->color = IBV_RED;
+				child  = parent;
+				parent = child->parent;
+			} else {
+				if (!sib->right || sib->right->color == IBV_BLACK) {
+					if (sib->left)
+						sib->left->color = IBV_BLACK;
+					sib->color = IBV_RED;
+					__mm_rotate_right(sib);
+					sib = parent->right;
+				}
+
+				sib->color    = parent->color;
+				parent->color = IBV_BLACK;
+				if (sib->right)
+					sib->right->color = IBV_BLACK;
+				__mm_rotate_left(parent);
+				child = mm_root;
+				break;
+			}
+		} else {
+			sib = parent->left;
+
+			if (sib->color == IBV_RED) {
+				parent->color = IBV_RED;
+				sib->color    = IBV_BLACK;
+				__mm_rotate_right(parent);
+				sib = parent->left;
+			}
+
+			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
+			    (!sib->right || sib->right->color == IBV_BLACK)) {
+				sib->color = IBV_RED;
+				child  = parent;
+				parent = child->parent;
+			} else {
+				if (!sib->left || sib->left->color == IBV_BLACK) {
+					if (sib->right)
+						sib->right->color = IBV_BLACK;
+					sib->color = IBV_RED;
+					__mm_rotate_left(sib);
+					sib = parent->left;
+				}
+
+				sib->color    = parent->color;
+				parent->color = IBV_BLACK;
+				if (sib->left)
+					sib->left->color = IBV_BLACK;
+				__mm_rotate_right(parent);
+				child = mm_root;
+				break;
+			}
+		}
+	}
+
+	if (child)
+		child->color = IBV_BLACK;
+}
+
+static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
+{
+	struct ibv_mem_node *node = mm_root;
+
+	while (node) {
+		if (node->start <= start && node->end >= start)
+			break;
+
+		if (node->start < start)
+			node = node->right;
+		else
+			node = node->left;
+	}
+
+	return node;
 }
 
-int ibv_lock_range(void *base, size_t size)
+static int ibv_madvise_range(void *base, size_t size, int advice)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
+	int inc;
 	int ret = 0;
 
 	if (!size)
 		return 0;
 
-	start = (uintptr_t) base & ~(mem_map.page_size - 1);
-	end   = ((uintptr_t) (base + size + mem_map.page_size - 1) &
-		 ~(mem_map.page_size - 1)) - 1;
+	inc = advice == MADV_DONTFORK ? 1 : -1;
+
+	start = (uintptr_t) base & ~(page_size - 1);
+	end   = ((uintptr_t) (base + size + page_size - 1) &
+		 ~(page_size - 1)) - 1;
 
-	pthread_mutex_lock(&mem_map.mutex);
+	pthread_mutex_lock(&mm_mutex);
 
-	node = __mm_find_first(start, end);
+	node = __mm_find_start(start, end);
 
 	if (node->start < start) {
 		tmp = malloc(sizeof *tmp);
@@ -165,11 +477,19 @@ int ibv_lock_range(void *base, size_t si
 		tmp->refcnt = node->refcnt;
 		node->end   = start - 1;
 
-		__mm_add(node, tmp);
+		__mm_add(tmp);
 		node = tmp;
+	} else {
+		tmp = __mm_prev(node);
+		if (tmp && tmp->refcnt == node->refcnt + inc) {
+			tmp->end = node->end;
+			tmp->refcnt = node->refcnt;
+			__mm_remove(node);
+			node = tmp;
+		}
 	}
 
-	while (node->start <= end) {
+	while (node && node->start <= end) {
 		if (node->end > end) {
 			tmp = malloc(sizeof *tmp);
 			if (!tmp) {
@@ -182,13 +502,16 @@ int ibv_lock_range(void *base, size_t si
 			tmp->refcnt = node->refcnt;
 			node->end   = end;
 
-			__mm_add(node, tmp);
+			__mm_add(tmp);
 		}
 
+		node->refcnt += inc;
 
-		if (node->refcnt++ == 0) {
-			ret = mlock((void *) node->start,
-				    node->end - node->start + 1);
+		if ((inc == -1 && node->refcnt == 0) ||
+		    (inc ==  1 && node->refcnt == 1)) {
+			ret = madvise((void *) node->start,
+				      node->end - node->start + 1,
+				      advice);
 			if (ret)
 				goto out;
 		}
@@ -196,63 +519,36 @@ int ibv_lock_range(void *base, size_t si
 		node = __mm_next(node);
 	}
 
+	if (node) {
+		tmp = __mm_prev(node);
+		if (tmp && node->refcnt == tmp->refcnt) {
+			tmp->end = node->end;
+			__mm_remove(node);
+		}
+	}
+
 out:
-	pthread_mutex_unlock(&mem_map.mutex);
+	pthread_mutex_unlock(&mm_mutex);
 
 	return ret;
 }
 
-int ibv_unlock_range(void *base, size_t size)
+int ibv_dontfork_range(void *base, size_t size)
 {
-	uintptr_t start, end;
-	struct ibv_mem_node *node, *tmp;
-	int ret = 0;
-
-	if (!size)
+	if (mm_root)
+		return ibv_madvise_range(base, size, MADV_DONTFORK);
+	else {
+		too_late = 1;
 		return 0;
-
-	start = (uintptr_t) base & ~(mem_map.page_size - 1);
-	end   = ((uintptr_t) (base + size + mem_map.page_size - 1) &
-		 ~(mem_map.page_size - 1)) - 1;
-
-	pthread_mutex_lock(&mem_map.mutex);
-
-	node = __mm_find_first(start, end);
-
-	if (node->start != start) {
-		ret = -1;
-		goto out;
-	}
-
-	while (node && node->end <= end) {
-		if (--node->refcnt == 0) {
-			ret = munlock((void *) node->start,
-				      node->end - node->start + 1);
-		}
-
-		if (__mm_prev(node) && node->refcnt == __mm_prev(node)->refcnt) {
-			__mm_prev(node)->end = node->end;
-			tmp = __mm_prev(node);
-			__mm_remove(node);
-			node = tmp;
-		}
-
-		node = __mm_next(node);
-	}
-
-	if (node && node->refcnt == __mm_prev(node)->refcnt) {
-		__mm_prev(node)->end = node->end;
-		tmp = __mm_prev(node);
-		__mm_remove(node);
 	}
+}
 
-	if (node->end != end) {
-		ret = -1;
-		goto out;
+int ibv_dofork_range(void *base, size_t size)
+{
+	if (mm_root)
+		return ibv_madvise_range(base, size, MADV_DOFORK);
+	else {
+		too_late = 1;
+		return 0;
 	}
-
-out:
-	pthread_mutex_unlock(&mem_map.mutex);
-
-	return ret;
 }
Index: libmthca/configure.in
===================================================================
--- libmthca/configure.in	(revision 8791)
+++ libmthca/configure.in	(working copy)
@@ -26,7 +26,7 @@ AC_C_CONST
 AC_CHECK_SIZEOF(long)
 
 dnl Checks for library functions
-AC_CHECK_FUNCS(ibv_read_sysfs_file)
+AC_CHECK_FUNCS(ibv_read_sysfs_file ibv_dontfork_range ibv_dofork_range)
 
 AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script,
     if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then
Index: libmthca/src/memfree.c
===================================================================
--- libmthca/src/memfree.c	(revision 8791)
+++ libmthca/src/memfree.c	(working copy)
@@ -46,8 +46,8 @@
 #define MTHCA_FREE_MAP_SIZE (MTHCA_DB_REC_PER_PAGE / (SIZEOF_LONG * 8))
 
 struct mthca_db_page {
-	unsigned long free[MTHCA_FREE_MAP_SIZE];
-	uint64_t     *db_rec;
+	unsigned long		free[MTHCA_FREE_MAP_SIZE];
+	struct mthca_buf	db_rec;
 };
 
 struct mthca_db_table {
@@ -91,7 +91,7 @@ int mthca_alloc_db(struct mthca_db_table
 	}
 
 	for (i = start; i != end; i += dir)
-		if (db_tab->page[i].db_rec)
+		if (db_tab->page[i].db_rec.buf)
 			for (j = 0; j < MTHCA_FREE_MAP_SIZE; ++j)
 				if (db_tab->page[i].free[j])
 					goto found;
@@ -101,18 +101,14 @@ int mthca_alloc_db(struct mthca_db_table
 		goto out;
 	}
 
-	{
-		void *tmp;
-
-		if (posix_memalign(&tmp, MTHCA_DB_REC_PAGE_SIZE,
-				   MTHCA_DB_REC_PAGE_SIZE)) {
-			ret = -1;
-			goto out;
-		}
-		db_tab->page[i].db_rec = tmp;
+	if (mthca_alloc_buf(&db_tab->page[i].db_rec,
+			    MTHCA_DB_REC_PAGE_SIZE,
+			    MTHCA_DB_REC_PAGE_SIZE)) {
+		ret = -1;
+		goto out;
 	}
 
-	memset(db_tab->page[i].db_rec, 0, MTHCA_DB_REC_PAGE_SIZE);
+	memset(db_tab->page[i].db_rec.buf, 0, MTHCA_DB_REC_PAGE_SIZE);
 	memset(db_tab->page[i].free, 0xff, sizeof db_tab->page[i].free);
 
 	if (group == 0)
@@ -140,7 +136,7 @@ found:
 		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
 
 	ret = i * MTHCA_DB_REC_PER_PAGE + j;
-	*db = (uint32_t *) &db_tab->page[i].db_rec[j];
+	*db = db_tab->page[i].db_rec.buf + j * 8;
 
 out:
 	pthread_mutex_unlock(&db_tab->mutex);
@@ -163,7 +159,7 @@ void mthca_free_db(struct mthca_db_table
 	page = db_tab->page + i;
 
 	pthread_mutex_lock(&db_tab->mutex);
-	page->db_rec[j] = 0;
+	*(uint64_t *) (page->db_rec.buf + j * 8) = 0;
 
 	if (i >= db_tab->min_group2)
 		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
@@ -190,7 +186,7 @@ struct mthca_db_table *mthca_alloc_db_ta
 	db_tab->min_group2 = npages - 1;
 
 	for (i = 0; i < npages; ++i)
-		db_tab->page[i].db_rec = NULL;
+		db_tab->page[i].db_rec.buf = NULL;
 
 	return db_tab;
 }
@@ -203,8 +199,8 @@ void mthca_free_db_tab(struct mthca_db_t
 		return;
 
 	for (i = 0; i < db_tab->npages; ++i)
-		if (db_tab->page[i].db_rec)
-			free(db_tab->page[i].db_rec);
+		if (db_tab->page[i].db_rec.buf)
+			mthca_free_buf(&db_tab->page[i].db_rec);
 
 	free(db_tab);
 }
Index: libmthca/src/qp.c
===================================================================
--- libmthca/src/qp.c	(revision 8791)
+++ libmthca/src/qp.c	(working copy)
@@ -58,12 +58,12 @@ static const uint8_t mthca_opcode[] = {
 
 static void *get_recv_wqe(struct mthca_qp *qp, int n)
 {
-	return qp->buf + (n << qp->rq.wqe_shift);
+	return qp->buf.buf + (n << qp->rq.wqe_shift);
 }
 
 static void *get_send_wqe(struct mthca_qp *qp, int n)
 {
-	return qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
+	return qp->buf.buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
 }
 
 void mthca_init_qp_indices(struct mthca_qp *qp)
@@ -821,13 +821,14 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd
 
 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-	if (posix_memalign(&qp->buf, to_mdev(pd->context->device)->page_size,
-			   align(qp->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_alloc_buf(&qp->buf, 
+			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+			    to_mdev(pd->context->device)->page_size)) {
 		free(qp->wrid);
 		return -1;
 	}
 
-	memset(qp->buf, 0, qp->buf_size);
+	memset(qp->buf.buf, 0, qp->buf_size);
 
 	if (mthca_is_memfree(pd->context)) {
 		struct mthca_next_seg *next;
Index: libmthca/src/verbs.c
===================================================================
--- libmthca/src/verbs.c	(revision 8791)
+++ libmthca/src/verbs.c	(working copy)
@@ -188,11 +188,10 @@ struct ibv_cq *mthca_create_cq(struct ib
 		goto err;
 
 	cqe = align_cq_size(cqe);
-	cq->buf = mthca_alloc_cq_buf(to_mdev(context->device), cqe);
-	if (!cq->buf)
+	if (mthca_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe))
 		goto err;
 
-	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf,
+	cq->mr = __mthca_reg_mr(to_mctx(context)->pd, cq->buf.buf,
 				cqe * MTHCA_CQ_ENTRY_SIZE,
 				0, IBV_ACCESS_LOCAL_WRITE);
 	if (!cq->mr)
@@ -251,7 +250,7 @@ err_unreg:
 	mthca_dereg_mr(cq->mr);
 
 err_buf:
-	free(cq->buf);
+	mthca_free_buf(&cq->buf);
 
 err:
 	free(cq);
@@ -264,7 +263,7 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	struct mthca_cq *cq = to_mcq(ibcq);
 	struct mthca_resize_cq cmd;
 	struct ibv_mr *mr;
-	void *buf;
+	struct mthca_buf buf;
 	int old_cqe;
 	int ret;
 
@@ -280,17 +279,15 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 		goto out;
 	}
 
-	buf = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), cqe);
-	if (!buf) {
-		ret = ENOMEM;
+	ret = mthca_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe);
+	if (ret)
 		goto out;
-	}
 
-	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf,
+	mr = __mthca_reg_mr(to_mctx(ibcq->context)->pd, buf.buf,
 			    cqe * MTHCA_CQ_ENTRY_SIZE,
 			    0, IBV_ACCESS_LOCAL_WRITE);
 	if (!mr) {
-		free(buf);
+		mthca_free_buf(&buf);
 		ret = ENOMEM;
 		goto out;
 	}
@@ -303,14 +300,14 @@ int mthca_resize_cq(struct ibv_cq *ibcq,
 	ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd);
 	if (ret) {
 		mthca_dereg_mr(mr);
-		free(buf);
+		mthca_free_buf(&buf);
 		goto out;
 	}
 
-	mthca_cq_resize_copy_cqes(cq, buf, old_cqe);
+	mthca_cq_resize_copy_cqes(cq, buf.buf, old_cqe);
 
 	mthca_dereg_mr(cq->mr);
-	free(cq->buf);
+	mthca_free_buf(&cq->buf);
 
 	cq->buf = buf;
 	cq->mr  = mr;
@@ -336,8 +333,7 @@ int mthca_destroy_cq(struct ibv_cq *cq)
 	}
 
 	mthca_dereg_mr(to_mcq(cq)->mr);
-
-	free(to_mcq(cq)->buf);
+	mthca_free_buf(&to_mcq(cq)->buf);
 	free(to_mcq(cq));
 
 	return 0;
@@ -389,7 +385,7 @@ struct ibv_srq *mthca_create_srq(struct 
 	if (mthca_alloc_srq_buf(pd, &attr->attr, srq))
 		goto err;
 
-	srq->mr = __mthca_reg_mr(pd, srq->buf, srq->buf_size, 0, 0);
+	srq->mr = __mthca_reg_mr(pd, srq->buf.buf, srq->buf_size, 0, 0);
 	if (!srq->mr)
 		goto err_free;
 
@@ -430,7 +426,7 @@ err_unreg:
 
 err_free:
 	free(srq->wrid);
-	free(srq->buf);
+	mthca_free_buf(&srq->buf);
 
 err:
 	free(srq);
@@ -469,7 +465,7 @@ int mthca_destroy_srq(struct ibv_srq *sr
 
 	mthca_dereg_mr(to_msrq(srq)->mr);
 
-	free(to_msrq(srq)->buf);
+	mthca_free_buf(&to_msrq(srq)->buf);
 	free(to_msrq(srq)->wrid);
 	free(to_msrq(srq));
 
@@ -507,7 +503,7 @@ struct ibv_qp *mthca_create_qp(struct ib
 	    pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
 		goto err_free;
 
-	qp->mr = __mthca_reg_mr(pd, qp->buf, qp->buf_size, 0, 0);
+	qp->mr = __mthca_reg_mr(pd, qp->buf.buf, qp->buf_size, 0, 0);
 	if (!qp->mr)
 		goto err_free;
 
@@ -574,7 +570,7 @@ err_unreg:
 
 err_free:
 	free(qp->wrid);
-	free(qp->buf);
+	mthca_free_buf(&qp->buf);
 
 err:
 	free(qp);
@@ -655,8 +651,7 @@ int mthca_destroy_qp(struct ibv_qp *qp)
 	}
 
 	mthca_dereg_mr(to_mqp(qp)->mr);
-
-	free(to_mqp(qp)->buf);
+	mthca_free_buf(&to_mqp(qp)->buf);
 	free(to_mqp(qp)->wrid);
 	free(to_mqp(qp));
 
Index: libmthca/src/mthca.h
===================================================================
--- libmthca/src/mthca.h	(revision 8791)
+++ libmthca/src/mthca.h	(working copy)
@@ -112,6 +112,11 @@ struct mthca_context {
 	int		       qp_table_mask;
 };
 
+struct mthca_buf {
+	void		       *buf;
+	size_t			length;
+};
+
 struct mthca_pd {
 	struct ibv_pd         ibv_pd;
 	struct mthca_ah_page *ah_list;
@@ -121,7 +126,7 @@ struct mthca_pd {
 
 struct mthca_cq {
 	struct ibv_cq  	   ibv_cq;
-	void           	  *buf;
+	struct mthca_buf   buf;
 	pthread_spinlock_t lock;
 	struct ibv_mr  	  *mr;
 	uint32_t       	   cqn;
@@ -137,7 +142,7 @@ struct mthca_cq {
 
 struct mthca_srq {
 	struct ibv_srq     ibv_srq;
-	void              *buf;
+	struct mthca_buf   buf;
 	void           	  *last;
 	pthread_spinlock_t lock;
 	struct ibv_mr 	  *mr;
@@ -174,7 +179,7 @@ struct mthca_wq {
 
 struct mthca_qp {
 	struct ibv_qp    ibv_qp;
-	void            *buf;
+	struct mthca_buf buf;
 	uint64_t        *wrid;
 	int              send_wqe_offset;
 	int              max_inline_data;
@@ -259,6 +264,9 @@ static inline int mthca_is_memfree(struc
 	return to_mdev(ibctx->device)->hca_type == MTHCA_ARBEL;
 }
 
+int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+void mthca_free_buf(struct mthca_buf *buf);
+
 int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
 		   uint32_t **db);
 void mthca_set_db_qn(uint32_t *db, enum mthca_db_type type, uint32_t qn);
@@ -290,7 +298,7 @@ void mthca_arbel_cq_event(struct ibv_cq 
 void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn,
 		    struct mthca_srq *srq);
 void mthca_cq_resize_copy_cqes(struct mthca_cq *cq, void *buf, int new_cqe);
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int cqe);
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent);
 
 struct ibv_srq *mthca_create_srq(struct ibv_pd *pd,
 				 struct ibv_srq_init_attr *attr);
Index: libmthca/src/cq.c
===================================================================
--- libmthca/src/cq.c	(revision 8791)
+++ libmthca/src/cq.c	(working copy)
@@ -126,7 +126,7 @@ struct mthca_err_cqe {
 
 static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
 {
-	return cq->buf + entry * MTHCA_CQ_ENTRY_SIZE;
+	return cq->buf.buf + entry * MTHCA_CQ_ENTRY_SIZE;
 }
 
 static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
@@ -612,17 +612,16 @@ void mthca_cq_resize_copy_cqes(struct mt
 		       get_cqe(cq, i & old_cqe), MTHCA_CQ_ENTRY_SIZE);
 }
 
-void *mthca_alloc_cq_buf(struct mthca_device *dev, int nent)
+int mthca_alloc_cq_buf(struct mthca_device *dev, struct mthca_buf *buf, int nent)
 {
-	void *buf;
 	int i;
 
-	if (posix_memalign(&buf, dev->page_size,
-			   align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size)))
-		return NULL;
+	if (mthca_alloc_buf(buf, align(nent * MTHCA_CQ_ENTRY_SIZE, dev->page_size),
+		    dev->page_size))
+		return -1;
 
 	for (i = 0; i < nent; ++i)
-		((struct mthca_cqe *) buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
+		((struct mthca_cqe *) buf->buf)[i].owner = MTHCA_CQ_ENTRY_OWNER_HW;
 
-	return buf;
+	return 0;
 }
Index: libmthca/src/srq.c
===================================================================
--- libmthca/src/srq.c	(revision 8791)
+++ libmthca/src/srq.c	(working copy)
@@ -47,7 +47,7 @@
 
 static void *get_wqe(struct mthca_srq *srq, int n)
 {
-	return srq->buf + (n << srq->wqe_shift);
+	return srq->buf.buf + (n << srq->wqe_shift);
 }
 
 /*
@@ -292,13 +292,14 @@ int mthca_alloc_srq_buf(struct ibv_pd *p
 
 	srq->buf_size = srq->max << srq->wqe_shift;
 
-	if (posix_memalign(&srq->buf, to_mdev(pd->context->device)->page_size,
-			   align(srq->buf_size, to_mdev(pd->context->device)->page_size))) {
+	if (mthca_alloc_buf(&srq->buf,
+			    align(srq->buf_size, to_mdev(pd->context->device)->page_size),
+			    to_mdev(pd->context->device)->page_size)) {
 		free(srq->wrid);
 		return -1;
 	}
 
-	memset(srq->buf, 0, srq->buf_size);
+	memset(srq->buf.buf, 0, srq->buf_size);
 
 	/*
 	 * Now initialize the SRQ buffer so that all of the WQEs are
Index: libmthca/src/ah.c
===================================================================
--- libmthca/src/ah.c	(revision 8791)
+++ libmthca/src/ah.c	(working copy)
@@ -45,7 +45,7 @@
 
 struct mthca_ah_page {
 	struct mthca_ah_page *prev, *next;
-	void           	     *buf;
+	struct mthca_buf      buf;
 	struct ibv_mr 	     *mr;
 	int           	      use_cnt;
 	unsigned      	      free[0];
@@ -60,14 +60,14 @@ static struct mthca_ah_page *__add_page(
 	if (!page)
 		return NULL;
 
-	if (posix_memalign(&page->buf, page_size, page_size)) {
+	if (mthca_alloc_buf(&page->buf, page_size, page_size)) {
 		free(page);
 		return NULL;
 	}
 
-	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf, page_size, 0);
+	page->mr = mthca_reg_mr(&pd->ibv_pd, page->buf.buf, page_size, 0);
 	if (!page->mr) {
-		free(page->buf);
+		mthca_free_buf(&page->buf);
 		free(page);
 		return NULL;
 	}
@@ -123,7 +123,7 @@ int mthca_alloc_av(struct mthca_pd *pd, 
 			if (page->free[i]) {
 				j = ffs(page->free[i]);
 				page->free[i] &= ~(1 << (j - 1));
-				ah->av = page->buf +
+				ah->av = page->buf.buf +
 					(i * 8 * sizeof (int) + (j - 1)) * sizeof *ah->av;
 				break;
 			}
@@ -172,7 +172,7 @@ void mthca_free_av(struct mthca_ah *ah)
 		pthread_mutex_lock(&pd->ah_mutex);
 
 		page = ah->page;
-		i = ((void *) ah->av - page->buf) / sizeof *ah->av;
+		i = ((void *) ah->av - page->buf.buf) / sizeof *ah->av;
 		page->free[i / (8 * sizeof (int))] |= 1 << (i % (8 * sizeof (int)));
 
 		if (!--page->use_cnt) {
@@ -184,7 +184,7 @@ void mthca_free_av(struct mthca_ah *ah)
 				page->next->prev = page->prev;
 
 			mthca_dereg_mr(page->mr);
-			free(page->buf);
+			mthca_free_buf(&page->buf);
 			free(page);
 		}
 
Index: libmthca/ChangeLog
===================================================================
--- libmthca/ChangeLog	(revision 8791)
+++ libmthca/ChangeLog	(working copy)
@@ -1,3 +1,19 @@
+2006-07-26  Roland Dreier  <rdreier at cisco.com>
+
+	* src/mthca.h, src/ah.c, src/cq.c, src/memfree.c, src/qp.c,
+	src/srq.c, src/verbs.c: Convert internal allocations for AH pages
+	(for non-memfree HCAs), CQ buffers, doorbell pages (for memfree
+	HCAs), QP buffers and SRQ buffers to use the new buffer
+	allocator.  This makes libmthca fork()-clean when built against
+	libibverbs 1.1.
+
+	* src/buf.c (mthca_alloc_buf, mthca_free_buf): Add new functions
+	to wrap up allocating page-aligned buffers.  The new functions
+	will call ibv_dontfork_range()/ibv_dofork_range() to do proper
+	madvise()ing to handle fork(), if applicable.
+
+	* configure.in: Check for ibv_dontfork_range() and ibv_dofork_range().
+
 2006-07-04  Dotan Barak  <dotanb at mellanox.co.il>
 
 	* src/verbs.c (mthca_create_cq, mthca_resize_cq): Passing huge
Index: libmthca/Makefile.am
===================================================================
--- libmthca/Makefile.am	(revision 8791)
+++ libmthca/Makefile.am	(working copy)
@@ -12,10 +12,9 @@ else
     mthca_version_script =
 endif
 
-src_mthca_la_SOURCES = src/ah.c src/cq.c src/memfree.c src/mthca.c src/qp.c \
-    src/srq.c src/verbs.c
-src_mthca_la_LDFLAGS = -avoid-version -module \
-    $(mthca_version_script)
+src_mthca_la_SOURCES = src/ah.c src/buf.c src/cq.c src/memfree.c src/mthca.c \
+    src/qp.c src/srq.c src/verbs.c
+src_mthca_la_LDFLAGS = -avoid-version -module $(mthca_version_script)
 
 DEBIAN = debian/changelog debian/compat debian/control debian/copyright \
     debian/libmthca1.install debian/libmthca-dev.install debian/rules




More information about the general mailing list